tomsoderlund
committed on
Commit
•
e746d10
1
Parent(s):
2e6b605
merge_split_tokens
Browse files
README.md
CHANGED
@@ -9,28 +9,20 @@ python_version: 3.9.13
|
|
9 |
app_file: app.py
|
10 |
pinned: false
|
11 |
license: openrail
|
12 |
-
models:
|
|
|
13 |
---
|
14 |
|
15 |
# Swedish Entity Recognition
|
16 |
|
17 |
-
|
18 |
-
|
19 |
-
## Preparing a Gradio app for Huggingface Spaces
|
20 |
|
21 |
Setup:
|
22 |
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
# Install optional packages for your specific app: pip3 install torch transformers
|
28 |
-
# Update the list of required packages
|
29 |
-
pip3 freeze > requirements.txt
|
30 |
-
# Create a blank app.py
|
31 |
-
touch app.py
|
32 |
-
|
33 |
-
Edit `app.py`, then run:
|
34 |
|
35 |
python3 app.py
|
36 |
|
|
|
9 |
app_file: app.py
|
10 |
pinned: false
|
11 |
license: openrail
|
12 |
+
models:
|
13 |
+
- KBLab/bert-base-swedish-cased-ner
|
14 |
---
|
15 |
|
16 |
# Swedish Entity Recognition
|
17 |
|
18 |
+
## Installing locally
|
|
|
|
|
19 |
|
20 |
Setup:
|
21 |
|
22 |
+
source env/bin/activate
|
23 |
+
pip3 install -r requirements.txt
|
24 |
+
|
25 |
+
Then run:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
python3 app.py
|
28 |
|
app.py
CHANGED
@@ -1,6 +1,16 @@
|
|
1 |
import gradio
|
2 |
from transformers import pipeline
|
3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
def process_swedish_text(text):
|
5 |
# Models from https://huggingface.co/models
|
6 |
# https://huggingface.co/KBLab/bert-base-swedish-cased-ner
|
@@ -8,8 +18,9 @@ def process_swedish_text(text):
|
|
8 |
# Run NER
|
9 |
nlp_results = nlp(text)
|
10 |
print('nlp_results:', nlp_results)
|
|
|
11 |
# Fix TypeError("'numpy.float32' object is not iterable")
|
12 |
-
nlp_results_adjusted = map(lambda entity: dict(entity, **{ 'score': float(entity['score']) }),
|
13 |
print('nlp_results_adjusted:', nlp_results_adjusted)
|
14 |
# Return values
|
15 |
return {'entities': list(nlp_results_adjusted)}
|
@@ -21,6 +32,9 @@ gradio_interface = gradio.Interface(
|
|
21 |
examples=[
|
22 |
["Jag heter Tom och bor i Stockholm."],
|
23 |
["Groens malmgård är en av Stockholms malmgårdar, belägen vid Malmgårdsvägen 53 på Södermalm i Stockholm."]
|
24 |
-
]
|
|
|
|
|
|
|
25 |
)
|
26 |
gradio_interface.launch()
|
|
|
1 |
import gradio
|
2 |
from transformers import pipeline
|
3 |
|
4 |
+
# Merge split tokens starting with '##'
|
5 |
+
def merge_split_tokens(tokens):
    """Merge sub-word tokens whose "word" starts with '##' into the previous token.

    Each token is a dict with at least a "word" key. A token whose word
    starts with '##' is treated as a continuation: its text (without the
    '##' prefix) is appended to the word of the preceding merged token.

    Args:
        tokens: Iterable of token dicts, each containing a "word" string.

    Returns:
        A new list of token dicts. The input dicts are not modified.
    """
    merged_tokens = []
    for token in tokens:
        word = token["word"]
        # A continuation needs a predecessor; a leading '##' token has none,
        # so it is kept as-is instead of raising IndexError.
        if word.startswith("##") and merged_tokens:
            previous = merged_tokens[-1]
            previous["word"] += word[2:]
            # Keep the character span in sync with the merged word so that
            # consumers relying on start/end cover the whole word.
            if "end" in token and "end" in previous:
                previous["end"] = token["end"]
        else:
            # Copy so that later merges never mutate the caller's dicts.
            merged_tokens.append(dict(token))
    return merged_tokens
|
13 |
+
|
14 |
def process_swedish_text(text):
|
15 |
# Models from https://huggingface.co/models
|
16 |
# https://huggingface.co/KBLab/bert-base-swedish-cased-ner
|
|
|
18 |
# Run NER
|
19 |
nlp_results = nlp(text)
|
20 |
print('nlp_results:', nlp_results)
|
21 |
+
nlp_results_merged = merge_split_tokens(nlp_results)
|
22 |
# Fix TypeError("'numpy.float32' object is not iterable")
|
23 |
+
nlp_results_adjusted = map(lambda entity: dict(entity, **{ 'score': float(entity['score']) }), nlp_results_merged)
|
24 |
print('nlp_results_adjusted:', nlp_results_adjusted)
|
25 |
# Return values
|
26 |
return {'entities': list(nlp_results_adjusted)}
|
|
|
32 |
examples=[
|
33 |
["Jag heter Tom och bor i Stockholm."],
|
34 |
["Groens malmgård är en av Stockholms malmgårdar, belägen vid Malmgårdsvägen 53 på Södermalm i Stockholm."]
|
35 |
+
],
|
36 |
+
title="Swedish Entity Recognition",
|
37 |
+
description="Recognizing Swedish tokens e.g. locations and person names.",
|
38 |
+
article="© Tom Söderlund 2022"
|
39 |
)
|
40 |
gradio_interface.launch()
|