tomsoderlund committed on
Commit
e746d10
1 Parent(s): 2e6b605

merge_split_tokens

Browse files
Files changed (2) hide show
  1. README.md +7 -15
  2. app.py +16 -2
README.md CHANGED
@@ -9,28 +9,20 @@ python_version: 3.9.13
9
  app_file: app.py
10
  pinned: false
11
  license: openrail
12
- models: ["KBLab/bert-base-swedish-cased-ner"]
 
13
  ---
14
 
15
  # Swedish Entity Recognition
16
 
17
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
18
-
19
- ## Preparing a Gradio app for Huggingface Spaces
20
 
21
  Setup:
22
 
23
- # Create a “safe” virtual Python environment
24
- python3 -m venv env
25
- # Install Gradio
26
- pip3 install gradio
27
- # Install optional packages for your specific app: pip3 install torch transformers
28
- # Update the list of required packages
29
- pip3 freeze > requirements.txt
30
- # Create a blank app.py
31
- touch app.py
32
-
33
- Edit `app.py`, then run:
34
 
35
  python3 app.py
36
 
 
9
  app_file: app.py
10
  pinned: false
11
  license: openrail
12
+ models:
13
+ - KBLab/bert-base-swedish-cased-ner
14
  ---
15
 
16
  # Swedish Entity Recognition
17
 
18
+ ## Installing locally
 
 
19
 
20
  Setup:
21
 
22
+ source env/bin/activate
23
+ pip3 install -r requirements.txt
24
+
25
+ Then run:
 
 
 
 
 
 
 
26
 
27
  python3 app.py
28
 
app.py CHANGED
@@ -1,6 +1,16 @@
1
  import gradio
2
  from transformers import pipeline
3
 
 
 
 
 
 
 
 
 
 
 
4
  def process_swedish_text(text):
5
  # Models from https://huggingface.co/models
6
  # https://huggingface.co/KBLab/bert-base-swedish-cased-ner
@@ -8,8 +18,9 @@ def process_swedish_text(text):
8
  # Run NER
9
  nlp_results = nlp(text)
10
  print('nlp_results:', nlp_results)
 
11
  # Fix TypeError("'numpy.float32' object is not iterable")
12
- nlp_results_adjusted = map(lambda entity: dict(entity, **{ 'score': float(entity['score']) }), nlp_results)
13
  print('nlp_results_adjusted:', nlp_results_adjusted)
14
  # Return values
15
  return {'entities': list(nlp_results_adjusted)}
@@ -21,6 +32,9 @@ gradio_interface = gradio.Interface(
21
  examples=[
22
  ["Jag heter Tom och bor i Stockholm."],
23
  ["Groens malmgård är en av Stockholms malmgårdar, belägen vid Malmgårdsvägen 53 på Södermalm i Stockholm."]
24
- ]
 
 
 
25
  )
26
  gradio_interface.launch()
 
1
  import gradio
2
  from transformers import pipeline
3
 
4
+ # Merge split tokens starting with '##'
5
def merge_split_tokens(tokens):
    """Merge word-piece continuation tokens (prefixed with '##') into the preceding token.

    Args:
        tokens: Iterable of dicts, each with at least a "word" key holding a string.

    Returns:
        A new list of token dicts. Every token whose "word" starts with '##'
        has its text (without the '##' prefix) appended to the "word" of the
        token before it; all other keys of the merged token come from the
        first piece. The input dicts are not modified.
    """
    merged_tokens = []
    for token in tokens:
        word = token["word"]
        # Only merge when there is a previous token to attach to; a leading
        # '##' piece is kept as-is instead of raising IndexError.
        if word.startswith('##') and merged_tokens:
            merged_tokens[-1]["word"] += word[2:]
        else:
            # Shallow-copy so that later merges do not mutate the caller's dicts.
            merged_tokens.append(dict(token))
    return merged_tokens
13
+
14
  def process_swedish_text(text):
15
  # Models from https://huggingface.co/models
16
  # https://huggingface.co/KBLab/bert-base-swedish-cased-ner
 
18
  # Run NER
19
  nlp_results = nlp(text)
20
  print('nlp_results:', nlp_results)
21
+ nlp_results_merged = merge_split_tokens(nlp_results)
22
  # Fix TypeError("'numpy.float32' object is not iterable")
23
+ nlp_results_adjusted = map(lambda entity: dict(entity, **{ 'score': float(entity['score']) }), nlp_results_merged)
24
  print('nlp_results_adjusted:', nlp_results_adjusted)
25
  # Return values
26
  return {'entities': list(nlp_results_adjusted)}
 
32
  examples=[
33
  ["Jag heter Tom och bor i Stockholm."],
34
  ["Groens malmgård är en av Stockholms malmgårdar, belägen vid Malmgårdsvägen 53 på Södermalm i Stockholm."]
35
+ ],
36
+ title="Swedish Entity Recognition",
37
+ description="Recognizing Swedish tokens e.g. locations and person names.",
38
+ article="© Tom Söderlund 2022"
39
  )
40
  gradio_interface.launch()