# Page header captured from the Hugging Face file viewer (not program code):
# tomsoderlund's picture | merge_split_tokens | e746d10 | raw | history blame | 1.47 kB
import gradio
from transformers import pipeline
# Merge split tokens starting with '##'
def merge_split_tokens(tokens):
    """Join WordPiece continuation tokens onto the token that precedes them.

    A token whose ``"word"`` starts with ``'##'`` is a continuation piece: its
    text (minus the ``'##'`` marker) is appended to the previous entry's word.
    All other fields of the merged entry (e.g. the score) stay those of the
    first piece, except ``"end"``, which is moved to the end of the last piece
    when both entries carry that field.

    Args:
        tokens: Iterable of dicts, each with at least a ``"word"`` key.

    Returns:
        A new list of new dicts; the input dicts are never modified.
    """
    merged_tokens = []
    for token in tokens:
        word = token["word"]
        # A continuation piece with nothing before it cannot be merged;
        # keep it as its own entry instead of raising IndexError.
        if word.startswith('##') and merged_tokens:
            previous = merged_tokens[-1]
            previous["word"] += word[2:]
            # Keep the character span consistent with the merged word.
            if "end" in previous and token.get("end") is not None:
                previous["end"] = token["end"]
        else:
            # Copy so that later merges do not mutate the caller's data.
            merged_tokens.append(dict(token))
    return merged_tokens
# Models from https://huggingface.co/models
# https://huggingface.co/KBLab/bert-base-swedish-cased-ner
_NER_MODEL_NAME = 'KBLab/bert-base-swedish-cased-ner'

# Built on first use and then reused, so the model is not reloaded per request.
_ner_pipeline = None


def _get_ner_pipeline():
    """Return the shared NER pipeline, creating it on the first call."""
    global _ner_pipeline
    if _ner_pipeline is None:
        _ner_pipeline = pipeline('ner', model=_NER_MODEL_NAME, tokenizer=_NER_MODEL_NAME)
    return _ner_pipeline


def process_swedish_text(text):
    """Run named-entity recognition on ``text`` and return the entities.

    Args:
        text: The input string to analyse.

    Returns:
        A dict ``{'entities': [...]}`` where each entity is a dict produced by
        the NER pipeline, with split ``'##'`` tokens merged and ``'score'``
        converted to a plain ``float``. Empty or whitespace-only input yields
        an empty entity list without invoking the model.
    """
    if not text or not text.strip():
        return {'entities': []}

    # Run NER
    nlp_results = _get_ner_pipeline()(text)
    print('nlp_results:', nlp_results)
    nlp_results_merged = merge_split_tokens(nlp_results)

    # Fix TypeError("'numpy.float32' object is not iterable"):
    # convert the score to a built-in float before the result is serialized.
    # Built as a list (not a lazy map) so the debug print shows the data.
    nlp_results_adjusted = [
        dict(entity, score=float(entity['score']))
        for entity in nlp_results_merged
    ]
    print('nlp_results_adjusted:', nlp_results_adjusted)

    # Return values
    return {'entities': nlp_results_adjusted}
# Sample sentences offered as one-click examples in the UI.
_EXAMPLE_SENTENCES = [
    ["Jag heter Tom och bor i Stockholm."],
    ["Groens malmgård är en av Stockholms malmgårdar, belägen vid Malmgårdsvägen 53 på Södermalm i Stockholm."],
]

# Web UI: a text box in, the JSON result of process_swedish_text out.
gradio_interface = gradio.Interface(
    title="Swedish Entity Recognition",
    description="Recognizing Swedish tokens e.g. locations and person names.",
    article="© Tom Söderlund 2022",
    fn=process_swedish_text,
    inputs="text",
    outputs="json",
    examples=_EXAMPLE_SENTENCES,
)

# Start serving the interface.
gradio_interface.launch()