|
import gradio as gr |
|
from scipy.spatial.distance import cosine |
|
from transformers import AutoModel, AutoTokenizer |
|
from argparse import Namespace |
|
import torch |
|
from tsne import TSNE_Plot |
|
|
|
# Hub identifier shared by the tokenizer and the model.
_MODEL_ID = "silk-road/luotuo-bert"

tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)

# Settings forwarded to the checkpoint's own modelling code through the
# `model_args` keyword (the checkpoint is loaded with trust_remote_code=True).
model_args = Namespace(
    do_mlm=None,
    pooler_type="cls",
    temp=0.05,
    mlp_only_train=False,
    init_embeddings_model=None,
)

model = AutoModel.from_pretrained(
    _MODEL_ID,
    trust_remote_code=True,
    model_args=model_args,
)
|
|
|
def divide_str(s, sep=('\n', '.', '。')):
    """Split ``s`` in two at the separator closest to (and before) its midpoint.

    Only the first half of the string is searched. For each candidate
    separator, the last occurrence that lies entirely before the midpoint is
    considered, and the occurrence nearest to the midpoint wins. The
    separator stays attached to the end of the left part.

    Args:
        s: Text to split.
        sep: Iterable of candidate separator strings. The default is an
            immutable tuple so it cannot be altered between calls.

    Returns:
        A ``(left, right)`` tuple such that ``left + right == s``. When no
        separator qualifies, the result is ``(s, '')``.
    """
    mid_len = len(s) // 2
    # Every hit lies at or before the midpoint, so "closest to the midpoint"
    # is simply "largest index". Starting at 0 also rejects a hit at index 0
    # (and the -1 "not found" result).
    best_sep_pos = 0
    best_sep = None
    for curr_sep in sep:
        sep_pos = s.rfind(curr_sep, 0, mid_len)
        if sep_pos > best_sep_pos:
            best_sep_pos = sep_pos
            best_sep = curr_sep

    # Nothing found (or only an empty-string separator matched): no split.
    if not best_sep:
        return s, ''

    # Cut after the whole separator, so multi-character separators are not
    # torn in half.
    cut = best_sep_pos + len(best_sep)
    return s[:cut], s[cut:]
|
|
|
def strong_divide(s):
    """Split ``s`` into a left and a right part, always returning two pieces.

    Three strategies are tried in order, and the first that yields a
    non-empty right part is used:

    1. ``divide_str`` with its default separators.
    2. ``divide_str`` with an extended set of punctuation separators.
    3. A plain cut at the midpoint of the string.

    Args:
        s: Text to split.

    Returns:
        A ``(left, right)`` tuple such that ``left + right == s``.
    """
    left, right = divide_str(s)
    if right != '':
        return left, right

    # Each separator is its own single-character literal. The apostrophe is
    # spelled "'" on purpose: a triple-quoted form such as '''\''' does not
    # end where it appears to (the backslash escapes the first closing quote),
    # so it silently swallows the following entries into one long string.
    # NOTE(review): ',' and ';' appeared twice in the earlier list; possibly
    # full-width variants were intended - confirm against the upstream file.
    whole_sep = (
        '\n', '.', ',', '、', ';', ':', '!', '?', '(', ')',
        '”', '“', '’', '‘', '[', ']', '{', '}', '<', '>',
        '/', "'", '|', '-', '=', '+', '*', '%', '$',
        '·', '…',
    )
    left, right = divide_str(s, sep=whole_sep)
    if right != '':
        return left, right

    # Last resort: no separator anywhere in the first half, cut in the middle.
    mid_len = len(s) // 2
    return s[:mid_len], s[mid_len:]
|
|
|
def generate_image(text_input):
    """Embed both halves of every input line and return a t-SNE figure.

    Each line of ``text_input`` is one entry. If a line contains ``#``, the
    text after the first ``#`` is used as its label and only the text before
    it is embedded; otherwise the label is ``No.<line index>``. Every entry
    is split in two with ``strong_divide``, both halves are embedded with the
    module-level ``tokenizer`` / ``model``, and all embeddings (left halves
    first, then right halves) are handed to ``TSNE_Plot``.

    Args:
        text_input: Multi-line string, one entry per line.

    Returns:
        The object returned by ``TSNE_Plot.tsne_plot(..., return_fig=True)``.
    """

    def _embed(sentences):
        # Tokenize a batch and return the model's pooled output without
        # tracking gradients.
        encoded = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            result = model(**encoded, output_hidden_states=True, return_dict=True, sent_emb=True)
        return result.pooler_output

    label = []
    text_left = []
    text_right = []
    for idx, raw_line in enumerate(text_input.split('\n')):
        # partition cuts at the first '#', like the find-based slicing.
        body, hash_mark, title = raw_line.partition('#')
        label.append(title if hash_mark else 'No.{}'.format(idx))

        first_half, second_half = strong_divide(body)
        text_left.append(first_half)
        text_right.append(second_half)

    embeddings_left = _embed(text_left)
    embeddings_right = _embed(text_right)

    merged_list = text_left + text_right
    merged_embed = torch.cat((embeddings_left, embeddings_right), dim=0)

    # Labels are repeated so both halves of an entry carry the same label.
    total = len(merged_list)
    tsne_plot = TSNE_Plot(merged_list, merged_embed, label=label * 2, n_annotation_positions=total)
    return tsne_plot.tsne_plot(n_sentence=total, return_fig=True)
|
|
|
# Gradio UI: a multi-line textbox for the input, a plot for the result, and a
# button that runs generate_image on the textbox content.
with gr.Blocks() as demo:
    # gr.Textbox is the component class for Blocks layouts; the legacy
    # gr.inputs namespace is deprecated.
    # The placeholder (Chinese) tells the user to enter one input per line
    # and to separate an optional song title with '#', followed by an example.
    name = gr.Textbox(
        lines=20,
        placeholder='在此输入歌词,每一行为一个输入,如果需要输入歌词对应的歌名,请用#隔开\n例如:听雷声 滚滚 他默默 闭紧嘴唇 停止吟唱暮色与想念 他此刻沉痛而危险 听雷声 滚滚 他渐渐 感到胸闷 乌云阻拦明月涌河湾 他起身独立向荒原#河北墨麒麟',
    )
    output = gr.Plot()
    btn = gr.Button("Generate")
    btn.click(fn=generate_image, inputs=name, outputs=output, api_name="generate-image")

demo.launch(debug=True)