Spaces:
Runtime error
Runtime error
MorenoLaQuatra
committed on
Commit
•
f12a60c
1
Parent(s):
5884700
Initial commit
Browse files- __pycache__/dual_regression_model.cpython-310.pyc +0 -0
- app.py +62 -0
- clf_model/.gitattributes +34 -0
- clf_model/config.json +71 -0
- clf_model/pytorch_model.bin +3 -0
- clf_model/special_tokens_map.json +7 -0
- clf_model/tokenizer.json +0 -0
- clf_model/tokenizer_config.json +17 -0
- clf_model/vocab.txt +0 -0
- dual_regression_model.py +94 -0
- reg_model/regression_model.pt +3 -0
- reg_model/special_tokens_map.json +7 -0
- reg_model/tokenizer.json +0 -0
- reg_model/tokenizer_config.json +14 -0
- reg_model/vocab.txt +0 -0
- requirements.txt +4 -0
__pycache__/dual_regression_model.cpython-310.pyc
ADDED
Binary file (2.84 kB). View file
|
|
app.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import torch
|
3 |
+
from dual_regression_model import DualRegressionModel
|
4 |
+
import transformers
|
5 |
+
from transformers import pipeline
|
6 |
+
from functools import partial
|
7 |
+
|
8 |
+
# Load both models once at import time so every request reuses them.

# CLF: A-pt-bs16-dbmdz-bert-base-italian-cased
# Fine-tuned sequence classifier stored in the local "clf_model/" folder.
clf_model_tag = "clf_model/"
clf_tokenizer = transformers.AutoTokenizer.from_pretrained(clf_model_tag)
clf_model = transformers.AutoModelForSequenceClassification.from_pretrained(clf_model_tag)
clf_pipeline = pipeline("text-classification", model=clf_model, tokenizer=clf_tokenizer)

# REG
# The backbone architecture and tokenizer are built from `reg_model_tag`; the
# fine-tuned weights are then loaded from the state-dict file below.
# NOTE(review): `reg_model_folder` is a file path, not a folder, despite its name.
reg_model_tag = "distilbert-base-multilingual-cased"
reg_model_folder = "reg_model/regression_model.pt"
reg_model = DualRegressionModel(model_name_or_path=reg_model_tag)
reg_model.load_model(reg_model_folder)
# This app only serves predictions: put the whole wrapper in inference mode
# explicitly instead of relying on whatever mode the sub-modules were left in.
reg_model.eval()
|
20 |
+
|
21 |
+
|
22 |
+
# define the function to be used for prediction
|
23 |
+
def predict(text):
    """
    Run both models on ``text`` and render the result as an HTML snippet.

    The classification pipeline supplies the region label; the regression
    model supplies a latitude/longitude pair that is printed and shown as a
    marker on an embedded OpenStreetMap view.

    :param text: The text entered by the user.
    :return: An HTML string with the predicted region, the predicted
        coordinates and the embedded map.
    """
    # predict the class
    # Truncate to the tokenizer's maximum length: without it, inputs longer
    # than the model's position limit make the forward pass fail.
    clf_prediction = clf_pipeline(text, truncation=True)[0]

    # predict the coordinates
    reg_input = reg_model.tokenizer(text, return_tensors="pt", truncation=True)
    # Inference only: no need to record an autograd graph for each request.
    with torch.no_grad():
        reg_prediction = reg_model(reg_input)
    latitude = reg_prediction["latitude"].item()
    longitude = reg_prediction["longitude"].item()

    # Fixed bounding box (degrees) of the embedded map of Italy.
    lat_min = 38
    lat_max = 46
    long_min = 8
    long_max = 18

    # build the HTML result
    html_output = f"<h3>The identified region is: {clf_prediction['label']}</h3>"
    # plot the predicted point on the map of Italy
    html_output += f'<h3>Predicted point on map:</h3><p>Latitude: {latitude}</p><p>Longitude: {longitude}</p>'
    html_output += f'<iframe width="425" height="350" frameborder="0" scrolling="no" marginheight="0" marginwidth="0" src="https://www.openstreetmap.org/export/embed.html?bbox={long_min}%2C{lat_min}%2C{long_max}%2C{lat_max}&layer=mapnik&marker={latitude}%2C{longitude}" style="border: 1px solid black"></iframe><br/><small><a href="https://www.openstreetmap.org/#map=13/{latitude}/{longitude}">Visualizza mappa ingrandita</a></small>'

    return html_output
|
41 |
+
|
42 |
+
# --------------------------------------------------------------------------------------------
|
43 |
+
# Gradio interface
|
44 |
+
# --------------------------------------------------------------------------------------------
|
45 |
+
|
46 |
+
# define the interface
|
47 |
+
# Input widget: a two-line free-text box.
_input_box = gr.Textbox(lines=2, placeholder="Insert the text here...")

# Output widget: renders the HTML string returned by `predict`.
_result_view = gr.HTML()

# Sample inputs offered in the UI (one single-element row per example).
_sample_texts = [
    ["Bisognerebbe saperli materializzare .... !! Ma ovviamente .. belin .... NO SE PEU SCIUSCIA' E SCIORBI'"],
    ["Guaglio' Buongiorno! Azz! Vir te si scurdat puparuol e mulignane pero '!! E che se fa😑"],
    ["Il massimo...ghe ne minga par nisun"],
    ["Che poi a me la tuta piace na cifra da vede. Subisco un po' lo stigma sociale che noi con la fregna dovemo stà sempre apposto."],
]

# Wire `predict` to the widgets above.
iface = gr.Interface(
    fn=predict,
    inputs=_input_box,
    outputs=_result_view,
    title="DANTE: Dialect ANalysis TEam",
    description="This is a demo of a classification and regression model for locating the italian dialect of a given text.",
    examples=_sample_texts,
)

# Start serving the demo.
iface.launch()
|
clf_model/.gitattributes
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
clf_model/config.json
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "best_ft_models_a/PT/16BS/dbmdz-bert-base-italian-cased/best_model/",
|
3 |
+
"architectures": [
|
4 |
+
"BertForSequenceClassification"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"classifier_dropout": null,
|
8 |
+
"hidden_act": "gelu",
|
9 |
+
"hidden_dropout_prob": 0.1,
|
10 |
+
"hidden_size": 768,
|
11 |
+
"id2label": {
|
12 |
+
"0": "Abruzzo",
|
13 |
+
"1": "Basilicata",
|
14 |
+
"2": "Calabria",
|
15 |
+
"3": "Campania",
|
16 |
+
"4": "Emilia Romagna",
|
17 |
+
"5": "Friuli-Venezia Giulia",
|
18 |
+
"6": "Lazio",
|
19 |
+
"7": "Liguria",
|
20 |
+
"8": "Lombardia",
|
21 |
+
"9": "Marche",
|
22 |
+
"10": "Molise",
|
23 |
+
"11": "Piemonte",
|
24 |
+
"12": "Puglia",
|
25 |
+
"13": "Sardegna",
|
26 |
+
"14": "Sicilia",
|
27 |
+
"15": "Toscana",
|
28 |
+
"16": "Trentino-Alto Adige",
|
29 |
+
"17": "Umbria",
|
30 |
+
"18": "Valle d'Aosta",
|
31 |
+
"19": "Veneto"
|
32 |
+
},
|
33 |
+
"initializer_range": 0.02,
|
34 |
+
"intermediate_size": 3072,
|
35 |
+
"label2id": {
|
36 |
+
"Abruzzo": 0,
|
37 |
+
"Basilicata": 1,
|
38 |
+
"Calabria": 2,
|
39 |
+
"Campania": 3,
|
40 |
+
"Emilia Romagna": 4,
|
41 |
+
"Friuli-Venezia Giulia": 5,
|
42 |
+
"Lazio": 6,
|
43 |
+
"Liguria": 7,
|
44 |
+
"Lombardia": 8,
|
45 |
+
"Marche": 9,
|
46 |
+
"Molise": 10,
|
47 |
+
"Piemonte": 11,
|
48 |
+
"Puglia": 12,
|
49 |
+
"Sardegna": 13,
|
50 |
+
"Sicilia": 14,
|
51 |
+
"Toscana": 15,
|
52 |
+
"Trentino-Alto Adige": 16,
|
53 |
+
"Umbria": 17,
|
54 |
+
"Valle d'Aosta": 18,
|
55 |
+
"Veneto": 19
|
56 |
+
},
|
57 |
+
"layer_norm_eps": 1e-12,
|
58 |
+
"max_position_embeddings": 512,
|
59 |
+
"model_type": "bert",
|
60 |
+
"num_attention_heads": 12,
|
61 |
+
"num_hidden_layers": 12,
|
62 |
+
"output_hidden_states": true,
|
63 |
+
"pad_token_id": 0,
|
64 |
+
"position_embedding_type": "absolute",
|
65 |
+
"problem_type": "single_label_classification",
|
66 |
+
"torch_dtype": "float32",
|
67 |
+
"transformers_version": "4.26.1",
|
68 |
+
"type_vocab_size": 2,
|
69 |
+
"use_cache": true,
|
70 |
+
"vocab_size": 31102
|
71 |
+
}
|
clf_model/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7b9f16887a3e26030e3d57905976b8eb36a6e5846b1047fbabde05b62adc605b
|
3 |
+
size 439842229
|
clf_model/special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": "[CLS]",
|
3 |
+
"mask_token": "[MASK]",
|
4 |
+
"pad_token": "[PAD]",
|
5 |
+
"sep_token": "[SEP]",
|
6 |
+
"unk_token": "[UNK]"
|
7 |
+
}
|
clf_model/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
clf_model/tokenizer_config.json
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": "[CLS]",
|
3 |
+
"do_basic_tokenize": true,
|
4 |
+
"do_lower_case": false,
|
5 |
+
"mask_token": "[MASK]",
|
6 |
+
"max_len": 512,
|
7 |
+
"model_max_length": 512,
|
8 |
+
"name_or_path": "best_ft_models_a/PT/16BS/dbmdz-bert-base-italian-cased/best_model/",
|
9 |
+
"never_split": null,
|
10 |
+
"pad_token": "[PAD]",
|
11 |
+
"sep_token": "[SEP]",
|
12 |
+
"special_tokens_map_file": null,
|
13 |
+
"strip_accents": null,
|
14 |
+
"tokenize_chinese_chars": true,
|
15 |
+
"tokenizer_class": "BertTokenizer",
|
16 |
+
"unk_token": "[UNK]"
|
17 |
+
}
|
clf_model/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
dual_regression_model.py
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import torch
|
3 |
+
from torch import nn
|
4 |
+
from transformers import AutoModel, AutoModelForMaskedLM, AutoTokenizer
|
5 |
+
|
6 |
+
|
7 |
+
class DualRegressionModel(nn.Module):
    """
    Transformer backbone with two linear heads that regress latitude and longitude.
    """

    def __init__(
        self,
        model_name_or_path: str = "camembert/camembert-base",
        loss_aggreatation: str = "mean",
    ):
        """
        Build the backbone, its tokenizer and the two regression heads.

        :param model_name_or_path: Name or path handed to ``from_pretrained`` for
            both the backbone and the tokenizer.
        :param loss_aggreatation: How the latitude and longitude losses are combined
            in ``forward``: "mean" or "sum". The misspelled name is part of the
            public interface and is kept for backward compatibility.
        """

        super().__init__()
        if "bart" in model_name_or_path:
            # For BART-style checkpoints only the encoder is kept.
            self.model = AutoModel.from_pretrained(
                model_name_or_path, output_hidden_states=True
            )
            self.model = self.model.encoder
        else:
            # NOTE: saved state-dict keys depend on this model class, so it must
            # not be swapped without re-exporting existing checkpoints.
            self.model = AutoModelForMaskedLM.from_pretrained(
                model_name_or_path, output_hidden_states=True
            )

        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        self.loss_aggreatation = loss_aggreatation

        # create two different regression heads for two tasks (latitude and longitude)
        self.lat_regression_head = torch.nn.Linear(self.model.config.hidden_size, 1)
        self.long_regression_head = torch.nn.Linear(self.model.config.hidden_size, 1)

        # Attribute name (sic) kept as-is: external code may reference it.
        self.crierion = torch.nn.MSELoss()

    def forward(
        self,
        batch,
    ):
        """
        Predict latitude and longitude and, when targets are given, the loss.

        :param batch: Mapping with "input_ids" and "attention_mask". If it also
            contains "latitude" / "longitude" targets, the loss is computed.
        :return: Dict with "latitude" and "longitude" predictions (one value per
            example, trailing dimension of size 1) and, when targets were given,
            a scalar "loss".
        :raises ValueError: If the configured loss aggregation is neither "mean"
            nor "sum" (only checked when a loss is computed).
        """
        # Prediction mode when the batch carries neither target key.
        predict = not batch.keys() & {"longitude", "latitude"}

        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        if not predict:
            latitudes = batch["latitude"]
            longitudes = batch["longitude"]

        # Representation of the first token in the last hidden layer.
        last_hidden_state = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).hidden_states[-1][:, 0, :]

        lat_predictions = self.lat_regression_head(last_hidden_state)
        long_predictions = self.long_regression_head(last_hidden_state)

        result = {"latitude": lat_predictions, "longitude": long_predictions}

        if not predict:
            # squeeze(-1) drops only the head's output dimension; a bare squeeze()
            # would also drop the batch dimension when the batch has one example,
            # giving a 0-d prediction against a 1-d target.
            lat_loss = self.crierion(lat_predictions.squeeze(-1), latitudes)
            long_loss = self.crierion(long_predictions.squeeze(-1), longitudes)

            if self.loss_aggreatation == "mean":
                loss = (lat_loss + long_loss) / 2
            elif self.loss_aggreatation == "sum":
                loss = lat_loss + long_loss
            else:
                raise ValueError("Only mean and sum are supported for loss aggregation")
            result["loss"] = loss

        return result

    def save_model(self, path):
        """
        Save the model's state dict to the given path. E.g. "model.pt"

        :param path: The path where the model is saved.
        """

        torch.save(self.state_dict(), path)

    def load_model(self, path):
        """
        Load a state dict previously written by ``save_model``.

        :param path: The path where the model is saved. E.g. "model.pt"
        """

        # map_location="cpu" lets a checkpoint saved from GPU tensors be loaded on
        # a machine without CUDA; load_state_dict then copies the values into the
        # parameters on whatever device this module currently lives on.
        # NOTE: torch.load unpickles the file - only load trusted checkpoints.
        state_dict = torch.load(path, map_location="cpu")
        self.load_state_dict(state_dict)
|
reg_model/regression_model.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2e477574615af95cc93b7bcb8173d7c000cf6a5c2d172d7ad4557a3b396652cd
|
3 |
+
size 541834318
|
reg_model/special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": "[CLS]",
|
3 |
+
"mask_token": "[MASK]",
|
4 |
+
"pad_token": "[PAD]",
|
5 |
+
"sep_token": "[SEP]",
|
6 |
+
"unk_token": "[UNK]"
|
7 |
+
}
|
reg_model/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
reg_model/tokenizer_config.json
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": "[CLS]",
|
3 |
+
"do_lower_case": false,
|
4 |
+
"mask_token": "[MASK]",
|
5 |
+
"model_max_length": 512,
|
6 |
+
"name_or_path": "DGMS/distilbert-base-multilingual-cased-dialect",
|
7 |
+
"pad_token": "[PAD]",
|
8 |
+
"sep_token": "[SEP]",
|
9 |
+
"special_tokens_map_file": null,
|
10 |
+
"strip_accents": null,
|
11 |
+
"tokenize_chinese_chars": true,
|
12 |
+
"tokenizer_class": "DistilBertTokenizer",
|
13 |
+
"unk_token": "[UNK]"
|
14 |
+
}
|
reg_model/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch
|
2 |
+
transformers
|
3 |
+
gradio
|
4 |
+
numpy
|