Update app.py
app.py CHANGED
@@ -1,5 +1,7 @@
-
+""" CODE TO TRY IN COLAB
+!pip install -q transformers datasets torch gradio console_logging numpy
 
+import gradio as gr
 import torch
 from datasets import load_dataset
 from console_logging.console import Console
@@ -9,8 +11,6 @@ from transformers import TrainingArguments, Trainer
 from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
 from transformers import EvalPrediction
 import torch
-import gradio as gr
-
 console = Console()
 
 dataset = load_dataset("zeroshot/twitter-financial-news-sentiment", )
@@ -112,7 +112,8 @@ def compute_metrics(p: EvalPrediction):
         labels=p.label_ids)
     return result
 
-
+
+trainer = Trainer(
     model,
     args,
     train_dataset=encoded_dataset["train"],
@@ -120,9 +121,126 @@ def compute_metrics(p: EvalPrediction):
     tokenizer=tokenizer,
     compute_metrics=compute_metrics
 )
+
+trainer.train()
+
+trainer.evaluate()
 """
 
-#
+# Gradio / Hugging Face Spaces version: unlike the Colab version above, it uses the exported model, so it can run without the fine-tuning step
+
+import torch
+from datasets import load_dataset
+from console_logging.console import Console
+import numpy as np
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from transformers import TrainingArguments, Trainer
+from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
+from transformers import EvalPrediction
+import torch
+import gradio as gr
+
+console = Console()
+
+dataset = load_dataset("zeroshot/twitter-financial-news-sentiment", )
+
+
+model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
+tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+
+#labels = [label for label in dataset['train'].features.keys() if label not in ['text']]
+
+labels = ["Bearish", "Bullish", "Neutral"]
+
+def preprocess_data(examples):
+    # take a batch of texts
+    text = examples["text"]
+    # encode them
+    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
+    # add labels
+    #labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
+    labels_batch = {'Bearish': [], 'Bullish': [], 'Neutral': []}
+    for i in range(len(examples['label'])):
+        labels_batch["Bearish"].append(False)
+        labels_batch["Bullish"].append(False)
+        labels_batch["Neutral"].append(False)
+
+        if examples['label'][i] == 0:
+            labels_batch["Bearish"][i] = True
+
+        elif examples['label'][i] == 1:
+            labels_batch["Bullish"][i] = True
+
+        else:
+            labels_batch["Neutral"][i] = True
+
+    # create numpy array of shape (batch_size, num_labels)
+    labels_matrix = np.zeros((len(text), len(labels)))
+    # fill numpy array
+    for idx, label in enumerate(labels):
+        labels_matrix[:, idx] = labels_batch[label]
+
+    encoding["labels"] = labels_matrix.tolist()
+
+    return encoding
+
+encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)
+
+encoded_dataset.set_format("torch")
+
+id2label = {idx: label for idx, label in enumerate(labels)}
+label2id = {label: idx for idx, label in enumerate(labels)}
+
+model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
+                                                           problem_type="multi_label_classification",
+                                                           num_labels=len(labels),
+                                                           id2label=id2label,
+                                                           label2id=label2id)
+
+batch_size = 8
+metric_name = "f1"
+
+args = TrainingArguments(
+    f"bert-finetuned-sem_eval-english",
+    evaluation_strategy="epoch",
+    save_strategy="epoch",
+    learning_rate=2e-5,
+    per_device_train_batch_size=batch_size,
+    per_device_eval_batch_size=batch_size,
+    num_train_epochs=5,
+    weight_decay=0.01,
+    load_best_model_at_end=True,
+    metric_for_best_model=metric_name,
+    #push_to_hub=True,
+)
+
+# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
+def multi_label_metrics(predictions, labels, threshold=0.5):
+    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
+    sigmoid = torch.nn.Sigmoid()
+    probs = sigmoid(torch.Tensor(predictions))
+    # next, use threshold to turn them into integer predictions
+    y_pred = np.zeros(probs.shape)
+    y_pred[np.where(probs >= threshold)] = 1
+    # finally, compute metrics
+    y_true = labels
+    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
+    roc_auc = roc_auc_score(y_true, y_pred, average='micro')
+    accuracy = accuracy_score(y_true, y_pred)
+    # return as dictionary
+    metrics = {'f1': f1_micro_average,
+               'roc_auc': roc_auc,
+               'accuracy': accuracy}
+    return metrics
+
+def compute_metrics(p: EvalPrediction):
+    preds = p.predictions[0] if isinstance(p.predictions,
+                                           tuple) else p.predictions
+    result = multi_label_metrics(
+        predictions=preds,
+        labels=p.label_ids)
+    return result
+
 
 text_ = "Bitcoin to the moon"
 model = torch.load("./model.pt", map_location=torch.device('cpu'))
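Note on the `preprocess_data` lines added above: the function builds three per-class boolean lists and then copies them column by column into a `(batch_size, num_labels)` matrix. Below is a minimal sketch of an equivalent, more direct encoding for the same integer labels (0=Bearish, 1=Bullish, 2=Neutral); the `encode_labels` helper is illustrative only and is not part of this commit.

import numpy as np

labels = ["Bearish", "Bullish", "Neutral"]

def encode_labels(label_ids):
    # np.eye(3) has one row per class, so indexing it with the integer label ids
    # yields the same (batch_size, num_labels) one-hot matrix that the committed
    # code assembles from per-class boolean lists.
    return np.eye(len(labels))[label_ids].tolist()

print(encode_labels([0, 1, 2, 1]))
# [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0], [0.0, 1.0, 0.0]]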
@@ -168,7 +286,7 @@ with demo:
     """)
     inp = [gr.Textbox(label='Text or tweet text', placeholder="Insert text")]
     out = gr.Textbox(label='Output')
-    text_button = gr.Button("
+    text_button = gr.Button("Get the text sentiment")
     text_button.click(predict, inputs=inp, outputs=out)
 
 
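The hunk above wires `text_button.click(predict, inputs=inp, outputs=out)`, but the `predict` function itself lies outside the changed lines. A rough sketch of what it plausibly looks like is below, assuming the exported `model.pt`, the `bert-base-uncased` tokenizer, and the `labels` list defined earlier in app.py; the body is an assumption, not code from this commit.

import torch
from transformers import AutoTokenizer

# objects assumed to exist in app.py: the exported classifier and its tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = torch.load("./model.pt", map_location=torch.device('cpu'))
labels = ["Bearish", "Bullish", "Neutral"]

def predict(text):
    # tokenize the input the same way the training code does
    encoding = tokenizer(text, return_tensors="pt", truncation=True,
                         padding="max_length", max_length=128)
    with torch.no_grad():
        logits = model(**encoding).logits
    # multi-label head: sigmoid scores, then report the top class name
    probs = torch.sigmoid(logits)[0]
    return labels[int(torch.argmax(probs))]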
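For completeness, the `with demo:` context in the final hunk belongs to a Gradio Blocks app. A minimal skeleton of how such an app is typically assembled and launched is sketched below; only the textbox, button, and click wiring come from the diff, while the surrounding scaffold and the stub `predict` are assumptions.

import gradio as gr

def predict(text):
    # stand-in for the real predict() in app.py (see the sketch above)
    return "Neutral"

demo = gr.Blocks()

with demo:
    inp = [gr.Textbox(label='Text or tweet text', placeholder="Insert text")]
    out = gr.Textbox(label='Output')
    text_button = gr.Button("Get the text sentiment")
    text_button.click(predict, inputs=inp, outputs=out)

demo.launch()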