Update app.py
app.py CHANGED
@@ -1,5 +1,7 @@
-
+""" CODE TO TRY IN COLAB
+!pip install -q transformers datasets torch gradio console_logging numpy
 
+import gradio as gr
 import torch
 from datasets import load_dataset
 from console_logging.console import Console
@@ -9,8 +11,6 @@ from transformers import TrainingArguments, Trainer
 from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
 from transformers import EvalPrediction
 import torch
-import gradio as gr
-
 console = Console()
 
 dataset = load_dataset("zeroshot/twitter-financial-news-sentiment", )
@@ -112,7 +112,8 @@ def compute_metrics(p: EvalPrediction):
         labels=p.label_ids)
     return result
 
-
+
+trainer = Trainer(
     model,
     args,
     train_dataset=encoded_dataset["train"],
@@ -120,9 +121,126 @@ def compute_metrics(p: EvalPrediction):
     tokenizer=tokenizer,
     compute_metrics=compute_metrics
 )
+
+trainer.train()
+
+trainer.evaluate()
 """
 
-#
+# Gradio / Hugging Face Spaces version: unlike the Colab version above, it uses the exported model, so it can run without the fine-tuning step
+
+import torch
+from datasets import load_dataset
+from console_logging.console import Console
+import numpy as np
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from transformers import TrainingArguments, Trainer
+from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
+from transformers import EvalPrediction
+import torch
+import gradio as gr
+
+console = Console()
+
+dataset = load_dataset("zeroshot/twitter-financial-news-sentiment", )
+
+
+model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
+tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+
+#labels = [label for label in dataset['train'].features.keys() if label not in ['text']]
+
+labels = ["Bearish", "Bullish", "Neutral"]
+
+def preprocess_data(examples):
+    # take a batch of texts
+    text = examples["text"]
+    # encode them
+    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
+    # add labels
+    #labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
+    labels_batch = {'Bearish': [], 'Bullish': [], 'Neutral': []}
+    for i in range(len(examples['label'])):
+        labels_batch["Bearish"].append(False)
+        labels_batch["Bullish"].append(False)
+        labels_batch["Neutral"].append(False)
+
+        if examples['label'][i] == 0:
+            labels_batch["Bearish"][i] = True
+
+        elif examples['label'][i] == 1:
+            labels_batch["Bullish"][i] = True
+
+        else:
+            labels_batch["Neutral"][i] = True
+
+    # create numpy array of shape (batch_size, num_labels)
+    labels_matrix = np.zeros((len(text), len(labels)))
+    # fill numpy array
+    for idx, label in enumerate(labels):
+        labels_matrix[:, idx] = labels_batch[label]
+
+    encoding["labels"] = labels_matrix.tolist()
+
+    return encoding
+
+encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)
+
+encoded_dataset.set_format("torch")
+
+id2label = {idx: label for idx, label in enumerate(labels)}
+label2id = {label: idx for idx, label in enumerate(labels)}
+
+model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
+                                                           problem_type="multi_label_classification",
+                                                           num_labels=len(labels),
+                                                           id2label=id2label,
+                                                           label2id=label2id)
+
+batch_size = 8
+metric_name = "f1"
+
+args = TrainingArguments(
+    f"bert-finetuned-sem_eval-english",
+    evaluation_strategy="epoch",
+    save_strategy="epoch",
+    learning_rate=2e-5,
+    per_device_train_batch_size=batch_size,
+    per_device_eval_batch_size=batch_size,
+    num_train_epochs=5,
+    weight_decay=0.01,
+    load_best_model_at_end=True,
+    metric_for_best_model=metric_name,
+    #push_to_hub=True,
+)
+
+# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
+def multi_label_metrics(predictions, labels, threshold=0.5):
+    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
+    sigmoid = torch.nn.Sigmoid()
+    probs = sigmoid(torch.Tensor(predictions))
+    # next, use threshold to turn them into integer predictions
+    y_pred = np.zeros(probs.shape)
+    y_pred[np.where(probs >= threshold)] = 1
+    # finally, compute metrics
+    y_true = labels
+    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
+    roc_auc = roc_auc_score(y_true, y_pred, average='micro')
+    accuracy = accuracy_score(y_true, y_pred)
+    # return as dictionary
+    metrics = {'f1': f1_micro_average,
+               'roc_auc': roc_auc,
+               'accuracy': accuracy}
+    return metrics
+
+def compute_metrics(p: EvalPrediction):
+    preds = p.predictions[0] if isinstance(p.predictions,
+                                           tuple) else p.predictions
+    result = multi_label_metrics(
+        predictions=preds,
+        labels=p.label_ids)
+    return result
+
 
 text_ = "Bitcoin to the moon"
 model = torch.load("./model.pt", map_location=torch.device('cpu'))
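Note on the `preprocess_data` lines added above: the function builds three per-class boolean lists and then copies them column by column into a `(batch_size, num_labels)` matrix. Below is a minimal sketch of an equivalent, more direct encoding for the same integer labels (0=Bearish, 1=Bullish, 2=Neutral); the `encode_labels` helper is illustrative only and is not part of this commit.

import numpy as np

labels = ["Bearish", "Bullish", "Neutral"]

def encode_labels(label_ids):
    # np.eye(3) has one row per class, so indexing it with the integer label ids
    # yields the same (batch_size, num_labels) one-hot matrix that the committed
    # code assembles from per-class boolean lists.
    return np.eye(len(labels))[label_ids].tolist()

print(encode_labels([0, 1, 2, 1]))
# [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0], [0.0, 1.0, 0.0]]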
@@ -168,7 +286,7 @@ with demo:
     """)
     inp = [gr.Textbox(label='Text or tweet text', placeholder="Insert text")]
     out = gr.Textbox(label='Output')
-    text_button = gr.Button("
+    text_button = gr.Button("Get the text sentiment")
     text_button.click(predict, inputs=inp, outputs=out)
 
 
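The hunk above wires `text_button.click(predict, inputs=inp, outputs=out)`, but the `predict` function itself lies outside the changed lines. A rough sketch of what it plausibly looks like is below, assuming the exported `model.pt`, the `bert-base-uncased` tokenizer, and the `labels` list defined earlier in app.py; the body is an assumption, not code from this commit.

import torch
from transformers import AutoTokenizer

# objects assumed to exist in app.py: the exported classifier and its tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = torch.load("./model.pt", map_location=torch.device('cpu'))
labels = ["Bearish", "Bullish", "Neutral"]

def predict(text):
    # tokenize the input the same way the training code does
    encoding = tokenizer(text, return_tensors="pt", truncation=True,
                         padding="max_length", max_length=128)
    with torch.no_grad():
        logits = model(**encoding).logits
    # multi-label head: sigmoid scores, then report the top class name
    probs = torch.sigmoid(logits)[0]
    return labels[int(torch.argmax(probs))]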
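For completeness, the `with demo:` context in the final hunk belongs to a Gradio Blocks app. A minimal skeleton of how such an app is typically assembled and launched is sketched below; only the textbox, button, and click wiring come from the diff, while the surrounding scaffold and the stub `predict` are assumptions.

import gradio as gr

def predict(text):
    # stand-in for the real predict() in app.py (see the sketch above)
    return "Neutral"

demo = gr.Blocks()

with demo:
    inp = [gr.Textbox(label='Text or tweet text', placeholder="Insert text")]
    out = gr.Textbox(label='Output')
    text_button = gr.Button("Get the text sentiment")
    text_button.click(predict, inputs=inp, outputs=out)

demo.launch()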