# RoBERTa-classifier / inference-roberta.py
# Uploaded by shainar ("Upload 2 files", commit 25d9bf7, verified).
import pandas as pd
import torch
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
from torch.nn.functional import softmax
import os
class RoBERTaClassifier:
    """Inference wrapper around a RoBERTa sequence-classification model.

    The constructor loads a tokenizer and a model from ``model_name``;
    ``load_model`` can then swap in fine-tuned weights from a directory, and
    ``predict`` returns class predictions and softmax probabilities for a
    collection of texts.
    """

    def __init__(self, model_name='roberta-base', num_labels=2, max_length=256, dropout_rate=0.2):
        """Load the tokenizer and model and move the model to the best device.

        Args:
            model_name: Name or path passed to ``from_pretrained`` for both
                the tokenizer and the model.
            num_labels: Number of output classes of the classification head.
            max_length: Maximum token length used when tokenizing in
                ``predict`` (longer inputs are truncated).
            dropout_rate: Dropout probability installed on the classifier
                head. Note that ``load_model`` replaces ``self.model``
                entirely, so this override does not carry over to a model
                loaded afterwards.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
        self.model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        self.model.classifier.dropout = torch.nn.Dropout(dropout_rate)
        self.model.to(self.device)
        self.max_length = max_length

    def load_model(self, model_dir):
        """Replace the current model with the one stored in ``model_dir``.

        ``from_pretrained`` restores both the configuration and the weights.
        The tokenizer created in ``__init__`` is kept as is.
        """
        self.model = RobertaForSequenceClassification.from_pretrained(model_dir)
        self.model.to(self.device)  # Make sure to also send the model to the correct device
        # A freshly loaded model is only ever used for inference here.
        self.model.eval()

    def predict(self, texts, batch_size=32):
        """Classify ``texts`` and return predictions with their probabilities.

        Args:
            texts: A single string or an iterable of strings.
            batch_size: Number of texts per forward pass. Processing in
                mini-batches keeps peak memory bounded for large inputs.
                ``None`` runs everything in one batch.

        Returns:
            A ``(predictions, probabilities)`` tuple of tensors on
            ``self.device``: ``predictions`` holds the argmax class index per
            text, ``probabilities`` the softmax over the logits per text.
            For empty input both tensors have zero rows.

        Raises:
            ValueError: If ``batch_size`` is not ``None`` and is smaller
                than 1.
        """
        if batch_size is not None and batch_size < 1:
            raise ValueError("batch_size must be a positive integer or None")

        # A bare string would otherwise be iterated character by character.
        if isinstance(texts, str):
            texts = [texts]
        texts = list(texts)

        self.model.eval()

        if not texts:
            # Nothing to tokenize: return correctly shaped empty results
            # instead of handing an empty batch to the tokenizer.
            num_labels = self.model.config.num_labels
            empty_predictions = torch.empty(0, dtype=torch.long, device=self.device)
            empty_probabilities = torch.empty((0, num_labels), device=self.device)
            return empty_predictions, empty_probabilities

        step = len(texts) if batch_size is None else batch_size
        prediction_chunks = []
        probability_chunks = []
        with torch.no_grad():
            for start in range(0, len(texts), step):
                chunk = texts[start:start + step]
                encodings = self.tokenizer(
                    chunk,
                    truncation=True,
                    padding=True,
                    max_length=self.max_length,
                    return_tensors='pt',
                ).to(self.device)
                logits = self.model(**encodings).logits
                probability_chunks.append(softmax(logits, dim=1))
                prediction_chunks.append(torch.argmax(logits, dim=1))

        predictions = torch.cat(prediction_chunks, dim=0)
        probabilities = torch.cat(probability_chunks, dim=0)
        return predictions, probabilities
# Example of usage: load the fine-tuned model, classify every row of the
# sample CSV, and write per-row results plus an overall accuracy figure.
model_folder = 'saved_models'
model_dir = os.path.join(model_folder, 'best_model-roberta')

classifier = RoBERTaClassifier(model_name='roberta-base', num_labels=2, max_length=256, dropout_rate=0.2)
classifier.load_model(model_dir)

test_data = pd.read_csv('News_Bias_Samples.csv')
# Empty CSV cells are read as NaN (a float). Coerce them to empty strings so
# only strings are handed to the tokenizer; they are still written back to
# the output CSV as empty cells.
texts = test_data['text'].fillna('').astype(str).tolist()

predictions, probabilities = classifier.predict(texts)

threshold = 0.5
# Move the class-1 probabilities off the device once instead of calling
# .item() per row, then reuse them for both the label and the output column.
class1_probabilities = probabilities[:, 1].cpu().tolist()
# Class index 1 is reported as 'unbiased' when its probability reaches the threshold.
predicted_labels = ['unbiased' if prob >= threshold else 'biased' for prob in class1_probabilities]

results_df = pd.DataFrame({
    'Text': texts,
    'Predicted Label': predicted_labels,
    'Probability': class1_probabilities,
    'Ground Truth': test_data['label']
})

# NOTE(review): this comparison assumes the 'label' column holds the strings
# 'biased' / 'unbiased' - confirm against the CSV; numeric labels would never match.
results_df['Match'] = results_df['Predicted Label'] == results_df['Ground Truth']
matches = results_df['Match'].sum()
total = len(results_df)
# Guard against a CSV with no rows.
accuracy = matches / total if total else 0.0

results_df.to_csv('prediction_results-roberta.csv', index=False)
print(f'Total matches: {matches}/{total} ({accuracy:.2%} accuracy)')