|
import pandas as pd |
|
import torch |
|
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification |
|
from torch.nn.functional import softmax |
|
import os |
|
|
|
class RoBERTaClassifier:
    """Thin inference wrapper around a RoBERTa sequence-classification model.

    Holds a tokenizer, a model placed on the GPU when one is available
    (CPU otherwise), and the maximum token length used for truncation.
    """

    def __init__(self, model_name='roberta-base', num_labels=2, max_length=256, dropout_rate=0.2):
        """Load the tokenizer and model identified by ``model_name``.

        Args:
            model_name: Name or path passed to ``from_pretrained`` for both
                the tokenizer and the model.
            num_labels: Number of output classes for the classification head.
            max_length: Maximum sequence length used when tokenizing.
            dropout_rate: Probability for the dropout layer that replaces the
                classification head's own dropout.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
        self.model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        # Override the head's dropout with the requested rate.
        self.model.classifier.dropout = torch.nn.Dropout(dropout_rate)
        self.model.to(self.device)
        self.max_length = max_length

    def load_model(self, model_dir):
        """Replace the current model with the one stored in ``model_dir``.

        The tokenizer is left untouched, and the dropout override applied in
        ``__init__`` is not re-applied to the newly loaded model.

        Args:
            model_dir: Directory (or identifier) passed to ``from_pretrained``.
        """
        self.model = RobertaForSequenceClassification.from_pretrained(model_dir)
        self.model.to(self.device)

    def predict(self, texts, batch_size=32):
        """Classify ``texts`` and return predicted classes and probabilities.

        Inputs are processed in chunks of ``batch_size`` so that memory use
        does not grow with the total number of texts.

        Args:
            texts: A single string or an iterable of strings.
            batch_size: Number of texts tokenized and forwarded per step.

        Returns:
            A ``(predictions, probabilities)`` tuple of tensors on
            ``self.device``: ``predictions`` holds the argmax class index per
            text, ``probabilities`` the softmax of the logits along dim 1.
            Both have a leading dimension of 0 when ``texts`` is empty.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # Normalise the input so it can be sliced into batches.
        texts = [texts] if isinstance(texts, str) else list(texts)

        if not texts:
            # Nothing to run through the model; return correctly shaped empties.
            empty_predictions = torch.empty(0, dtype=torch.long, device=self.device)
            empty_probabilities = torch.empty(
                (0, self.model.config.num_labels), device=self.device
            )
            return empty_predictions, empty_probabilities

        self.model.eval()
        prediction_chunks = []
        probability_chunks = []
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                batch = texts[start:start + batch_size]
                encodings = self.tokenizer(
                    batch,
                    truncation=True,
                    padding=True,
                    max_length=self.max_length,
                    return_tensors='pt',
                ).to(self.device)
                logits = self.model(**encodings).logits
                probability_chunks.append(softmax(logits, dim=1))
                prediction_chunks.append(torch.argmax(logits, dim=1))

        return torch.cat(prediction_chunks), torch.cat(probability_chunks)
|
|
|
|
|
# --- Load the fine-tuned checkpoint ------------------------------------------
model_folder = 'saved_models'
model_dir = os.path.join(model_folder, 'best_model-roberta')

classifier = RoBERTaClassifier(
    model_name='roberta-base',
    num_labels=2,
    max_length=256,
    dropout_rate=0.2,
)
classifier.load_model(model_dir)

# --- Run inference on the sample file ----------------------------------------
test_data = pd.read_csv('News_Bias_Samples.csv')
texts = test_data['text'].tolist()
predictions, probabilities = classifier.predict(texts)

# A text is labelled 'unbiased' when the probability at class index 1 reaches
# the threshold, and 'biased' otherwise.
# NOTE(review): assumes class index 1 of the checkpoint means "unbiased" -
# confirm against the training label mapping.
threshold = 0.5
predicted_labels = []
for prob in probabilities.cpu().numpy():
    if prob[1] >= threshold:
        predicted_labels.append('unbiased')
    else:
        predicted_labels.append('biased')

# --- Assemble the per-text report --------------------------------------------
report_columns = {
    'Text': texts,
    'Predicted Label': predicted_labels,
    'Probability': probabilities[:, 1].tolist(),
    'Ground Truth': test_data['label'],
}
results_df = pd.DataFrame(report_columns)

# NOTE(review): the comparison below only matches if the 'label' column holds
# the strings 'biased' / 'unbiased' - verify against the CSV.
results_df['Match'] = results_df['Predicted Label'].eq(results_df['Ground Truth'])
matches = results_df['Match'].sum()
total = results_df.shape[0]
accuracy = matches / total

# --- Persist and summarise ---------------------------------------------------
results_df.to_csv('prediction_results-roberta.csv', index=False)
print(f'Total matches: {matches}/{total} ({accuracy:.2%} accuracy)')
|
|