File size: 2,480 Bytes

25d9bf7

import pandas as pd
import torch
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
from torch.nn.functional import softmax
import os

class RoBERTaClassifier:
    """Inference wrapper around a RoBERTa sequence-classification model.

    The instance owns a tokenizer, a model and the device the model lives on
    (CUDA when available, otherwise CPU). ``predict`` turns raw strings into
    predicted class indices and per-class probabilities.
    """

    def __init__(self, model_name='roberta-base', num_labels=2, max_length=256, dropout_rate=0.2):
        """Load the tokenizer and model identified by ``model_name``.

        Args:
            model_name: Name or path passed to ``from_pretrained`` for both
                the tokenizer and the model.
            num_labels: Number of output classes of the classification head.
            max_length: Maximum token length used when truncating inputs.
            dropout_rate: Probability of the dropout layer installed on the
                classification head.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
        self.model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        # Replace the classification head's dropout layer with the requested rate.
        self.model.classifier.dropout = torch.nn.Dropout(dropout_rate)
        self.model.to(self.device)
        self.max_length = max_length

    def load_model(self, model_dir):
        """Replace the current model with the checkpoint stored in ``model_dir``.

        ``from_pretrained`` restores both the configuration and the weights.
        The tokenizer is left untouched, and the dropout override applied in
        ``__init__`` is not re-applied to the newly loaded model.
        """
        self.model = RobertaForSequenceClassification.from_pretrained(model_dir)
        # The freshly loaded model starts on the CPU; move it to our device.
        self.model.to(self.device)

    def predict(self, texts, batch_size=32):
        """Classify ``texts`` and return ``(predictions, probabilities)``.

        Args:
            texts: A single string or an iterable of strings.
            batch_size: Number of texts tokenized and forwarded at once, so
                memory use does not grow with the size of the input. Pass
                ``None`` to process everything in one batch.

        Returns:
            A tuple ``(predictions, probabilities)`` of tensors on
            ``self.device``: the argmax class index per text, and the softmax
            over the logits per text. For empty input the tensors have zero
            rows.

        Raises:
            ValueError: If ``batch_size`` is given and smaller than 1.
        """
        if batch_size is not None and batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer or None, got {batch_size!r}")

        # A bare string must become a one-element batch; slicing it below
        # would otherwise cut it into substrings.
        texts = [texts] if isinstance(texts, str) else list(texts)

        self.model.eval()
        if not texts:
            # Nothing to tokenize: return correctly shaped empty results
            # instead of handing the tokenizer an empty batch.
            num_labels = self.model.config.num_labels
            return (
                torch.empty(0, dtype=torch.long, device=self.device),
                torch.empty((0, num_labels), device=self.device),
            )

        step = len(texts) if batch_size is None else batch_size
        logit_chunks = []
        with torch.no_grad():
            for start in range(0, len(texts), step):
                encodings = self.tokenizer(
                    texts[start:start + step],
                    truncation=True,
                    padding=True,
                    max_length=self.max_length,
                    return_tensors='pt',
                ).to(self.device)
                logit_chunks.append(self.model(**encodings).logits)
            logits = torch.cat(logit_chunks, dim=0)
            probabilities = softmax(logits, dim=1)
            predictions = torch.argmax(logits, dim=1)
        return predictions, probabilities

# Example of usage: score a CSV of samples with a fine-tuned checkpoint, write
# the per-row results to disk and print how many predictions match the labels.
model_folder = 'saved_models'
model_dir = os.path.join(model_folder, 'best_model-roberta')
classifier = RoBERTaClassifier(model_name='roberta-base', num_labels=2, max_length=256, dropout_rate=0.2)
# Swap the base weights for the fine-tuned checkpoint before predicting.
classifier.load_model(model_dir)

test_data = pd.read_csv('News_Bias_Samples.csv')
texts = test_data['text'].tolist()
predictions, probabilities = classifier.predict(texts)
threshold = 0.5

# Pull the class-1 column to the host once; reading it row by row with
# .item() on a GPU tensor would force one device sync per sample.
class1_probabilities = probabilities[:, 1].cpu().tolist()

# NOTE(review): this maps class index 1 to 'unbiased' -- confirm against the
# label encoding used when the checkpoint was trained.
predicted_labels = ['unbiased' if prob >= threshold else 'biased' for prob in class1_probabilities]

# NOTE(review): the match below assumes the 'label' column already holds the
# strings 'biased' / 'unbiased' -- verify against the CSV.
results_df = pd.DataFrame({
    'Text': texts,
    'Predicted Label': predicted_labels,
    'Probability': class1_probabilities,
    'Ground Truth': test_data['label']
})

results_df['Match'] = results_df['Predicted Label'] == results_df['Ground Truth']
matches = results_df['Match'].sum()
total = len(results_df)
# Guard against an empty CSV so the summary does not divide by zero.
accuracy = matches / total if total else 0.0

results_df.to_csv('prediction_results-roberta.csv', index=False)
print(f'Total matches: {matches}/{total} ({accuracy:.2%} accuracy)')