import os

import pandas as pd
import torch
from torch.nn.functional import softmax
from transformers import RobertaForSequenceClassification, RobertaTokenizerFast


class RoBERTaClassifier:
    """Thin inference wrapper around a RoBERTa sequence-classification model.

    Holds the tokenizer, the model and the target device, and exposes
    ``load_model`` to swap in fine-tuned weights and ``predict`` to score text.
    """

    def __init__(self, model_name='roberta-base', num_labels=2, max_length=256, dropout_rate=0.2):
        """Load the tokenizer and model for ``model_name`` and move the model to the device.

        Args:
            model_name: Name or path passed to ``from_pretrained`` for both the
                tokenizer and the model.
            num_labels: Number of output classes of the classification head.
            max_length: Maximum token length; longer inputs are truncated in
                ``predict``.
            dropout_rate: Probability for the dropout module installed on the
                classification head.
        """
        # Use the GPU when one is available, otherwise fall back to the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
        self.model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        # Replace the classification head's dropout with the requested rate.
        self.model.classifier.dropout = torch.nn.Dropout(dropout_rate)
        self.model.to(self.device)
        self.max_length = max_length

    def load_model(self, model_dir):
        """Replace the current model with the one saved in ``model_dir``.

        ``from_pretrained`` restores both the configuration and the weights.
        The dropout module installed in ``__init__`` is not re-applied to the
        newly loaded model.
        """
        self.model = RobertaForSequenceClassification.from_pretrained(model_dir)
        # The freshly loaded model must be moved to the device again.
        self.model.to(self.device)

    def predict(self, texts, batch_size=None):
        """Classify ``texts`` and return predicted class indices and probabilities.

        Args:
            texts: A single string or an iterable of strings.
            batch_size: Number of texts tokenized and forwarded per step.
                ``None`` (the default) sends everything through the model in
                one pass; set a value to bound peak memory on large inputs.

        Returns:
            A tuple ``(predictions, probabilities)`` of tensors on
            ``self.device``: the argmax class index per text, and the softmax
            over the logits per text.

        Raises:
            ValueError: If ``batch_size`` is given but is not positive.
        """
        if batch_size is not None and batch_size <= 0:
            raise ValueError("batch_size must be a positive integer or None")

        # A bare string must stay one sample; slicing it below would split it
        # into characters.
        if isinstance(texts, str):
            texts = [texts]
        else:
            texts = list(texts)

        self.model.eval()

        if not texts:
            # Nothing to tokenize: return empty results with the right shapes
            # instead of pushing an empty batch through the model.
            num_labels = self.model.config.num_labels
            return (
                torch.empty(0, dtype=torch.long, device=self.device),
                torch.empty((0, num_labels), device=self.device),
            )

        step = batch_size or len(texts)
        logit_chunks = []
        with torch.no_grad():
            for start in range(0, len(texts), step):
                encodings = self.tokenizer(
                    texts[start:start + step],
                    truncation=True,
                    padding=True,
                    max_length=self.max_length,
                    return_tensors='pt',
                ).to(self.device)
                logit_chunks.append(self.model(**encodings).logits)

            logits = torch.cat(logit_chunks, dim=0)
            probabilities = softmax(logits, dim=1)
            predictions = torch.argmax(logits, dim=1)
        return predictions, probabilities


# Example of usage
model_folder = 'saved_models'
model_dir = os.path.join(model_folder, 'best_model-roberta')
classifier = RoBERTaClassifier(model_name='roberta-base', num_labels=2, max_length=256, dropout_rate=0.2)
classifier.load_model(model_dir)

test_data = pd.read_csv('News_Bias_Samples.csv')
texts = test_data['text'].tolist()

# Score the file in fixed-size batches so memory use does not grow with the
# number of rows in the CSV.
inference_batch_size = 32
predictions, probabilities = classifier.predict(texts, batch_size=inference_batch_size)

threshold = 0.5
# Pull the class-1 probabilities off the device once, as plain Python floats.
positive_probs = probabilities[:, 1].cpu().tolist()
predicted_labels = ['unbiased' if prob >= threshold else 'biased' for prob in positive_probs]

results_df = pd.DataFrame({
    'Text': texts,
    'Predicted Label': predicted_labels,
    'Probability': positive_probs,
    'Ground Truth': test_data['label'],
})
# NOTE(review): this comparison assumes the 'label' column holds the strings
# 'biased' / 'unbiased'; numeric labels would never match — confirm.
results_df['Match'] = results_df['Predicted Label'] == results_df['Ground Truth']

matches = results_df['Match'].sum()
total = len(results_df)
# Avoid dividing by zero when the input file has no rows.
accuracy = matches / total if total else 0.0

results_df.to_csv('prediction_results-roberta.csv', index=False)
print(f'Total matches: {matches}/{total} ({accuracy:.2%} accuracy)')