POLLCHECK
/

RoBERTa-classifier

Text Classification

Inference Endpoints

Model card · Files and versions · Community

RoBERTa-classifier / inference-roberta.py

shainar's picture

Upload 2 files

25d9bf7 verified 4 months ago

history blame contribute delete

2.48 kB

	import pandas as pd
	import torch
	from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
	from torch.nn.functional import softmax
	import os

	class RoBERTaClassifier:
	    """Inference wrapper around a RoBERTa sequence-classification model.

	    The wrapper owns a tokenizer, a model and the torch device both run on.
	    ``__init__`` builds them from ``model_name``; ``load_model`` swaps in a
	    model read from a directory; ``predict`` turns texts into class indices
	    and class probabilities.
	    """

	    def __init__(self, model_name='roberta-base', num_labels=2, max_length=256, dropout_rate=0.2):
	        """Load the tokenizer and model for ``model_name``.

	        Args:
	            model_name: Checkpoint name or path handed to ``from_pretrained``.
	            num_labels: Number of classes requested for the classification head.
	            max_length: Token limit used (with truncation) by ``predict``.
	            dropout_rate: Probability for the classification head's dropout layer.
	        """
	        # Use CUDA when torch reports it as available, otherwise fall back to CPU.
	        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	        self.tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
	        self.model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
	        # Swap the head's dropout layer so the requested rate is the one in effect.
	        self.model.classifier.dropout = torch.nn.Dropout(dropout_rate)
	        self.model.to(self.device)
	        self.max_length = max_length

	    def load_model(self, model_dir):
	        """Replace the current model with the one stored in ``model_dir``.

	        ``from_pretrained`` reads the configuration and the weights from the
	        directory. The tokenizer is left as-is, and the dropout override made
	        in ``__init__`` is not reapplied to the newly loaded model.

	        Args:
	            model_dir: Directory handed to ``from_pretrained``.
	        """
	        self.model = RobertaForSequenceClassification.from_pretrained(model_dir)
	        # Keep the model on the same device the encoded inputs are sent to.
	        self.model.to(self.device)

	    def predict(self, texts, batch_size=32):
	        """Classify ``texts`` and return ``(predictions, probabilities)``.

	        Inputs are processed in slices of ``batch_size`` so memory use does not
	        grow with the size of the dataset; each slice is padded to its own
	        longest sequence.

	        Args:
	            texts: A single string, or an iterable of texts accepted by the
	                tokenizer.
	            batch_size: Maximum number of texts per forward pass. ``None``
	                sends everything through in one pass.

	        Returns:
	            A tuple ``(predictions, probabilities)``: the argmax class index
	            per text (1-D tensor) and the softmax over the logits per text
	            (2-D tensor, one row per text).

	        Raises:
	            ValueError: If ``batch_size`` is neither ``None`` nor at least 1.
	        """
	        if batch_size is not None and batch_size < 1:
	            raise ValueError("batch_size must be a positive integer or None")

	        # A bare string is one document; slicing it would yield substrings.
	        if isinstance(texts, str):
	            texts = [texts]
	        else:
	            texts = list(texts)

	        self.model.eval()

	        if not texts:
	            # Nothing to tokenize: hand back empty results with the usual ranks.
	            num_labels = self.model.config.num_labels
	            return (
	                torch.empty(0, dtype=torch.long, device=self.device),
	                torch.empty((0, num_labels), device=self.device),
	            )

	        step = len(texts) if batch_size is None else batch_size
	        logit_chunks = []
	        with torch.no_grad():
	            for start in range(0, len(texts), step):
	                encodings = self.tokenizer(
	                    texts[start:start + step],
	                    truncation=True,
	                    padding=True,
	                    max_length=self.max_length,
	                    return_tensors='pt',
	                ).to(self.device)
	                logit_chunks.append(self.model(**encodings).logits)

	        # Only the per-text logits are kept across batches, so this stays small.
	        logits = torch.cat(logit_chunks, dim=0)
	        probabilities = softmax(logits, dim=1)
	        predictions = torch.argmax(logits, dim=1)
	        return predictions, probabilities

	# Example of usage: run the saved classifier over a labelled CSV, write the
	# per-row results to disk and print how many predicted labels match.
	model_folder = 'saved_models'
	model_dir = os.path.join(model_folder, 'best_model-roberta')

	# Build the wrapper from the base checkpoint, then swap in the saved model.
	classifier = RoBERTaClassifier(
	    model_name='roberta-base',
	    num_labels=2,
	    max_length=256,
	    dropout_rate=0.2,
	)
	classifier.load_model(model_dir)

	# The CSV supplies the inputs ('text' column) and the reference ('label' column).
	test_data = pd.read_csv('News_Bias_Samples.csv')
	texts = test_data['text'].tolist()
	predictions, probabilities = classifier.predict(texts)

	# Rows whose class-1 probability reaches the threshold are reported as 'unbiased'.
	# NOTE(review): confirm that class index 1 really is the 'unbiased' class.
	threshold = 0.5
	class_one_scores = probabilities.cpu().numpy()[:, 1]
	predicted_labels = ['unbiased' if score >= threshold else 'biased' for score in class_one_scores]

	results_df = pd.DataFrame({
	    'Text': texts,
	    'Predicted Label': predicted_labels,
	    'Probability': class_one_scores.tolist(),
	    'Ground Truth': test_data['label'],
	})

	# NOTE(review): this equality only matches when the 'label' column holds the
	# same 'biased'/'unbiased' strings - confirm against the CSV.
	results_df['Match'] = results_df['Predicted Label'].eq(results_df['Ground Truth'])
	total = len(results_df)
	matches = results_df['Match'].sum()
	accuracy = matches / total

	results_df.to_csv('prediction_results-roberta.csv', index=False)
	print(f'Total matches: {matches}/{total} ({accuracy:.2%} accuracy)')