{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LlamaForSequenceClassification(\n", " (model): LlamaModel(\n", " (embed_tokens): Embedding(49153, 576, padding_idx=49152)\n", " (layers): ModuleList(\n", " (0-29): 30 x LlamaDecoderLayer(\n", " (self_attn): LlamaSdpaAttention(\n", " (q_proj): Linear(in_features=576, out_features=576, bias=False)\n", " (k_proj): Linear(in_features=576, out_features=192, bias=False)\n", " (v_proj): Linear(in_features=576, out_features=192, bias=False)\n", " (o_proj): Linear(in_features=576, out_features=576, bias=False)\n", " (rotary_emb): LlamaRotaryEmbedding()\n", " )\n", " (mlp): LlamaMLP(\n", " (gate_proj): Linear(in_features=576, out_features=1536, bias=False)\n", " (up_proj): Linear(in_features=576, out_features=1536, bias=False)\n", " (down_proj): Linear(in_features=1536, out_features=576, bias=False)\n", " (act_fn): SiLU()\n", " )\n", " (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)\n", " (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)\n", " )\n", " )\n", " (norm): LlamaRMSNorm((576,), eps=1e-05)\n", " (rotary_emb): LlamaRotaryEmbedding()\n", " )\n", " (score): Linear(in_features=576, out_features=2, bias=False)\n", ")" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from transformers import GPT2Tokenizer, LlamaForSequenceClassification\n", "\n", "# Load the GPT2 tokenizer and Llama model for sequence classification\n", "# NOTE(review): hardcoded absolute local path -- prefer a configurable path (env var or pathlib relative to a DATA_DIR) so the notebook runs on other machines\n", "model_path = r\"C:\\Users\\jatin\\OneDrive\\Desktop\\plagiarism-detection\\smolLM-fined-tuned-for-PLAGAIRISM-Detection\\model\"\n", "# local_files_only=True: load strictly from disk, never attempt a Hub download\n", "tokenizer = GPT2Tokenizer.from_pretrained(model_path, local_files_only=True)\n", "model = LlamaForSequenceClassification.from_pretrained(model_path, local_files_only=True)\n", "\n", "# Set model to evaluation mode\n", "model.eval()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sentence1sentence2label
0A person on a horse jumps over a broken down a...A person is at a diner, ordering an omelette.0
1A person on a horse jumps over a broken down a...A person is outdoors, on a horse.1
2Children smiling and waving at cameraThere are children present1
3Children smiling and waving at cameraThe kids are frowning0
4A boy is jumping on skateboard in the middle o...The boy skates down the sidewalk.0
\n", "
" ], "text/plain": [ " sentence1 \\\n", "0 A person on a horse jumps over a broken down a... \n", "1 A person on a horse jumps over a broken down a... \n", "2 Children smiling and waving at camera \n", "3 Children smiling and waving at camera \n", "4 A boy is jumping on skateboard in the middle o... \n", "\n", " sentence2 label \n", "0 A person is at a diner, ordering an omelette. 0 \n", "1 A person is outdoors, on a horse. 1 \n", "2 There are children present 1 \n", "3 The kids are frowning 0 \n", "4 The boy skates down the sidewalk. 0 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [
"import torch\n",
"import pandas as pd\n",
"\n",
"# train_snli.txt is tab-separated with no header row; name the three columns explicitly\n",
"df = pd.read_csv(\"train_snli.txt\", delimiter='\\t', header=None, names=['sentence1', 'sentence2', 'label'])\n",
"\n",
"df.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [
"from torch.utils.data import Dataset, DataLoader\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
"class PlagiarismDataset(Dataset):\n",
"    \"\"\"Tokenizes (sentence1, sentence2) DataFrame rows for sequence classification.\n",
"\n",
"    Each item is a dict with fixed-length 'input_ids' and 'attention_mask'\n",
"    tensors of shape [max_length] plus a scalar long 'label'.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, df, tokenizer, max_length=128):\n",
"        self.df = df\n",
"        self.tokenizer = tokenizer\n",
"        self.max_length = max_length\n",
"\n",
"    def __len__(self):\n",
"        return len(self.df)\n",
"\n",
"    def __getitem__(self, index):\n",
"        row = self.df.iloc[index]\n",
"\n",
"        # Guard against NaN cells -- the tokenizer requires strings\n",
"        sentence1 = str(row['sentence1']) if not pd.isna(row['sentence1']) else \"\"\n",
"        sentence2 = str(row['sentence2']) if not pd.isna(row['sentence2']) else \"\"\n",
"\n",
"        # Encode the sentence pair; pad/truncate to a fixed max_length so items stack cleanly\n",
"        inputs = self.tokenizer(\n",
"            sentence1, sentence2,\n",
"            add_special_tokens=True,\n",
"            max_length=self.max_length,\n",
"            padding='max_length',\n",
"            truncation=True,\n",
"            return_tensors=\"pt\"\n",
"        )\n",
"\n",
"        label = torch.tensor(row['label'], dtype=torch.long)\n",
"\n",
"        # squeeze(0) drops the batch dim the tokenizer adds for return_tensors='pt'\n",
"        return {\n",
"            'input_ids': inputs['input_ids'].squeeze(0),\n",
"            'attention_mask': inputs['attention_mask'].squeeze(0),\n",
"            'label': label\n",
"        }\n",
"\n",
"def collate_fn(batch):\n",
"    \"\"\"Stack per-item tensors into batch tensors (items are fixed-length, so stacking is safe).\"\"\"\n",
"    input_ids = torch.stack([item['input_ids'] for item in batch])\n",
"    attention_masks = torch.stack([item['attention_mask'] for item in batch])\n",
"    labels = torch.stack([item['label'] for item in batch])\n",
"\n",
"    return {\n",
"        'input_ids': input_ids,\n",
"        'attention_mask': attention_masks,\n",
"        'label': labels\n",
"    }" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "device(type='cuda')" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "device" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [
"# Hold out the tail of the file as the evaluation set\n",
"# NOTE(review): this slices the *training* file (train_snli.txt) -- confirm rows from\n",
"# TEST_SPLIT_START onward were held out during fine-tuning, otherwise the perfect\n",
"# scores below are optimistic (train/test leakage).\n",
"TEST_SPLIT_START = 3_66_900\n",
"df_test = df[TEST_SPLIT_START:]\n",
"# GPT-2 tokenizers have no pad token by default; add one for fixed-length batching\n",
"tokenizer.add_special_tokens({'pad_token': '[PAD]'})\n",
"\n",
"# Resize the model's token embeddings to fit the new tokenizer\n",
"# NOTE(review): if '[PAD]' was not already in the checkpoint's vocab, this appends a\n",
"# randomly initialized embedding row -- verify the saved model was trained with this\n",
"# same pad token.\n",
"model.resize_token_embeddings(len(tokenizer))\n",
"\n",
"# Create DataLoader for the test set (no shuffling so predictions align with rows)\n",
"test_dataset = PlagiarismDataset(df_test, tokenizer)\n",
"test_data_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 1.00 1.00 1.00 236\n", " 1 1.00 1.00 1.00 237\n", "\n", " accuracy 1.00 473\n", " macro avg 1.00 1.00 1.00 473\n", "weighted avg 1.00 1.00 1.00 473\n", "\n" ] } ], "source": [
"from sklearn.metrics import classification_report\n",
"\n",
"# Move the model to the GPU if available ('device' is defined in the dataset cell)\n",
"model = model.to(device)\n",
"\n",
"def evaluate_model(model, data_loader):\n",
"    \"\"\"Run the model over data_loader and print a sklearn classification report.\n",
"\n",
"    Collects argmax predictions and true labels across all batches; no gradients\n",
"    are computed and the model is left in eval mode.\n",
"    \"\"\"\n",
"    model.eval()  # evaluation mode (disables dropout)\n",
"    preds_list = []\n",
"    labels_list = []\n",
"\n",
"    with torch.no_grad():  # inference only -- no gradient bookkeeping\n",
"        for batch in data_loader:\n",
"            # Move input tensors to the same device as the model\n",
"            input_ids = batch['input_ids'].to(device)\n",
"            attention_mask = batch['attention_mask'].to(device)\n",
"            labels = batch['label'].to(device)\n",
"\n",
"            # Predicted class = argmax over the two classification logits\n",
"            outputs = model(input_ids=input_ids, attention_mask=attention_mask)\n",
"            preds = torch.argmax(outputs.logits, dim=1)\n",
"\n",
"            preds_list.extend(preds.cpu().numpy())\n",
"            labels_list.extend(labels.cpu().numpy())\n",
"\n",
"    report = classification_report(labels_list, preds_list)\n",
"    print(\"Classification Report:\\n\", report)\n",
"\n",
"# Evaluate the model\n",
"evaluate_model(model, test_data_loader)" ] } ], "metadata": { "kernelspec": { "display_name": "LLM", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.20" } }, "nbformat": 4, "nbformat_minor": 2 }