import os
import random

import gradio as gr
import numpy as np
import torch
from datasets import DatasetDict, load_dataset
from huggingface_hub import login
from peft import LoraConfig, get_peft_model
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          DataCollatorWithPadding, Trainer, TrainingArguments)


def process(model_id, dataset):
    # Step 1: Load dataset (expects "train"/"test" splits with "text" and "label" columns)
    dataset_imdb = load_dataset(dataset)

    # Step 2: Reduce dataset (optional) - keep a random 10% of each split
    reduction_rate = 0.1
    num_train_to_keep = int(reduction_rate * dataset_imdb["train"].num_rows)
    num_test_to_keep = int(reduction_rate * dataset_imdb["test"].num_rows)

    def select_random_indices(dataset, num_to_keep):
        indices = list(range(dataset.num_rows))
        random.shuffle(indices)
        return indices[:num_to_keep]

    train_indices = select_random_indices(dataset_imdb["train"], num_train_to_keep)
    test_indices = select_random_indices(dataset_imdb["test"], num_test_to_keep)
    dataset_imdb = DatasetDict({
        "train": dataset_imdb["train"].select(train_indices),
        "test": dataset_imdb["test"].select(test_indices),
    })

    # Step 3: Text tokenization - use the tokenizer that matches the model being
    # fine-tuned; padding is left to DataCollatorWithPadding for dynamic batching
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    def preprocess_function(examples):
        return tokenizer(examples["text"], truncation=True)

    # Step 4: Apply tokenization to dataset
    tokenized_imdb = dataset_imdb.map(preprocess_function, batched=True)

    # Step 5: Fine-tune the model with LoRA
    login(token=os.environ.get("HF_TOKEN"))
    model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)
    lora_config = LoraConfig(task_type="SEQ_CLS")
    peft_model = get_peft_model(model, lora_config)

    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        learning_rate=1e-5,
        # evaluation and saving must use the same strategy when
        # load_best_model_at_end=True
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        eval_accumulation_steps=10,
    )

    trainer = Trainer(
        model=peft_model,
        args=training_args,
        train_dataset=tokenized_imdb["train"],
        eval_dataset=tokenized_imdb["test"],
        compute_metrics=lambda pred: {
            "accuracy": (pred.predictions.argmax(-1) == pred.label_ids).mean()
        },
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    )
    trainer.train()

    # Step 6: Evaluate the fine-tuned model record by record
    id2label = {0: "negative", 1: "positive"}
    device = next(peft_model.parameters()).device

    def predict_sentiment(review):
        inputs = tokenizer(review, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():
            logits = peft_model(**inputs).logits
        return id2label[logits.argmax(-1).item()]

    targets = []
    predictions = []
    for i in range(len(tokenized_imdb["test"])):
        review = tokenized_imdb["test"][i]["text"]
        target_sentiment = id2label[tokenized_imdb["test"][i]["label"]]
        predicted_sentiment = predict_sentiment(review)
        targets.append(target_sentiment)
        predictions.append(predicted_sentiment)
        print(f"Record {i+1} - Actual: {target_sentiment}, Predicted: {predicted_sentiment}")

    accuracy = np.mean([t == p for t, p in zip(targets, predictions)])
    return f"Done - test accuracy: {accuracy:.4f}"


demo = gr.Interface(
    fn=process,
    inputs=[
        gr.Textbox(label="Model ID", value="bert-base-cased", lines=1),
        gr.Textbox(label="Dataset", value="imdb", lines=1),
    ],
    outputs=[gr.Textbox(label="Completion")],
)
demo.launch()