import gradio as gr
import torch
import pandas as pd
import bitsandbytes as bnb
import evaluate
import numpy as np
import random
import huggingface_hub
import os
from datasets import Dataset, DatasetDict, load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from transformers import (AutoTokenizer, BitsAndBytesConfig, TrainingArguments, AutoModelForSequenceClassification, Trainer, EarlyStoppingCallback, DataCollatorWithPadding)
from huggingface_hub import login
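# Note: bitsandbytes, BitsAndBytesConfig and prepare_model_for_kbit_training are
# imported but not used below; they would only matter if the base model were loaded
# in a quantized (k-bit) configuration. EarlyStoppingCallback is likewise imported
# but never registered with the Trainer.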
def process(model_id, dataset):
    # Step 1: Load dataset
    dataset_imdb = load_dataset(dataset)

    # Step 2: Reduce dataset (optionally keep a random 10% of each split)
    reduction_rate = 0.1
    num_train_to_keep = int(reduction_rate * dataset_imdb["train"].num_rows)
    num_test_to_keep = int(reduction_rate * dataset_imdb["test"].num_rows)

    def select_random_indices(split, num_to_keep):
        indices = list(range(split.num_rows))
        random.shuffle(indices)
        return indices[:num_to_keep]

    train_indices = select_random_indices(dataset_imdb["train"], num_train_to_keep)
    test_indices = select_random_indices(dataset_imdb["test"], num_test_to_keep)
    dataset_imdb = DatasetDict({
        "train": dataset_imdb["train"].select(train_indices),
        "test": dataset_imdb["test"].select(test_indices),
    })
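    # An equivalent, seeded sketch using the standard Dataset.shuffle()/select()
    # methods (not part of the original flow, shown commented out for reference):
    #
    #     dataset_imdb = DatasetDict({
    #         split: dataset_imdb[split].shuffle(seed=42).select(
    #             range(int(reduction_rate * dataset_imdb[split].num_rows))
    #         )
    #         for split in ("train", "test")
    #     })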
    # Step 3: Text tokenization (use the tokenizer that matches the model being fine-tuned)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    def preprocess_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    # Step 4: Apply tokenization to dataset
    tokenized_imdb = dataset_imdb.map(preprocess_function, batched=True)
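    # map() adds the tokenizer's output columns (input_ids, attention_mask and, where
    # the tokenizer produces them, token_type_ids) alongside the existing "text" and
    # "label" columns; padding="max_length" pads every example up front, so the
    # DataCollatorWithPadding below mostly just batches the already-padded features.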
    # Step 5: Fine-tune the model with LoRA adapters
    login(token=os.environ.get("HF_TOKEN"))
    # num_labels=2 for the binary (negative/positive) IMDB labels
    model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)
    # SEQ_CLS is the PEFT task type for sequence classification; with no
    # target_modules given, PEFT falls back to its per-architecture defaults.
    lora_config = LoraConfig(task_type="SEQ_CLS")
    peft_model = get_peft_model(model, lora_config)
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        # evaluation and save strategies must match when load_best_model_at_end=True
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=1e-5,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        eval_accumulation_steps=10,
    )
    def compute_metrics(eval_pred):
        # Accuracy: fraction of examples whose argmax logit matches the true label
        preds = np.argmax(eval_pred.predictions, axis=-1)
        return {"accuracy": float(np.mean(preds == eval_pred.label_ids))}

    trainer = Trainer(
        model=peft_model,
        args=training_args,
        train_dataset=tokenized_imdb["train"],
        eval_dataset=tokenized_imdb["test"],
        compute_metrics=compute_metrics,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    )
    trainer.train()
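    # After training, trainer.evaluate() reports the same accuracy metric over the
    # test split, e.g.:
    #
    #     metrics = trainer.evaluate()
    #     print(metrics["eval_accuracy"])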
    # Step 6: Evaluate the fine-tuned model on the held-out reviews
    id2label = {0: "negative", 1: "positive"}
    peft_model.eval()

    def predict_sentiment(review):
        # Run the fine-tuned model on a single review and map the argmax class to a label
        device = next(peft_model.parameters()).device
        inputs = tokenizer(review, return_tensors="pt", truncation=True).to(device)
        with torch.no_grad():
            logits = peft_model(**inputs).logits
        return id2label[int(logits.argmax(-1).item())]

    targets = []
    predictions = []
    for i in range(len(tokenized_imdb["test"])):
        review = tokenized_imdb["test"][i]["text"]
        target_sentiment = id2label[tokenized_imdb["test"][i]["label"]]
        predicted_sentiment = predict_sentiment(review)
        if predicted_sentiment in ["positive", "negative"]:
            targets.append(target_sentiment)
            predictions.append(predicted_sentiment)
            print(f"Record {i+1} - Actual: {target_sentiment}, Predicted: {predicted_sentiment}")
    return "Done"
demo = gr.Interface(
    fn=process,
    inputs=[
        gr.Textbox(label="Model ID", value="google/gemma-7b", lines=1),
        gr.Textbox(label="Dataset", value="imdb", lines=1),
    ],
    outputs=[gr.Textbox(label="Completion")],
)
demo.launch()
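# A minimal sketch of invoking the pipeline without the Gradio UI (assumes the
# HF_TOKEN environment variable is set and the chosen model fits on the available
# hardware); left commented out so the Space still launches the web demo:
#
#     result = process("google/gemma-7b", "imdb")
#     print(result)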