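# app.py - Gradio Space that fine-tunes a sequence-classification model with a LoRA
# adapter on an IMDB-style dataset and reports sentiment accuracy on the test split.
# Expects an HF_TOKEN environment variable (or Space secret) for gated checkpoints
# such as the default google/gemma-7b.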
import os
import random

import gradio as gr
import numpy as np
import torch
from datasets import DatasetDict, load_dataset
from huggingface_hub import login
from peft import LoraConfig, get_peft_model
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          DataCollatorWithPadding, Trainer, TrainingArguments)

def process(model_id, dataset):
    # Step 1: Load dataset
    dataset_imdb = load_dataset(dataset)

    # Step 2: Reduce dataset (optional)
    reduction_rate = 0.1
    num_train_to_keep = int(reduction_rate * dataset_imdb["train"].num_rows)
    num_test_to_keep = int(reduction_rate * dataset_imdb["test"].num_rows)

    def select_random_indices(dataset, num_to_keep):
        indices = list(range(dataset.num_rows))
        random.shuffle(indices)
        return indices[:num_to_keep]

    train_indices = select_random_indices(dataset_imdb["train"], num_train_to_keep)
    test_indices = select_random_indices(dataset_imdb["test"], num_test_to_keep)
    dataset_imdb = DatasetDict({
        "train": dataset_imdb["train"].select(train_indices),
        "test": dataset_imdb["test"].select(test_indices),
    })
    # Step 3: Text tokenization
    # Authenticate first so gated checkpoints (e.g. google/gemma-7b) can be downloaded,
    # and tokenize with the same checkpoint the model is loaded from.
    login(token=os.environ.get("HF_TOKEN"))
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    def preprocess_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    # Step 4: Apply tokenization to dataset
    tokenized_imdb = dataset_imdb.map(preprocess_function, batched=True)

    # Step 5: Fine-tune the model with a LoRA adapter (binary sentiment classification)
    model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)
    lora_config = LoraConfig(task_type="SEQ_CLS")
    peft_model = get_peft_model(model, lora_config)
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        learning_rate=1e-5,
        # evaluation and saving must use the same strategy for load_best_model_at_end
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        eval_accumulation_steps=10,
    )
    def compute_metrics(eval_pred):
        # eval_pred.predictions are logits (numpy arrays), eval_pred.label_ids the true labels
        preds = np.argmax(eval_pred.predictions, axis=-1)
        return {"accuracy": float(np.mean(preds == eval_pred.label_ids))}

    trainer = Trainer(
        model=peft_model,
        args=training_args,
        train_dataset=tokenized_imdb["train"],
        eval_dataset=tokenized_imdb["test"],
        compute_metrics=compute_metrics,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    )
    trainer.train()

    # Step 6: Evaluate the fine-tuned model
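    # predict_sentiment below is a minimal sketch, an assumption rather than an
    # established API: tokenize one raw review, run the fine-tuned classifier, and map
    # the argmax logit to a label string (1 -> "positive", 0 -> "negative", the IMDB convention).
    def predict_sentiment(review):
        device = next(peft_model.parameters()).device
        inputs = tokenizer(review, return_tensors="pt", truncation=True).to(device)
        with torch.no_grad():
            logits = peft_model(**inputs).logits
        return "positive" if logits.argmax(dim=-1).item() == 1 else "negative"
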
    targets = []
    predictions = []
    for i in range(len(tokenized_imdb["test"])):
        review = tokenized_imdb["test"][i]["text"]
        # IMDB labels are integers (1 = positive, 0 = negative); map to strings for comparison
        target_sentiment = "positive" if tokenized_imdb["test"][i]["label"] == 1 else "negative"
        predicted_sentiment = predict_sentiment(review)
        if predicted_sentiment in ["positive", "negative"]:
            targets.append(target_sentiment)
            predictions.append(predicted_sentiment)
            print(f"Record {i+1} - Actual: {target_sentiment}, Predicted: {predicted_sentiment}")
    accuracy = sum(t == p for t, p in zip(targets, predictions)) / max(len(targets), 1)
    return f"Test accuracy: {accuracy:.4f}"

demo = gr.Interface(
    fn=process,
    inputs=[gr.Textbox(label="Model ID", value="google/gemma-7b", lines=1),
            gr.Textbox(label="Dataset", value="imdb", lines=1)],
    outputs=[gr.Textbox(label="Completion")],
)
demo.launch()