import os
import random

import gradio as gr
import torch
from datasets import DatasetDict, load_dataset
from huggingface_hub import login
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
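

# Fine-tune a sequence-classification checkpoint with a LoRA adapter on an
# IMDB-style dataset ("text"/"label" columns) and report per-review
# predictions through a small Gradio UI.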
def process(model_id, dataset):
    dataset_imdb = load_dataset(dataset)
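
    # Keep a random 10% of each split so experiments run quickly.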
    reduction_rate = 0.1
    num_train_to_keep = int(reduction_rate * dataset_imdb["train"].num_rows)
    num_test_to_keep = int(reduction_rate * dataset_imdb["test"].num_rows)

    def select_random_indices(dataset, num_to_keep):
        indices = list(range(dataset.num_rows))
        random.shuffle(indices)
        return indices[:num_to_keep]

    train_indices = select_random_indices(dataset_imdb["train"], num_train_to_keep)
    test_indices = select_random_indices(dataset_imdb["test"], num_test_to_keep)

    dataset_imdb = DatasetDict({
        "train": dataset_imdb["train"].select(train_indices),
        "test": dataset_imdb["test"].select(test_indices),
    })
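
    # Tokenize both splits with the checkpoint's own tokenizer; map() keeps
    # the "text" and "label" columns alongside the new model inputs.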
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    def preprocess_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    tokenized_imdb = dataset_imdb.map(preprocess_function, batched=True)
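
    # Authenticate with the Hugging Face Hub (expects HF_TOKEN in the environment).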
    login(token=os.environ.get("HF_TOKEN"))
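
    # Load the base classifier and add a LoRA adapter so only the low-rank
    # adapter weights (plus the classification head) are trained.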
    model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)
    lora_config = LoraConfig(task_type="SEQ_CLS")
    peft_model = get_peft_model(model, lora_config)
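
    # evaluation_strategy and save_strategy must match when
    # load_best_model_at_end=True, so both are set to "epoch".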
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        learning_rate=1e-5,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        eval_accumulation_steps=10,
    )
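
    # The collator pads each batch dynamically; compute_metrics receives
    # numpy arrays, so accuracy is the fraction of correct argmax predictions.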
    trainer = Trainer(
        model=peft_model,
        args=training_args,
        train_dataset=tokenized_imdb["train"],
        eval_dataset=tokenized_imdb["test"],
        compute_metrics=lambda pred: {
            "accuracy": (pred.predictions.argmax(-1) == pred.label_ids).mean()
        },
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    )
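
    # Fine-tune; with LoRA only the adapter and classifier parameters are updated.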
    trainer.train()
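
    # Minimal sketch of a predict_sentiment helper (assumes binary labels:
    # 0 = negative, 1 = positive, as in IMDB).
    id2label = {0: "negative", 1: "positive"}
    device = next(peft_model.parameters()).device
    peft_model.eval()  # inference mode: disables dropout

    def predict_sentiment(review):
        inputs = tokenizer(review, return_tensors="pt", truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            logits = peft_model(**inputs).logits
        return id2label[int(logits.argmax(-1).item())]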

    # Compare predictions against gold labels on the held-out split.
    targets = []
    predictions = []
    for i in range(len(tokenized_imdb["test"])):
        review = tokenized_imdb["test"][i]["text"]
        target_sentiment = id2label[tokenized_imdb["test"][i]["label"]]
        predicted_sentiment = predict_sentiment(review)
        if predicted_sentiment in ["positive", "negative"]:
            targets.append(target_sentiment)
            predictions.append(predicted_sentiment)
            print(f"Record {i+1} - Actual: {target_sentiment}, Predicted: {predicted_sentiment}")

    # Summarize held-out accuracy in the string returned to the UI.
    accuracy = sum(t == p for t, p in zip(targets, predictions)) / max(len(targets), 1)
    return f"Done. Test accuracy: {accuracy:.3f}"
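

# Simple Gradio front end: choose a checkpoint and a dataset, then run the
# full fine-tuning pipeline.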
demo = gr.Interface(
    fn=process,
    inputs=[
        gr.Textbox(label="Model ID", value="bert-base-cased", lines=1),
        gr.Textbox(label="Dataset", value="imdb", lines=1),
    ],
    outputs=[gr.Textbox(label="Completion")],
)
demo.launch()