Spaces:

bstraehle
/

sft

Running

File size: 14,970 Bytes

# https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/sample_finetune.py
import gradio as gr
import os, torch
#from datasets import load_dataset
#from huggingface_hub import HfApi, login
#from peft import AutoPeftModelForCausalLM, LoraConfig
#from random import randint
#from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, pipeline
#from trl import SFTTrainer, setup_chat_format

import datasets, sys, logging, torch, transformers
from datasets import load_dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from trl import SFTTrainer

# Fine-tune on NVidia 4xL4 (sleep after 10 hours)

hf_profile = "bstraehle"

action_1 = "Fine-tune pre-trained model"
action_2 = "Prompt fine-tuned model"

system_prompt = "You are a text to SQL query translator. Given a question in English, generate a SQL query based on the provided SCHEMA. Do not generate any additional text. SCHEMA: {schema}"
user_prompt = "What is the total trade value and average price for each trader and stock in the trade_history table?"
schema = "CREATE TABLE trade_history (id INT, trader_id INT, stock VARCHAR(255), price DECIMAL(5,2), quantity INT, trade_time TIMESTAMP);"

base_model_id = "microsoft/Phi-3-mini-4k-instruct"
dataset = "b-mc2/sql-create-context"

def prompt_model(model_id, system_prompt, user_prompt, schema):
    pipe = pipeline("text-generation", 
                    model=model_id, 
                    model_kwargs={"torch_dtype": torch.bfloat16}, 
                    device_map="auto",
                    max_new_tokens=1000)
    messages = [
      {"role": "system", "content": system_prompt.format(schema=schema)},
      {"role": "user", "content": user_prompt},
      {"role": "assistant", "content": ""}
    ]
    output = pipe(messages)
    result = output[0]["generated_text"][-1]["content"]
    print(result)
    return result
    
#    peft_model_id = "./code-llama-7b-text-to-sql"
#    # peft_model_id = args.output_dir
     
#    # Load Model with PEFT adapter
#    model = AutoPeftModelForCausalLM.from_pretrained(
#      peft_model_id,
#      device_map="auto",
#      torch_dtype=torch.float16
#    )
#    tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
#    # load into pipeline
#    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

    ###

#    eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
#    rand_idx = randint(0, len(eval_dataset))
     
#    # Test on sample
#    prompt = pipe.tokenizer.apply_chat_template(eval_dataset[rand_idx]["messages"][:2], tokenize=False, add_generation_prompt=True)
#    outputs = pipe(prompt, max_new_tokens=256, do_sample=False, temperature=0.1, top_k=50, top_p=0.1, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id)
     
#    print(f"Query:\n{eval_dataset[rand_idx]['messages'][1]['content']}")
#    print(f"Original Answer:\n{eval_dataset[rand_idx]['messages'][2]['content']}")
#    print(f"Generated Answer:\n{outputs[0]['generated_text'][len(prompt):].strip()}")

def fine_tune_model(base_model_id, dataset):
    test(base_model_id, dataset)
    ##tokenizer = download_model(base_model_id)
    #prepare_dataset(dataset)
    #train_model(base_model_id)
    ##fine_tuned_model_id = upload_model(base_model_id, tokenizer)
    return "fine_tuned_model_id"

def create_conversation(sample):
  return {
    "messages": [
      {"role": "system", "content": system_prompt.format(schema=sample["context"])},
      {"role": "user", "content": sample["question"]},
      {"role": "assistant", "content": sample["answer"]}
    ]
  }

# Define the formatting function for the prompts
def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = []
    mapper = {"system": "system\n", "human": "\nuser\n", "gpt": "\nassistant\n"}
    end_mapper = {"system": "", "human": "", "gpt": ""}
    for convo in convos:
        text = "".join(f"{mapper[(turn := x['from'])]} {x['value']}\n{end_mapper[turn]}" for x in convo)
        texts.append(f"{text}{tokenizer.eos_token}")
    return {"text": texts}
    
def test(base_model_id, dataset):
    ###################
    # Hyper-parameters
    ###################
    training_config = {
        "bf16": True,
        "do_eval": False,
        "learning_rate": 5.0e-06,
        "log_level": "info",
        "logging_steps": 20,
        "logging_strategy": "steps",
        "lr_scheduler_type": "cosine",
        "num_train_epochs": 1,
        "max_steps": -1,
        "output_dir": "./checkpoint_dir",
        "overwrite_output_dir": True,
        "per_device_eval_batch_size": 4,
        "per_device_train_batch_size": 4,
        "remove_unused_columns": True,
        "save_steps": 100,
        "save_total_limit": 1,
        "seed": 0,
        "gradient_checkpointing": True,
        "gradient_checkpointing_kwargs":{"use_reentrant": False},
        "gradient_accumulation_steps": 1,
        "warmup_ratio": 0.2,
        }
    
    peft_config = {
        "r": 16,
        "lora_alpha": 32,
        "lora_dropout": 0.05,
        "bias": "none",
        "task_type": "CAUSAL_LM",
        "target_modules": "all-linear",
        "modules_to_save": None,
    }
    train_conf = TrainingArguments(**training_config)
    peft_conf = LoraConfig(**peft_config)
    
    
    ###############
    # Setup logging
    ###############
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    log_level = train_conf.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()
    
    # Log on each process a small summary
    logger.warning(
        f"Process rank: {train_conf.local_rank}, device: {train_conf.device}, n_gpu: {train_conf.n_gpu}"
        + f" distributed training: {bool(train_conf.local_rank != -1)}, 16-bits training: {train_conf.fp16}"
    )
    logger.info(f"Training/evaluation parameters {train_conf}")
    logger.info(f"PEFT parameters {peft_conf}")
    
    
    ################
    # Model Loading
    ################
    checkpoint_path = "microsoft/Phi-3-mini-4k-instruct"
    # checkpoint_path = "microsoft/Phi-3-mini-128k-instruct"
    model_kwargs = dict(
        use_cache=False,
        trust_remote_code=True,
        attn_implementation="flash_attention_2",  # loading the model with flash-attenstion support
        torch_dtype=torch.bfloat16,
        device_map=None
    )
    model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
    tokenizer.model_max_length = 2048
    tokenizer.pad_token = tokenizer.unk_token  # use unk rather than eos token to prevent endless generation
    tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    tokenizer.padding_side = 'right'
    
    
    ##################
    # Data Processing
    ##################
    def apply_chat_template(
        example,
        tokenizer,
    ):
        messages = example["messages"]
        example["text"] = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=False)
        return example
    
    raw_dataset = load_dataset("HuggingFaceH4/ultrachat_200k")
    train_dataset = raw_dataset["train_sft"]
    test_dataset = raw_dataset["test_sft"]
    column_names = list(train_dataset.features)
    
    processed_train_dataset = train_dataset.map(
        apply_chat_template,
        fn_kwargs={"tokenizer": tokenizer},
        num_proc=10,
        remove_columns=column_names,
        desc="Applying chat template to train_sft",
    )
    
    processed_test_dataset = test_dataset.map(
        apply_chat_template,
        fn_kwargs={"tokenizer": tokenizer},
        num_proc=10,
        remove_columns=column_names,
        desc="Applying chat template to test_sft",
    )
    
    
    ###########
    # Training
    ###########
    trainer = SFTTrainer(
        model=model,
        args=train_conf,
        peft_config=peft_conf,
        train_dataset=processed_train_dataset,
        eval_dataset=processed_test_dataset,
        max_seq_length=2048,
        dataset_text_field="text",
        tokenizer=tokenizer,
        packing=True
    )
    train_result = trainer.train()
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    
    
    #############
    # Evaluation
    #############
    tokenizer.padding_side = 'left'
    metrics = trainer.evaluate()
    metrics["eval_samples"] = len(processed_test_dataset)
    trainer.log_metrics("eval", metrics)
    trainer.save_metrics("eval", metrics)
    
    
    # ############
    # # Save model
    # ############
    trainer.save_model(train_conf.output_dir)

def download_model(base_model_id):
    tokenizer = AutoTokenizer.from_pretrained(base_model_id)
    model = AutoModelForCausalLM.from_pretrained(base_model_id)
    model.save_pretrained(base_model_id)
    return tokenizer
    
def prepare_dataset(dataset):
    dataset = load_dataset(dataset, split="train")
    dataset = dataset.shuffle().select(range(12500))
     
    # Convert dataset to OAI messages
    dataset = dataset.map(create_conversation, remove_columns=dataset.features,batched=False)
    # split dataset into 10,000 training samples and 2,500 test samples
    dataset = dataset.train_test_split(test_size=2500/12500)
     
    print(dataset["train"][345]["messages"])
     
    # save datasets to disk
    dataset["train"].to_json("train_dataset.json", orient="records")
    dataset["test"].to_json("test_dataset.json", orient="records")
    ###

def train_model(model_id):
    print("111")
    dataset = load_dataset("json", data_files="train_dataset.json", split="train")

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
    )

    print("222")
    # Load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        #attn_implementation="flash_attention_2",
        torch_dtype=torch.bfloat16,
        quantization_config=bnb_config
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.padding_side = 'right' # to prevent warnings

    print("333")
    # # set chat template to OAI chatML, remove if you start from a fine-tuned model
    model, tokenizer = setup_chat_format(model, tokenizer)

    peft_config = LoraConfig(
        lora_alpha=128,
        lora_dropout=0.05,
        r=256,
        bias="none",
        target_modules="all-linear",
        task_type="CAUSAL_LM",
    )

    print("444")
    args = TrainingArguments(
        output_dir="code-llama-7b-text-to-sql", # directory to save and repository id
        num_train_epochs=3,                     # number of training epochs
        per_device_train_batch_size=3,          # batch size per device during training
        gradient_accumulation_steps=2,          # number of steps before performing a backward/update pass
        gradient_checkpointing=True,            # use gradient checkpointing to save memory
        optim="adamw_torch_fused",              # use fused adamw optimizer
        logging_steps=10,                       # log every 10 steps
        save_strategy="epoch",                  # save checkpoint every epoch
        learning_rate=2e-4,                     # learning rate, based on QLoRA paper
        bf16=True,                              # use bfloat16 precision
        tf32=True,                              # use tf32 precision
        max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
        warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
        lr_scheduler_type="constant",           # use constant learning rate scheduler
        push_to_hub=True,                       # push model to hub
        report_to="tensorboard",                # report metrics to tensorboard
    )

    max_seq_length = 3072 # max sequence length for model and packing of the dataset

    print("555")
    trainer = SFTTrainer(
        model=model,
        args=args,
        train_dataset=dataset,
        peft_config=peft_config,
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        packing=True,
        dataset_kwargs={
            "add_special_tokens": False,  # We template with special tokens
            "append_concat_token": False, # No need to add additional separator token
        }
    )

    print("666")
    # start training, the model will be automatically saved to the hub and the output directory
    trainer.train()
     
    print("777")
    # save model
    trainer.save_model()

    del model
    del trainer
    torch.cuda.empty_cache()
    
def upload_model(base_model_id, tokenizer):
    fine_tuned_model_id = replace_hf_profile(base_model_id)
    login(token=os.environ["HF_TOKEN"])
    api = HfApi()
    #api.delete_repo(repo_id=fine_tuned_model_id, repo_type="model")
    api.create_repo(repo_id=fine_tuned_model_id)
    api.upload_folder(
        folder_path=base_model_id,
        repo_id=fine_tuned_model_id
    )
    tokenizer.push_to_hub(fine_tuned_model_id)
    return fine_tuned_model_id

def replace_hf_profile(base_model_id):
    model_id = base_model_id[base_model_id.rfind('/')+1:]
    return f"{hf_profile}/{model_id}"

def process(action, base_model_id, dataset, system_prompt, user_prompt, schema):
    #raise gr.Error("Please clone and bring your own credentials.")
    if action == action_1:
        result = fine_tune_model(base_model_id, dataset)
    elif action == action_2:
        fine_tuned_model_id = replace_hf_profile(base_model_id)
        result = prompt_model(fine_tuned_model_id, system_prompt, user_prompt, schema)
    return result

demo = gr.Interface(fn=process, 
                    inputs=[gr.Radio([action_1, action_2], label = "Action", value = action_1),
                            gr.Textbox(label = "Base Model ID", value = base_model_id, lines = 1),
                            gr.Textbox(label = "Dataset", value = dataset, lines = 1),
                            gr.Textbox(label = "System Prompt", value = system_prompt, lines = 2),
                            gr.Textbox(label = "User Prompt", value = user_prompt, lines = 2),
                            gr.Textbox(label = "Schema", value = schema, lines = 2)],
                    outputs=[gr.Textbox(label = "Completion", value = os.environ["OUTPUT"])])
demo.launch()