Update app.py
Browse files
app.py
CHANGED
@@ -35,9 +35,10 @@ def prompt_model(model_id, system_prompt, user_prompt, schema):
|
|
35 |
return result
|
36 |
|
37 |
def fine_tune_model(base_model_id, dataset):
|
38 |
-
tokenizer = download_model(base_model_id)
|
39 |
-
|
40 |
-
fine_tuned_model_id = upload_model(base_model_id, tokenizer)
|
|
|
41 |
return fine_tuned_model_id
|
42 |
|
43 |
def download_model(base_model_id):
|
@@ -46,9 +47,32 @@ def download_model(base_model_id):
|
|
46 |
model.save_pretrained(base_model_id)
|
47 |
return tokenizer
|
48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
def download_dataset(dataset):
|
50 |
-
|
51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
|
53 |
def upload_model(base_model_id, tokenizer):
|
54 |
fine_tuned_model_id = replace_hf_profile(base_model_id)
|
|
|
35 |
return result
|
36 |
|
37 |
def fine_tune_model(base_model_id, dataset):
    """Prepare the training data for *base_model_id* and return a model id.

    NOTE(review): the actual fine-tuning steps (download_model / upload_model)
    were disabled in this revision — this currently only runs the
    dataset-preparation step and returns the base model id unchanged.
    Confirm this stubbed behavior is intentional before shipping.
    """
    download_dataset(dataset)
    # Training is stubbed out: hand the caller back the base model id so the
    # rest of the pipeline keeps working until fine-tuning is re-enabled.
    return base_model_id
|
43 |
|
44 |
def download_model(base_model_id):
|
|
|
47 |
model.save_pretrained(base_model_id)
|
48 |
return tokenizer
|
49 |
|
50 |
+
def create_conversation(sample):
    """Convert one raw dataset row into an OpenAI-style chat ``messages`` dict.

    The row is expected to carry ``context`` (the SQL schema), ``question``,
    and ``answer`` fields; ``system_prompt`` is a module-level template.
    """
    rendered_system = system_prompt.format(schema=sample["context"])
    messages = [
        {"role": "system", "content": rendered_system},
        {"role": "user", "content": sample["question"]},
        {"role": "assistant", "content": sample["answer"]},
    ]
    return {"messages": messages}
|
58 |
+
|
59 |
def download_dataset(dataset):
    """Build chat-formatted train/test JSON files for fine-tuning.

    Downloads the ``b-mc2/sql-create-context`` dataset, keeps a shuffled
    12,500-row sample, converts each row to OpenAI-style messages via
    ``create_conversation``, splits it into 10,000 train / 2,500 test rows,
    and writes both splits to ``train_dataset.json`` / ``test_dataset.json``
    in the working directory.

    NOTE(review): the *dataset* argument is currently ignored — the source
    dataset id is hard-coded; confirm whether the argument should be used.

    Returns the literal string ``"Done"``.
    """
    # Use a local name instead of shadowing the (unused) parameter.
    data = load_dataset("b-mc2/sql-create-context", split="train")
    data = data.shuffle().select(range(12500))

    # Convert dataset rows to OAI messages; drop the original columns.
    data = data.map(create_conversation, remove_columns=data.features, batched=False)
    # Split into 10,000 training samples and 2,500 test samples.
    data = data.train_test_split(test_size=2500 / 12500)

    print(data["train"][345]["messages"])  # debug: spot-check one converted row

    # Save both splits to disk as JSON records.
    data["train"].to_json("train_dataset.json", orient="records")
    data["test"].to_json("test_dataset.json", orient="records")
    # Dropped dead code: the original re-loaded train_dataset.json into a
    # local that was never used before returning.
    return "Done"
|
76 |
|
77 |
def upload_model(base_model_id, tokenizer):
|
78 |
fine_tuned_model_id = replace_hf_profile(base_model_id)
|