bstraehle committed
Commit 01e1b5d · 1 Parent(s): 022150f

Update app.py

Files changed (1):
  app.py  +29 -5
app.py CHANGED
@@ -35,9 +35,10 @@ def prompt_model(model_id, system_prompt, user_prompt, schema):
     return result
 
 def fine_tune_model(base_model_id, dataset):
-    tokenizer = download_model(base_model_id)
-    #download_dataset(dataset)
-    fine_tuned_model_id = upload_model(base_model_id, tokenizer)
+    #tokenizer = download_model(base_model_id)
+    download_dataset(dataset)
+    #fine_tuned_model_id = upload_model(base_model_id, tokenizer)
+    fine_tuned_model_id = base_model_id # DELETE
     return fine_tuned_model_id
 
 def download_model(base_model_id):
@@ -46,9 +47,32 @@ def download_model(base_model_id):
     model.save_pretrained(base_model_id)
     return tokenizer
 
+def create_conversation(sample):
+    return {
+        "messages": [
+            {"role": "system", "content": system_prompt.format(schema=sample["context"])},
+            {"role": "user", "content": sample["question"]},
+            {"role": "assistant", "content": sample["answer"]}
+        ]
+    }
+
 def download_dataset(dataset):
-    ds = load_dataset(dataset)
-    return ""
+    dataset = load_dataset("b-mc2/sql-create-context", split="train")
+    dataset = dataset.shuffle().select(range(12500))
+
+    # Convert dataset to OAI messages
+    dataset = dataset.map(create_conversation, remove_columns=dataset.features, batched=False)
+    # split dataset into 10,000 training samples and 2,500 test samples
+    dataset = dataset.train_test_split(test_size=2500/12500)
+
+    print(dataset["train"][345]["messages"])
+
+    # save datasets to disk
+    dataset["train"].to_json("train_dataset.json", orient="records")
+    dataset["test"].to_json("test_dataset.json", orient="records")
+    ###
+    dataset = load_dataset("json", data_files="train_dataset.json", split="train")
+    return "Done"
 
 def upload_model(base_model_id, tokenizer):
     fine_tuned_model_id = replace_hf_profile(base_model_id)
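
For reference, the new download_dataset() path follows the standard datasets pipeline for turning b-mc2/sql-create-context into OpenAI-style chat samples. The sketch below is a standalone, hedged reconstruction of that flow: the system_prompt template here is a hypothetical placeholder (app.py defines its own prompt elsewhere, outside this diff), and a fixed shuffle seed is added only to make the example reproducible.

from datasets import load_dataset

# Hypothetical stand-in for the system prompt app.py defines elsewhere (not part of this diff).
system_prompt = (
    "You are a text-to-SQL assistant. Given the database schema below, "
    "answer the user's question with a SQL query.\nSCHEMA: {schema}"
)

def create_conversation(sample):
    # Each row has "context" (the schema), "question", and "answer" columns;
    # map it to an OpenAI-style messages list.
    return {
        "messages": [
            {"role": "system", "content": system_prompt.format(schema=sample["context"])},
            {"role": "user", "content": sample["question"]},
            {"role": "assistant", "content": sample["answer"]},
        ]
    }

dataset = load_dataset("b-mc2/sql-create-context", split="train")
dataset = dataset.shuffle(seed=42).select(range(12500))

# Convert every row to chat messages and drop the original columns.
dataset = dataset.map(create_conversation, remove_columns=dataset.features, batched=False)

# 10,000 training samples and 2,500 test samples.
dataset = dataset.train_test_split(test_size=2500 / 12500)

# Persist both splits as JSON records.
dataset["train"].to_json("train_dataset.json", orient="records")
dataset["test"].to_json("test_dataset.json", orient="records")

Reloading the saved split with load_dataset("json", data_files="train_dataset.json", split="train"), as the committed code does on its last line, round-trips the records so a later fine-tuning step can consume them.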