erbacher committed on
Commit
be5ea5d
1 Parent(s): 20c7a36

Model save

Browse files
Files changed (5) hide show
  1. README.md +14 -11
  2. all_results.json +9 -9
  3. eval_results.json +5 -5
  4. train_results.json +5 -5
  5. trainer_state.json +104 -12
README.md CHANGED
@@ -2,13 +2,12 @@
2
  license: mit
3
  library_name: peft
4
  tags:
5
- - alignment-handbook
6
- - generated_from_trainer
7
  - trl
8
  - sft
 
9
  base_model: HuggingFaceH4/zephyr-7b-beta
10
  datasets:
11
- - erbacher/rag-and-roll
12
  model-index:
13
  - name: zephyr-rag-agent
14
  results: []
@@ -19,9 +18,9 @@ should probably proofread and complete it, then remove this comment. -->
19
 
20
  # zephyr-rag-agent
21
 
22
- This model is a fine-tuned version of [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) on the erbacher/rag-and-roll dataset.
23
  It achieves the following results on the evaluation set:
24
- - Loss: 1.1623
25
 
26
  ## Model description
27
 
@@ -40,19 +39,19 @@ More information needed
40
  ### Training hyperparameters
41
 
42
  The following hyperparameters were used during training:
43
- - learning_rate: 0.0002
44
- - train_batch_size: 2
45
- - eval_batch_size: 2
46
  - seed: 42
47
  - distributed_type: multi-GPU
48
  - num_devices: 2
49
  - gradient_accumulation_steps: 16
50
- - total_train_batch_size: 64
51
- - total_eval_batch_size: 4
52
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
53
  - lr_scheduler_type: cosine
54
  - lr_scheduler_warmup_ratio: 0.1
55
- - num_epochs: 3
56
 
57
  ### Training results
58
 
@@ -61,6 +60,10 @@ The following hyperparameters were used during training:
61
  | 1.1559 | 0.99 | 27 | 1.1650 |
62
  | 1.0887 | 1.98 | 54 | 1.1555 |
63
  | 1.0566 | 2.97 | 81 | 1.1623 |
 
 
 
 
64
 
65
 
66
  ### Framework versions
 
2
  license: mit
3
  library_name: peft
4
  tags:
 
 
5
  - trl
6
  - sft
7
+ - generated_from_trainer
8
  base_model: HuggingFaceH4/zephyr-7b-beta
9
  datasets:
10
+ - generator
11
  model-index:
12
  - name: zephyr-rag-agent
13
  results: []
 
18
 
19
  # zephyr-rag-agent
20
 
21
+ This model is a fine-tuned version of [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 1.1829
24
 
25
  ## Model description
26
 
 
39
  ### Training hyperparameters
40
 
41
  The following hyperparameters were used during training:
42
+ - learning_rate: 4e-05
43
+ - train_batch_size: 4
44
+ - eval_batch_size: 4
45
  - seed: 42
46
  - distributed_type: multi-GPU
47
  - num_devices: 2
48
  - gradient_accumulation_steps: 16
49
+ - total_train_batch_size: 128
50
+ - total_eval_batch_size: 8
51
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
52
  - lr_scheduler_type: cosine
53
  - lr_scheduler_warmup_ratio: 0.1
54
+ - num_epochs: 10
55
 
56
  ### Training results
57
 
 
60
  | 1.1559 | 0.99 | 27 | 1.1650 |
61
  | 1.0887 | 1.98 | 54 | 1.1555 |
62
  | 1.0566 | 2.97 | 81 | 1.1623 |
63
+ | 1.0264 | 6.95 | 91 | 1.1689 |
64
+ | 0.9977 | 7.97 | 105 | 1.1779 |
65
+ | 0.9808 | 9.0 | 119 | 1.1820 |
66
+ | 0.9791 | 9.8 | 130 | 1.1829 |
67
 
68
 
69
  ### Framework versions
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 2.97,
3
- "eval_loss": 1.1623399257659912,
4
- "eval_runtime": 279.5897,
5
  "eval_samples": 716,
6
- "eval_samples_per_second": 2.167,
7
- "eval_steps_per_second": 0.544,
8
- "train_loss": 1.1298090528558802,
9
- "train_runtime": 14861.7425,
10
  "train_samples": 1922,
11
- "train_samples_per_second": 0.352,
12
- "train_steps_per_second": 0.005
13
  }
 
1
  {
2
+ "epoch": 9.8,
3
+ "eval_loss": 1.1828593015670776,
4
+ "eval_runtime": 207.1428,
5
  "eval_samples": 716,
6
+ "eval_samples_per_second": 2.926,
7
+ "eval_steps_per_second": 0.367,
8
+ "train_loss": 0.37551442659818207,
9
+ "train_runtime": 8428.1057,
10
  "train_samples": 1922,
11
+ "train_samples_per_second": 2.07,
12
+ "train_steps_per_second": 0.015
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 2.97,
3
- "eval_loss": 1.1623399257659912,
4
- "eval_runtime": 279.5897,
5
  "eval_samples": 716,
6
- "eval_samples_per_second": 2.167,
7
- "eval_steps_per_second": 0.544
8
  }
 
1
  {
2
+ "epoch": 9.8,
3
+ "eval_loss": 1.1828593015670776,
4
+ "eval_runtime": 207.1428,
5
  "eval_samples": 716,
6
+ "eval_samples_per_second": 2.926,
7
+ "eval_steps_per_second": 0.367
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 2.97,
3
- "train_loss": 1.1298090528558802,
4
- "train_runtime": 14861.7425,
5
  "train_samples": 1922,
6
- "train_samples_per_second": 0.352,
7
- "train_steps_per_second": 0.005
8
  }
 
1
  {
2
+ "epoch": 9.8,
3
+ "train_loss": 0.37551442659818207,
4
+ "train_runtime": 8428.1057,
5
  "train_samples": 1922,
6
+ "train_samples_per_second": 2.07,
7
+ "train_steps_per_second": 0.015
8
  }
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.965675057208238,
5
  "eval_steps": 500,
6
- "global_step": 81,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -135,21 +135,113 @@
135
  "step": 81
136
  },
137
  {
138
- "epoch": 2.97,
139
- "step": 81,
140
- "total_flos": 173916983132160.0,
141
- "train_loss": 1.1298090528558802,
142
- "train_runtime": 14861.7425,
143
- "train_samples_per_second": 0.352,
144
- "train_steps_per_second": 0.005
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  }
146
  ],
147
  "logging_steps": 5,
148
- "max_steps": 81,
149
  "num_input_tokens_seen": 0,
150
- "num_train_epochs": 3,
151
  "save_steps": 500,
152
- "total_flos": 173916983132160.0,
153
  "train_batch_size": 2,
154
  "trial_name": null,
155
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 9.799086757990867,
5
  "eval_steps": 500,
6
+ "global_step": 130,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
135
  "step": 81
136
  },
137
  {
138
+ "epoch": 6.51,
139
+ "learning_rate": 6.453951129574644e-05,
140
+ "loss": 1.0309,
141
+ "step": 85
142
+ },
143
+ {
144
+ "epoch": 6.88,
145
+ "learning_rate": 5.234312799786921e-05,
146
+ "loss": 1.0264,
147
+ "step": 90
148
+ },
149
+ {
150
+ "epoch": 6.95,
151
+ "eval_loss": 1.168891191482544,
152
+ "eval_runtime": 208.2969,
153
+ "eval_samples_per_second": 2.909,
154
+ "eval_steps_per_second": 0.365,
155
+ "step": 91
156
+ },
157
+ {
158
+ "epoch": 7.24,
159
+ "learning_rate": 4.100445599768774e-05,
160
+ "loss": 1.0037,
161
+ "step": 95
162
+ },
163
+ {
164
+ "epoch": 7.61,
165
+ "learning_rate": 3.072756464904006e-05,
166
+ "loss": 1.0055,
167
+ "step": 100
168
+ },
169
+ {
170
+ "epoch": 7.97,
171
+ "learning_rate": 2.1697413758237784e-05,
172
+ "loss": 0.9977,
173
+ "step": 105
174
+ },
175
+ {
176
+ "epoch": 7.97,
177
+ "eval_loss": 1.1778818368911743,
178
+ "eval_runtime": 208.3439,
179
+ "eval_samples_per_second": 2.909,
180
+ "eval_steps_per_second": 0.365,
181
+ "step": 105
182
+ },
183
+ {
184
+ "epoch": 8.34,
185
+ "learning_rate": 1.4076524743778319e-05,
186
+ "loss": 0.9801,
187
+ "step": 110
188
+ },
189
+ {
190
+ "epoch": 8.7,
191
+ "learning_rate": 8.002055634117578e-06,
192
+ "loss": 0.9808,
193
+ "step": 115
194
+ },
195
+ {
196
+ "epoch": 9.0,
197
+ "eval_loss": 1.1819604635238647,
198
+ "eval_runtime": 208.427,
199
+ "eval_samples_per_second": 2.907,
200
+ "eval_steps_per_second": 0.365,
201
+ "step": 119
202
+ },
203
+ {
204
+ "epoch": 9.07,
205
+ "learning_rate": 3.5833325466437694e-06,
206
+ "loss": 0.9842,
207
+ "step": 120
208
+ },
209
+ {
210
+ "epoch": 9.43,
211
+ "learning_rate": 8.998820754091531e-07,
212
+ "loss": 0.9813,
213
+ "step": 125
214
+ },
215
+ {
216
+ "epoch": 9.8,
217
+ "learning_rate": 0.0,
218
+ "loss": 0.9791,
219
+ "step": 130
220
+ },
221
+ {
222
+ "epoch": 9.8,
223
+ "eval_loss": 1.1828593015670776,
224
+ "eval_runtime": 208.4043,
225
+ "eval_samples_per_second": 2.908,
226
+ "eval_steps_per_second": 0.365,
227
+ "step": 130
228
+ },
229
+ {
230
+ "epoch": 9.8,
231
+ "step": 130,
232
+ "total_flos": 384243276447744.0,
233
+ "train_loss": 0.37551442659818207,
234
+ "train_runtime": 8428.1057,
235
+ "train_samples_per_second": 2.07,
236
+ "train_steps_per_second": 0.015
237
  }
238
  ],
239
  "logging_steps": 5,
240
+ "max_steps": 130,
241
  "num_input_tokens_seen": 0,
242
+ "num_train_epochs": 10,
243
  "save_steps": 500,
244
+ "total_flos": 384243276447744.0,
245
  "train_batch_size": 2,
246
  "trial_name": null,
247
  "trial_params": null