Qin Liu committed on
Commit
63a93d0
1 Parent(s): a24dc21

Model save

Browse files
Files changed (4) hide show
  1. README.md +61 -0
  2. all_results.json +9 -0
  3. train_results.json +9 -0
  4. trainer_state.json +1008 -0
README.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: meta-llama/Meta-Llama-3-8B-Instruct
3
+ library_name: peft
4
+ license: llama3
5
+ tags:
6
+ - trl
7
+ - sft
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: llama3-sudo-3epochs-mask
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # llama3-sudo-3epochs-mask
18
+
19
+ This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) on an unknown dataset.
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 0.0002
39
+ - train_batch_size: 8
40
+ - eval_batch_size: 8
41
+ - seed: 42
42
+ - distributed_type: multi-GPU
43
+ - num_devices: 8
44
+ - total_train_batch_size: 64
45
+ - total_eval_batch_size: 64
46
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
47
+ - lr_scheduler_type: cosine
48
+ - lr_scheduler_warmup_ratio: 0.1
49
+ - num_epochs: 3
50
+
51
+ ### Training results
52
+
53
+
54
+
55
+ ### Framework versions
56
+
57
+ - PEFT 0.12.0
58
+ - Transformers 4.44.0
59
+ - Pytorch 2.1.2
60
+ - Datasets 2.20.0
61
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "total_flos": 1562393690767360.0,
4
+ "train_loss": 1.5581738730184898,
5
+ "train_runtime": 1339.3383,
6
+ "train_samples": 14642,
7
+ "train_samples_per_second": 32.797,
8
+ "train_steps_per_second": 0.513
9
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "total_flos": 1562393690767360.0,
4
+ "train_loss": 1.5581738730184898,
5
+ "train_runtime": 1339.3383,
6
+ "train_samples": 14642,
7
+ "train_samples_per_second": 32.797,
8
+ "train_steps_per_second": 0.513
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,1008 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 500,
6
+ "global_step": 687,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.004366812227074236,
13
+ "grad_norm": 1.8753604454650958,
14
+ "learning_rate": 2.898550724637681e-06,
15
+ "loss": 3.7085,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.021834061135371178,
20
+ "grad_norm": 1.801543636467953,
21
+ "learning_rate": 1.4492753623188407e-05,
22
+ "loss": 3.628,
23
+ "step": 5
24
+ },
25
+ {
26
+ "epoch": 0.043668122270742356,
27
+ "grad_norm": 2.2358499812546344,
28
+ "learning_rate": 2.8985507246376814e-05,
29
+ "loss": 3.5205,
30
+ "step": 10
31
+ },
32
+ {
33
+ "epoch": 0.06550218340611354,
34
+ "grad_norm": 3.2738653822216937,
35
+ "learning_rate": 4.347826086956522e-05,
36
+ "loss": 3.3516,
37
+ "step": 15
38
+ },
39
+ {
40
+ "epoch": 0.08733624454148471,
41
+ "grad_norm": 1.2942851623807274,
42
+ "learning_rate": 5.797101449275363e-05,
43
+ "loss": 2.7683,
44
+ "step": 20
45
+ },
46
+ {
47
+ "epoch": 0.1091703056768559,
48
+ "grad_norm": 0.6794530467895241,
49
+ "learning_rate": 7.246376811594203e-05,
50
+ "loss": 2.4019,
51
+ "step": 25
52
+ },
53
+ {
54
+ "epoch": 0.13100436681222707,
55
+ "grad_norm": 1.2145036683171,
56
+ "learning_rate": 8.695652173913044e-05,
57
+ "loss": 2.1421,
58
+ "step": 30
59
+ },
60
+ {
61
+ "epoch": 0.15283842794759825,
62
+ "grad_norm": 0.8541800089072681,
63
+ "learning_rate": 0.00010144927536231885,
64
+ "loss": 1.9309,
65
+ "step": 35
66
+ },
67
+ {
68
+ "epoch": 0.17467248908296942,
69
+ "grad_norm": 0.5114546320288847,
70
+ "learning_rate": 0.00011594202898550725,
71
+ "loss": 1.9047,
72
+ "step": 40
73
+ },
74
+ {
75
+ "epoch": 0.1965065502183406,
76
+ "grad_norm": 0.4569825196048275,
77
+ "learning_rate": 0.00013043478260869567,
78
+ "loss": 1.8231,
79
+ "step": 45
80
+ },
81
+ {
82
+ "epoch": 0.2183406113537118,
83
+ "grad_norm": 0.5192344724636687,
84
+ "learning_rate": 0.00014492753623188405,
85
+ "loss": 1.7305,
86
+ "step": 50
87
+ },
88
+ {
89
+ "epoch": 0.24017467248908297,
90
+ "grad_norm": 0.34840209414620005,
91
+ "learning_rate": 0.00015942028985507247,
92
+ "loss": 1.7933,
93
+ "step": 55
94
+ },
95
+ {
96
+ "epoch": 0.26200873362445415,
97
+ "grad_norm": 0.3678371866239178,
98
+ "learning_rate": 0.00017391304347826088,
99
+ "loss": 1.6143,
100
+ "step": 60
101
+ },
102
+ {
103
+ "epoch": 0.2838427947598253,
104
+ "grad_norm": 0.35400819586929,
105
+ "learning_rate": 0.00018840579710144927,
106
+ "loss": 1.7367,
107
+ "step": 65
108
+ },
109
+ {
110
+ "epoch": 0.3056768558951965,
111
+ "grad_norm": 0.38841557534962395,
112
+ "learning_rate": 0.00019999870791268066,
113
+ "loss": 1.6669,
114
+ "step": 70
115
+ },
116
+ {
117
+ "epoch": 0.32751091703056767,
118
+ "grad_norm": 0.4474417485361201,
119
+ "learning_rate": 0.00019995348836233516,
120
+ "loss": 1.601,
121
+ "step": 75
122
+ },
123
+ {
124
+ "epoch": 0.34934497816593885,
125
+ "grad_norm": 0.35698828488835516,
126
+ "learning_rate": 0.00019984369783193688,
127
+ "loss": 1.5357,
128
+ "step": 80
129
+ },
130
+ {
131
+ "epoch": 0.37117903930131,
132
+ "grad_norm": 0.3585527157477228,
133
+ "learning_rate": 0.00019966940724729603,
134
+ "loss": 1.6221,
135
+ "step": 85
136
+ },
137
+ {
138
+ "epoch": 0.3930131004366812,
139
+ "grad_norm": 0.34800578812888927,
140
+ "learning_rate": 0.0001994307292019204,
141
+ "loss": 1.6741,
142
+ "step": 90
143
+ },
144
+ {
145
+ "epoch": 0.4148471615720524,
146
+ "grad_norm": 0.35237447031832875,
147
+ "learning_rate": 0.0001991278178842786,
148
+ "loss": 1.5682,
149
+ "step": 95
150
+ },
151
+ {
152
+ "epoch": 0.4366812227074236,
153
+ "grad_norm": 0.39076567242886867,
154
+ "learning_rate": 0.00019876086897819284,
155
+ "loss": 1.5629,
156
+ "step": 100
157
+ },
158
+ {
159
+ "epoch": 0.4585152838427948,
160
+ "grad_norm": 0.34983112900974217,
161
+ "learning_rate": 0.00019833011953642525,
162
+ "loss": 1.6151,
163
+ "step": 105
164
+ },
165
+ {
166
+ "epoch": 0.48034934497816595,
167
+ "grad_norm": 0.34530341844436674,
168
+ "learning_rate": 0.00019783584782753918,
169
+ "loss": 1.5494,
170
+ "step": 110
171
+ },
172
+ {
173
+ "epoch": 0.5021834061135371,
174
+ "grad_norm": 0.38496791747153597,
175
+ "learning_rate": 0.00019727837315613504,
176
+ "loss": 1.5526,
177
+ "step": 115
178
+ },
179
+ {
180
+ "epoch": 0.5240174672489083,
181
+ "grad_norm": 0.35057371546794386,
182
+ "learning_rate": 0.00019665805565657603,
183
+ "loss": 1.5933,
184
+ "step": 120
185
+ },
186
+ {
187
+ "epoch": 0.5458515283842795,
188
+ "grad_norm": 0.35636722375213375,
189
+ "learning_rate": 0.00019597529606033782,
190
+ "loss": 1.5726,
191
+ "step": 125
192
+ },
193
+ {
194
+ "epoch": 0.5676855895196506,
195
+ "grad_norm": 0.375075170102756,
196
+ "learning_rate": 0.0001952305354371319,
197
+ "loss": 1.572,
198
+ "step": 130
199
+ },
200
+ {
201
+ "epoch": 0.5895196506550219,
202
+ "grad_norm": 0.38062791914729815,
203
+ "learning_rate": 0.00019442425490996988,
204
+ "loss": 1.5393,
205
+ "step": 135
206
+ },
207
+ {
208
+ "epoch": 0.611353711790393,
209
+ "grad_norm": 0.3465840806200646,
210
+ "learning_rate": 0.0001935569753443532,
211
+ "loss": 1.502,
212
+ "step": 140
213
+ },
214
+ {
215
+ "epoch": 0.6331877729257642,
216
+ "grad_norm": 0.3566623409862706,
217
+ "learning_rate": 0.00019262925701178866,
218
+ "loss": 1.6075,
219
+ "step": 145
220
+ },
221
+ {
222
+ "epoch": 0.6550218340611353,
223
+ "grad_norm": 0.3513669246328785,
224
+ "learning_rate": 0.00019164169922784716,
225
+ "loss": 1.5125,
226
+ "step": 150
227
+ },
228
+ {
229
+ "epoch": 0.6768558951965066,
230
+ "grad_norm": 0.34755655244144995,
231
+ "learning_rate": 0.00019059493996499986,
232
+ "loss": 1.6031,
233
+ "step": 155
234
+ },
235
+ {
236
+ "epoch": 0.6986899563318777,
237
+ "grad_norm": 0.35330261801172397,
238
+ "learning_rate": 0.00018948965544048128,
239
+ "loss": 1.5862,
240
+ "step": 160
241
+ },
242
+ {
243
+ "epoch": 0.7205240174672489,
244
+ "grad_norm": 0.3540773871103764,
245
+ "learning_rate": 0.00018832655967944607,
246
+ "loss": 1.6557,
247
+ "step": 165
248
+ },
249
+ {
250
+ "epoch": 0.74235807860262,
251
+ "grad_norm": 0.3385589855658203,
252
+ "learning_rate": 0.00018710640405370145,
253
+ "loss": 1.5771,
254
+ "step": 170
255
+ },
256
+ {
257
+ "epoch": 0.7641921397379913,
258
+ "grad_norm": 0.3948832031761469,
259
+ "learning_rate": 0.00018582997679631315,
260
+ "loss": 1.5896,
261
+ "step": 175
262
+ },
263
+ {
264
+ "epoch": 0.7860262008733624,
265
+ "grad_norm": 0.3414806859938428,
266
+ "learning_rate": 0.00018449810249239902,
267
+ "loss": 1.5278,
268
+ "step": 180
269
+ },
270
+ {
271
+ "epoch": 0.8078602620087336,
272
+ "grad_norm": 0.3306470223687427,
273
+ "learning_rate": 0.00018311164154643836,
274
+ "loss": 1.4916,
275
+ "step": 185
276
+ },
277
+ {
278
+ "epoch": 0.8296943231441049,
279
+ "grad_norm": 0.37143716375731756,
280
+ "learning_rate": 0.00018167148962644193,
281
+ "loss": 1.625,
282
+ "step": 190
283
+ },
284
+ {
285
+ "epoch": 0.851528384279476,
286
+ "grad_norm": 0.3610023951445998,
287
+ "learning_rate": 0.00018017857708534107,
288
+ "loss": 1.6859,
289
+ "step": 195
290
+ },
291
+ {
292
+ "epoch": 0.8733624454148472,
293
+ "grad_norm": 0.3969039590867808,
294
+ "learning_rate": 0.00017863386835997028,
295
+ "loss": 1.6366,
296
+ "step": 200
297
+ },
298
+ {
299
+ "epoch": 0.8951965065502183,
300
+ "grad_norm": 0.3466218548962297,
301
+ "learning_rate": 0.00017703836134803105,
302
+ "loss": 1.4699,
303
+ "step": 205
304
+ },
305
+ {
306
+ "epoch": 0.9170305676855895,
307
+ "grad_norm": 0.35771965160066704,
308
+ "learning_rate": 0.00017539308676343973,
309
+ "loss": 1.5723,
310
+ "step": 210
311
+ },
312
+ {
313
+ "epoch": 0.9388646288209607,
314
+ "grad_norm": 0.37691637209544226,
315
+ "learning_rate": 0.00017369910747047572,
316
+ "loss": 1.584,
317
+ "step": 215
318
+ },
319
+ {
320
+ "epoch": 0.9606986899563319,
321
+ "grad_norm": 0.40411027091446045,
322
+ "learning_rate": 0.00017195751779716027,
323
+ "loss": 1.6019,
324
+ "step": 220
325
+ },
326
+ {
327
+ "epoch": 0.982532751091703,
328
+ "grad_norm": 0.3742848841742168,
329
+ "learning_rate": 0.00017016944282830933,
330
+ "loss": 1.4947,
331
+ "step": 225
332
+ },
333
+ {
334
+ "epoch": 1.0043668122270741,
335
+ "grad_norm": 0.38407235492099207,
336
+ "learning_rate": 0.00016833603767871713,
337
+ "loss": 1.5812,
338
+ "step": 230
339
+ },
340
+ {
341
+ "epoch": 1.0262008733624455,
342
+ "grad_norm": 0.37035584561109924,
343
+ "learning_rate": 0.0001664584867469403,
344
+ "loss": 1.5113,
345
+ "step": 235
346
+ },
347
+ {
348
+ "epoch": 1.0480349344978166,
349
+ "grad_norm": 0.4171769928455508,
350
+ "learning_rate": 0.0001645380029501641,
351
+ "loss": 1.427,
352
+ "step": 240
353
+ },
354
+ {
355
+ "epoch": 1.0698689956331877,
356
+ "grad_norm": 0.4014807349448744,
357
+ "learning_rate": 0.00016257582694064558,
358
+ "loss": 1.488,
359
+ "step": 245
360
+ },
361
+ {
362
+ "epoch": 1.091703056768559,
363
+ "grad_norm": 0.41603472494317567,
364
+ "learning_rate": 0.00016057322630423935,
365
+ "loss": 1.4085,
366
+ "step": 250
367
+ },
368
+ {
369
+ "epoch": 1.1135371179039302,
370
+ "grad_norm": 0.41696638930397173,
371
+ "learning_rate": 0.00015853149474152423,
372
+ "loss": 1.417,
373
+ "step": 255
374
+ },
375
+ {
376
+ "epoch": 1.1353711790393013,
377
+ "grad_norm": 0.42289781015455985,
378
+ "learning_rate": 0.0001564519512320593,
379
+ "loss": 1.4374,
380
+ "step": 260
381
+ },
382
+ {
383
+ "epoch": 1.1572052401746724,
384
+ "grad_norm": 0.4136181415806817,
385
+ "learning_rate": 0.00015433593918230955,
386
+ "loss": 1.5384,
387
+ "step": 265
388
+ },
389
+ {
390
+ "epoch": 1.1790393013100438,
391
+ "grad_norm": 0.4304951924614819,
392
+ "learning_rate": 0.00015218482555779165,
393
+ "loss": 1.4184,
394
+ "step": 270
395
+ },
396
+ {
397
+ "epoch": 1.2008733624454149,
398
+ "grad_norm": 0.4567200712003195,
399
+ "learning_rate": 0.00015000000000000001,
400
+ "loss": 1.5038,
401
+ "step": 275
402
+ },
403
+ {
404
+ "epoch": 1.222707423580786,
405
+ "grad_norm": 0.418159543142196,
406
+ "learning_rate": 0.00014778287392868417,
407
+ "loss": 1.4477,
408
+ "step": 280
409
+ },
410
+ {
411
+ "epoch": 1.244541484716157,
412
+ "grad_norm": 0.43554491426674474,
413
+ "learning_rate": 0.0001455348796300571,
414
+ "loss": 1.362,
415
+ "step": 285
416
+ },
417
+ {
418
+ "epoch": 1.2663755458515285,
419
+ "grad_norm": 0.4303389630106061,
420
+ "learning_rate": 0.0001432574693315238,
421
+ "loss": 1.5283,
422
+ "step": 290
423
+ },
424
+ {
425
+ "epoch": 1.2882096069868996,
426
+ "grad_norm": 0.4434624782109262,
427
+ "learning_rate": 0.0001409521142635272,
428
+ "loss": 1.5192,
429
+ "step": 295
430
+ },
431
+ {
432
+ "epoch": 1.3100436681222707,
433
+ "grad_norm": 0.4160064760695591,
434
+ "learning_rate": 0.0001386203037091183,
435
+ "loss": 1.5222,
436
+ "step": 300
437
+ },
438
+ {
439
+ "epoch": 1.3318777292576418,
440
+ "grad_norm": 0.44897734162151265,
441
+ "learning_rate": 0.00013626354404186404,
442
+ "loss": 1.5612,
443
+ "step": 305
444
+ },
445
+ {
446
+ "epoch": 1.3537117903930131,
447
+ "grad_norm": 0.4378993469096492,
448
+ "learning_rate": 0.00013388335775271467,
449
+ "loss": 1.4442,
450
+ "step": 310
451
+ },
452
+ {
453
+ "epoch": 1.3755458515283843,
454
+ "grad_norm": 0.4403089794683692,
455
+ "learning_rate": 0.0001314812824664585,
456
+ "loss": 1.5151,
457
+ "step": 315
458
+ },
459
+ {
460
+ "epoch": 1.3973799126637554,
461
+ "grad_norm": 0.44038010399377847,
462
+ "learning_rate": 0.000129058869948401,
463
+ "loss": 1.5635,
464
+ "step": 320
465
+ },
466
+ {
467
+ "epoch": 1.4192139737991267,
468
+ "grad_norm": 0.4640302354831066,
469
+ "learning_rate": 0.00012661768510190816,
470
+ "loss": 1.4904,
471
+ "step": 325
472
+ },
473
+ {
474
+ "epoch": 1.4410480349344978,
475
+ "grad_norm": 0.45432337967027536,
476
+ "learning_rate": 0.00012415930495746302,
477
+ "loss": 1.5045,
478
+ "step": 330
479
+ },
480
+ {
481
+ "epoch": 1.462882096069869,
482
+ "grad_norm": 0.451657842641418,
483
+ "learning_rate": 0.00012168531765388755,
484
+ "loss": 1.392,
485
+ "step": 335
486
+ },
487
+ {
488
+ "epoch": 1.48471615720524,
489
+ "grad_norm": 0.43285118470181244,
490
+ "learning_rate": 0.00011919732141238898,
491
+ "loss": 1.461,
492
+ "step": 340
493
+ },
494
+ {
495
+ "epoch": 1.5065502183406112,
496
+ "grad_norm": 0.4819434243733131,
497
+ "learning_rate": 0.00011669692350409223,
498
+ "loss": 1.6045,
499
+ "step": 345
500
+ },
501
+ {
502
+ "epoch": 1.5283842794759825,
503
+ "grad_norm": 0.440140827625262,
504
+ "learning_rate": 0.00011418573921172635,
505
+ "loss": 1.5039,
506
+ "step": 350
507
+ },
508
+ {
509
+ "epoch": 1.5502183406113537,
510
+ "grad_norm": 0.4475130341158297,
511
+ "learning_rate": 0.00011166539078613525,
512
+ "loss": 1.5163,
513
+ "step": 355
514
+ },
515
+ {
516
+ "epoch": 1.572052401746725,
517
+ "grad_norm": 0.4563951417730544,
518
+ "learning_rate": 0.00010913750639828711,
519
+ "loss": 1.4896,
520
+ "step": 360
521
+ },
522
+ {
523
+ "epoch": 1.5938864628820961,
524
+ "grad_norm": 0.4570029980279654,
525
+ "learning_rate": 0.0001066037190874591,
526
+ "loss": 1.4318,
527
+ "step": 365
528
+ },
529
+ {
530
+ "epoch": 1.6157205240174672,
531
+ "grad_norm": 0.45220265285594413,
532
+ "learning_rate": 0.00010406566570627713,
533
+ "loss": 1.484,
534
+ "step": 370
535
+ },
536
+ {
537
+ "epoch": 1.6375545851528384,
538
+ "grad_norm": 0.43217867151367745,
539
+ "learning_rate": 0.0001015249858632926,
540
+ "loss": 1.3569,
541
+ "step": 375
542
+ },
543
+ {
544
+ "epoch": 1.6593886462882095,
545
+ "grad_norm": 0.45524805316851796,
546
+ "learning_rate": 9.898332086377805e-05,
547
+ "loss": 1.5309,
548
+ "step": 380
549
+ },
550
+ {
551
+ "epoch": 1.6812227074235808,
552
+ "grad_norm": 0.45864967058418454,
553
+ "learning_rate": 9.644231264942724e-05,
554
+ "loss": 1.5568,
555
+ "step": 385
556
+ },
557
+ {
558
+ "epoch": 1.703056768558952,
559
+ "grad_norm": 0.46156320050595717,
560
+ "learning_rate": 9.390360273764411e-05,
561
+ "loss": 1.5541,
562
+ "step": 390
563
+ },
564
+ {
565
+ "epoch": 1.7248908296943233,
566
+ "grad_norm": 0.4424924245408702,
567
+ "learning_rate": 9.136883116110542e-05,
568
+ "loss": 1.4779,
569
+ "step": 395
570
+ },
571
+ {
572
+ "epoch": 1.7467248908296944,
573
+ "grad_norm": 0.4631844693338652,
574
+ "learning_rate": 8.88396354082829e-05,
575
+ "loss": 1.5041,
576
+ "step": 400
577
+ },
578
+ {
579
+ "epoch": 1.7685589519650655,
580
+ "grad_norm": 0.49717848531939474,
581
+ "learning_rate": 8.6317649365609e-05,
582
+ "loss": 1.4976,
583
+ "step": 405
584
+ },
585
+ {
586
+ "epoch": 1.7903930131004366,
587
+ "grad_norm": 0.44110942790393187,
588
+ "learning_rate": 8.380450226196925e-05,
589
+ "loss": 1.3881,
590
+ "step": 410
591
+ },
592
+ {
593
+ "epoch": 1.8122270742358078,
594
+ "grad_norm": 0.5089589581007182,
595
+ "learning_rate": 8.130181761620392e-05,
596
+ "loss": 1.4779,
597
+ "step": 415
598
+ },
599
+ {
600
+ "epoch": 1.8340611353711789,
601
+ "grad_norm": 0.4698355464759574,
602
+ "learning_rate": 7.881121218829787e-05,
603
+ "loss": 1.4198,
604
+ "step": 420
605
+ },
606
+ {
607
+ "epoch": 1.8558951965065502,
608
+ "grad_norm": 0.4659031789449208,
609
+ "learning_rate": 7.63342949349373e-05,
610
+ "loss": 1.4861,
611
+ "step": 425
612
+ },
613
+ {
614
+ "epoch": 1.8777292576419216,
615
+ "grad_norm": 0.44894777329606517,
616
+ "learning_rate": 7.387266597010704e-05,
617
+ "loss": 1.503,
618
+ "step": 430
619
+ },
620
+ {
621
+ "epoch": 1.8995633187772927,
622
+ "grad_norm": 0.4475374854045599,
623
+ "learning_rate": 7.142791553140045e-05,
624
+ "loss": 1.5077,
625
+ "step": 435
626
+ },
627
+ {
628
+ "epoch": 1.9213973799126638,
629
+ "grad_norm": 0.4750409585386964,
630
+ "learning_rate": 6.900162295270968e-05,
631
+ "loss": 1.515,
632
+ "step": 440
633
+ },
634
+ {
635
+ "epoch": 1.943231441048035,
636
+ "grad_norm": 0.4786776337696274,
637
+ "learning_rate": 6.659535564395982e-05,
638
+ "loss": 1.5167,
639
+ "step": 445
640
+ },
641
+ {
642
+ "epoch": 1.965065502183406,
643
+ "grad_norm": 0.5012494244267932,
644
+ "learning_rate": 6.421066807854584e-05,
645
+ "loss": 1.5364,
646
+ "step": 450
647
+ },
648
+ {
649
+ "epoch": 1.9868995633187772,
650
+ "grad_norm": 0.445678005467061,
651
+ "learning_rate": 6.184910078912687e-05,
652
+ "loss": 1.4215,
653
+ "step": 455
654
+ },
655
+ {
656
+ "epoch": 2.0087336244541483,
657
+ "grad_norm": 0.4871967245962684,
658
+ "learning_rate": 5.9512179372426325e-05,
659
+ "loss": 1.4481,
660
+ "step": 460
661
+ },
662
+ {
663
+ "epoch": 2.03056768558952,
664
+ "grad_norm": 0.48019280605703346,
665
+ "learning_rate": 5.720141350368072e-05,
666
+ "loss": 1.45,
667
+ "step": 465
668
+ },
669
+ {
670
+ "epoch": 2.052401746724891,
671
+ "grad_norm": 0.5224256236632303,
672
+ "learning_rate": 5.4918295961373923e-05,
673
+ "loss": 1.4061,
674
+ "step": 470
675
+ },
676
+ {
677
+ "epoch": 2.074235807860262,
678
+ "grad_norm": 0.5108294729537572,
679
+ "learning_rate": 5.266430166288705e-05,
680
+ "loss": 1.3943,
681
+ "step": 475
682
+ },
683
+ {
684
+ "epoch": 2.096069868995633,
685
+ "grad_norm": 0.5371496292909286,
686
+ "learning_rate": 5.044088671168644e-05,
687
+ "loss": 1.3578,
688
+ "step": 480
689
+ },
690
+ {
691
+ "epoch": 2.1179039301310043,
692
+ "grad_norm": 0.5481243225476391,
693
+ "learning_rate": 4.824948745666621e-05,
694
+ "loss": 1.3686,
695
+ "step": 485
696
+ },
697
+ {
698
+ "epoch": 2.1397379912663754,
699
+ "grad_norm": 0.5490865552703573,
700
+ "learning_rate": 4.6091519564251793e-05,
701
+ "loss": 1.3655,
702
+ "step": 490
703
+ },
704
+ {
705
+ "epoch": 2.1615720524017465,
706
+ "grad_norm": 0.5341608991319753,
707
+ "learning_rate": 4.3968377103865024e-05,
708
+ "loss": 1.3681,
709
+ "step": 495
710
+ },
711
+ {
712
+ "epoch": 2.183406113537118,
713
+ "grad_norm": 0.5539995610447883,
714
+ "learning_rate": 4.1881431647341054e-05,
715
+ "loss": 1.3703,
716
+ "step": 500
717
+ },
718
+ {
719
+ "epoch": 2.2052401746724892,
720
+ "grad_norm": 0.5617916414503629,
721
+ "learning_rate": 3.9832031382878766e-05,
722
+ "loss": 1.3506,
723
+ "step": 505
724
+ },
725
+ {
726
+ "epoch": 2.2270742358078603,
727
+ "grad_norm": 0.5457137690561883,
728
+ "learning_rate": 3.7821500244097274e-05,
729
+ "loss": 1.33,
730
+ "step": 510
731
+ },
732
+ {
733
+ "epoch": 2.2489082969432315,
734
+ "grad_norm": 0.5996161081223403,
735
+ "learning_rate": 3.585113705476143e-05,
736
+ "loss": 1.389,
737
+ "step": 515
738
+ },
739
+ {
740
+ "epoch": 2.2707423580786026,
741
+ "grad_norm": 0.5451761149567781,
742
+ "learning_rate": 3.392221468972805e-05,
743
+ "loss": 1.3908,
744
+ "step": 520
745
+ },
746
+ {
747
+ "epoch": 2.2925764192139737,
748
+ "grad_norm": 0.6118020705146557,
749
+ "learning_rate": 3.203597925265598e-05,
750
+ "loss": 1.3559,
751
+ "step": 525
752
+ },
753
+ {
754
+ "epoch": 2.314410480349345,
755
+ "grad_norm": 0.5671066923036526,
756
+ "learning_rate": 3.0193649271010095e-05,
757
+ "loss": 1.3478,
758
+ "step": 530
759
+ },
760
+ {
761
+ "epoch": 2.3362445414847164,
762
+ "grad_norm": 0.6001118016692729,
763
+ "learning_rate": 2.8396414908880098e-05,
764
+ "loss": 1.3509,
765
+ "step": 535
766
+ },
767
+ {
768
+ "epoch": 2.3580786026200875,
769
+ "grad_norm": 0.5894837707963337,
770
+ "learning_rate": 2.6645437198122502e-05,
771
+ "loss": 1.4214,
772
+ "step": 540
773
+ },
774
+ {
775
+ "epoch": 2.3799126637554586,
776
+ "grad_norm": 0.5845209440156467,
777
+ "learning_rate": 2.4941847288321797e-05,
778
+ "loss": 1.3788,
779
+ "step": 545
780
+ },
781
+ {
782
+ "epoch": 2.4017467248908297,
783
+ "grad_norm": 0.5895995554814336,
784
+ "learning_rate": 2.328674571605637e-05,
785
+ "loss": 1.391,
786
+ "step": 550
787
+ },
788
+ {
789
+ "epoch": 2.423580786026201,
790
+ "grad_norm": 0.6008227175127105,
791
+ "learning_rate": 2.1681201693940668e-05,
792
+ "loss": 1.4373,
793
+ "step": 555
794
+ },
795
+ {
796
+ "epoch": 2.445414847161572,
797
+ "grad_norm": 0.6000811157919661,
798
+ "learning_rate": 2.0126252419902614e-05,
799
+ "loss": 1.4406,
800
+ "step": 560
801
+ },
802
+ {
803
+ "epoch": 2.467248908296943,
804
+ "grad_norm": 0.5911207625229586,
805
+ "learning_rate": 1.8622902407143394e-05,
806
+ "loss": 1.5141,
807
+ "step": 565
808
+ },
809
+ {
810
+ "epoch": 2.489082969432314,
811
+ "grad_norm": 0.615377574511787,
812
+ "learning_rate": 1.7172122835211337e-05,
813
+ "loss": 1.3896,
814
+ "step": 570
815
+ },
816
+ {
817
+ "epoch": 2.5109170305676853,
818
+ "grad_norm": 0.5857047240018323,
819
+ "learning_rate": 1.577485092261012e-05,
820
+ "loss": 1.4468,
821
+ "step": 575
822
+ },
823
+ {
824
+ "epoch": 2.532751091703057,
825
+ "grad_norm": 0.6061359574672782,
826
+ "learning_rate": 1.4431989321345974e-05,
827
+ "loss": 1.4299,
828
+ "step": 580
829
+ },
830
+ {
831
+ "epoch": 2.554585152838428,
832
+ "grad_norm": 0.6360146523250017,
833
+ "learning_rate": 1.3144405533805138e-05,
834
+ "loss": 1.4552,
835
+ "step": 585
836
+ },
837
+ {
838
+ "epoch": 2.576419213973799,
839
+ "grad_norm": 0.6147809358329509,
840
+ "learning_rate": 1.191293135233844e-05,
841
+ "loss": 1.3359,
842
+ "step": 590
843
+ },
844
+ {
845
+ "epoch": 2.5982532751091703,
846
+ "grad_norm": 0.6021229536534441,
847
+ "learning_rate": 1.0738362321914997e-05,
848
+ "loss": 1.3927,
849
+ "step": 595
850
+ },
851
+ {
852
+ "epoch": 2.6200873362445414,
853
+ "grad_norm": 0.5961109142620634,
854
+ "learning_rate": 9.62145722619182e-06,
855
+ "loss": 1.4896,
856
+ "step": 600
857
+ },
858
+ {
859
+ "epoch": 2.641921397379913,
860
+ "grad_norm": 0.578602732768201,
861
+ "learning_rate": 8.562937597331899e-06,
862
+ "loss": 1.4565,
863
+ "step": 605
864
+ },
865
+ {
866
+ "epoch": 2.6637554585152836,
867
+ "grad_norm": 0.6458615704409548,
868
+ "learning_rate": 7.563487249887024e-06,
869
+ "loss": 1.4511,
870
+ "step": 610
871
+ },
872
+ {
873
+ "epoch": 2.685589519650655,
874
+ "grad_norm": 0.6022868302443363,
875
+ "learning_rate": 6.623751839046455e-06,
876
+ "loss": 1.3836,
877
+ "step": 615
878
+ },
879
+ {
880
+ "epoch": 2.7074235807860263,
881
+ "grad_norm": 0.5988174075228108,
882
+ "learning_rate": 5.744338443537134e-06,
883
+ "loss": 1.4891,
884
+ "step": 620
885
+ },
886
+ {
887
+ "epoch": 2.7292576419213974,
888
+ "grad_norm": 0.6026020303199385,
889
+ "learning_rate": 4.92581517344457e-06,
890
+ "loss": 1.3409,
891
+ "step": 625
892
+ },
893
+ {
894
+ "epoch": 2.7510917030567685,
895
+ "grad_norm": 0.5919990549417811,
896
+ "learning_rate": 4.168710803207865e-06,
897
+ "loss": 1.4157,
898
+ "step": 630
899
+ },
900
+ {
901
+ "epoch": 2.7729257641921397,
902
+ "grad_norm": 0.6038824964947008,
903
+ "learning_rate": 3.473514430026026e-06,
904
+ "loss": 1.4138,
905
+ "step": 635
906
+ },
907
+ {
908
+ "epoch": 2.7947598253275108,
909
+ "grad_norm": 0.6135553904485512,
910
+ "learning_rate": 2.840675157896111e-06,
911
+ "loss": 1.411,
912
+ "step": 640
913
+ },
914
+ {
915
+ "epoch": 2.816593886462882,
916
+ "grad_norm": 0.6332119474956278,
917
+ "learning_rate": 2.2706018074875045e-06,
918
+ "loss": 1.2871,
919
+ "step": 645
920
+ },
921
+ {
922
+ "epoch": 2.8384279475982535,
923
+ "grad_norm": 0.607982963030175,
924
+ "learning_rate": 1.7636626520395105e-06,
925
+ "loss": 1.4341,
926
+ "step": 650
927
+ },
928
+ {
929
+ "epoch": 2.8602620087336246,
930
+ "grad_norm": 0.6326939787843192,
931
+ "learning_rate": 1.3201851794530373e-06,
932
+ "loss": 1.3992,
933
+ "step": 655
934
+ },
935
+ {
936
+ "epoch": 2.8820960698689957,
937
+ "grad_norm": 0.6624316672369768,
938
+ "learning_rate": 9.404558807301067e-07,
939
+ "loss": 1.424,
940
+ "step": 660
941
+ },
942
+ {
943
+ "epoch": 2.903930131004367,
944
+ "grad_norm": 0.5890404802190057,
945
+ "learning_rate": 6.247200648976991e-07,
946
+ "loss": 1.3734,
947
+ "step": 665
948
+ },
949
+ {
950
+ "epoch": 2.925764192139738,
951
+ "grad_norm": 0.588934601286785,
952
+ "learning_rate": 3.7318170053559644e-07,
953
+ "loss": 1.413,
954
+ "step": 670
955
+ },
956
+ {
957
+ "epoch": 2.947598253275109,
958
+ "grad_norm": 0.6171879119635909,
959
+ "learning_rate": 1.8600328401061629e-07,
960
+ "loss": 1.3424,
961
+ "step": 675
962
+ },
963
+ {
964
+ "epoch": 2.96943231441048,
965
+ "grad_norm": 0.6328278329175181,
966
+ "learning_rate": 6.33057345022281e-08,
967
+ "loss": 1.3699,
968
+ "step": 680
969
+ },
970
+ {
971
+ "epoch": 2.9912663755458517,
972
+ "grad_norm": 0.609229235169678,
973
+ "learning_rate": 5.1683158875937e-09,
974
+ "loss": 1.3262,
975
+ "step": 685
976
+ },
977
+ {
978
+ "epoch": 3.0,
979
+ "step": 687,
980
+ "total_flos": 1562393690767360.0,
981
+ "train_loss": 1.5581738730184898,
982
+ "train_runtime": 1339.3383,
983
+ "train_samples_per_second": 32.797,
984
+ "train_steps_per_second": 0.513
985
+ }
986
+ ],
987
+ "logging_steps": 5,
988
+ "max_steps": 687,
989
+ "num_input_tokens_seen": 0,
990
+ "num_train_epochs": 3,
991
+ "save_steps": 100,
992
+ "stateful_callbacks": {
993
+ "TrainerControl": {
994
+ "args": {
995
+ "should_epoch_stop": false,
996
+ "should_evaluate": false,
997
+ "should_log": false,
998
+ "should_save": true,
999
+ "should_training_stop": true
1000
+ },
1001
+ "attributes": {}
1002
+ }
1003
+ },
1004
+ "total_flos": 1562393690767360.0,
1005
+ "train_batch_size": 8,
1006
+ "trial_name": null,
1007
+ "trial_params": null
1008
+ }