nrshoudi commited on
Commit
9786f37
1 Parent(s): efb2694

End of training

Browse files
Files changed (3) hide show
  1. README.md +11 -11
  2. adapter_1/adapter_model.safetensors +1 -1
  3. trainer_state.json +693 -693
README.md CHANGED
@@ -16,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  This model is a fine-tuned version of [openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
- - Loss: 0.2331
20
 
21
  ## Model description
22
 
@@ -49,16 +49,16 @@ The following hyperparameters were used during training:
49
 
50
  | Training Loss | Epoch | Step | Validation Loss |
51
  |:-------------:|:-----:|:----:|:---------------:|
52
- | 0.0608 | 1.0 | 546 | 0.2074 |
53
- | 0.0508 | 2.0 | 1092 | 0.2211 |
54
- | 0.0287 | 3.0 | 1638 | 0.1681 |
55
- | 0.0148 | 4.0 | 2184 | 0.1938 |
56
- | 0.0263 | 5.0 | 2730 | 0.1846 |
57
- | 0.0168 | 6.0 | 3276 | 0.1899 |
58
- | 0.0086 | 7.0 | 3822 | 0.1975 |
59
- | 0.0102 | 8.0 | 4368 | 0.2170 |
60
- | 0.0023 | 9.0 | 4914 | 0.2294 |
61
- | 0.0024 | 10.0 | 5460 | 0.2331 |
62
 
63
 
64
  ### Framework versions
 
16
 
17
  This model is a fine-tuned version of [openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
+ - Loss: 0.2304
20
 
21
  ## Model description
22
 
 
49
 
50
  | Training Loss | Epoch | Step | Validation Loss |
51
  |:-------------:|:-----:|:----:|:---------------:|
52
+ | 0.0416 | 1.0 | 546 | 0.2269 |
53
+ | 0.0243 | 2.0 | 1092 | 0.2054 |
54
+ | 0.0262 | 3.0 | 1638 | 0.1866 |
55
+ | 0.009 | 4.0 | 2184 | 0.2000 |
56
+ | 0.0196 | 5.0 | 2730 | 0.1928 |
57
+ | 0.0071 | 6.0 | 3276 | 0.2099 |
58
+ | 0.0054 | 7.0 | 3822 | 0.2070 |
59
+ | 0.0066 | 8.0 | 4368 | 0.2189 |
60
+ | 0.0006 | 9.0 | 4914 | 0.2325 |
61
+ | 0.001 | 10.0 | 5460 | 0.2304 |
62
 
63
 
64
  ### Framework versions
adapter_1/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:08026702b7a1be7021b6a8e05de062904a5b406498ebe8c9213965d21f3676a2
3
  size 62969640
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9cf2ca9e94a0f5343ee63a5316cc4fe761e947a9d6e7ab6fb279ed9b99b3f4f5
3
  size 62969640
trainer_state.json CHANGED
@@ -10,1617 +10,1617 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.05,
13
- "grad_norm": 1.0899150371551514,
14
  "learning_rate": 0.0005,
15
- "loss": 2.9446,
16
  "step": 25
17
  },
18
  {
19
  "epoch": 0.09,
20
- "grad_norm": 0.7281716465950012,
21
  "learning_rate": 0.001,
22
- "loss": 0.7356,
23
  "step": 50
24
  },
25
  {
26
  "epoch": 0.14,
27
- "grad_norm": 0.8861620426177979,
28
- "learning_rate": 0.0009963031423290203,
29
- "loss": 0.5765,
30
  "step": 75
31
  },
32
  {
33
  "epoch": 0.18,
34
- "grad_norm": 2.8832926750183105,
35
- "learning_rate": 0.0009916820702402958,
36
- "loss": 0.3944,
37
  "step": 100
38
  },
39
  {
40
  "epoch": 0.23,
41
- "grad_norm": 1.077749252319336,
42
- "learning_rate": 0.000987060998151571,
43
- "loss": 0.1654,
44
  "step": 125
45
  },
46
  {
47
  "epoch": 0.27,
48
- "grad_norm": 0.5631522536277771,
49
- "learning_rate": 0.0009824399260628465,
50
- "loss": 0.1044,
51
  "step": 150
52
  },
53
  {
54
  "epoch": 0.32,
55
- "grad_norm": 0.25788772106170654,
56
- "learning_rate": 0.000977818853974122,
57
- "loss": 0.0853,
58
  "step": 175
59
  },
60
  {
61
  "epoch": 0.37,
62
- "grad_norm": 0.7229248285293579,
63
- "learning_rate": 0.0009731977818853974,
64
- "loss": 0.088,
65
  "step": 200
66
  },
67
  {
68
  "epoch": 0.41,
69
- "grad_norm": 0.39239415526390076,
70
- "learning_rate": 0.0009685767097966729,
71
- "loss": 0.0781,
72
  "step": 225
73
  },
74
  {
75
  "epoch": 0.46,
76
- "grad_norm": 0.23850314319133759,
77
- "learning_rate": 0.0009639556377079483,
78
- "loss": 0.0729,
79
  "step": 250
80
  },
81
  {
82
  "epoch": 0.5,
83
- "grad_norm": 0.17475274205207825,
84
- "learning_rate": 0.0009593345656192237,
85
- "loss": 0.0671,
86
  "step": 275
87
  },
88
  {
89
  "epoch": 0.55,
90
- "grad_norm": 0.7980681657791138,
91
- "learning_rate": 0.0009547134935304991,
92
- "loss": 0.0836,
93
  "step": 300
94
  },
95
  {
96
  "epoch": 0.6,
97
- "grad_norm": 0.20785190165042877,
98
- "learning_rate": 0.0009500924214417745,
99
- "loss": 0.0717,
100
  "step": 325
101
  },
102
  {
103
  "epoch": 0.64,
104
- "grad_norm": 0.22275149822235107,
105
- "learning_rate": 0.0009454713493530499,
106
- "loss": 0.0469,
107
  "step": 350
108
  },
109
  {
110
  "epoch": 0.69,
111
- "grad_norm": 0.39296770095825195,
112
- "learning_rate": 0.0009408502772643253,
113
- "loss": 0.0815,
114
  "step": 375
115
  },
116
  {
117
  "epoch": 0.73,
118
- "grad_norm": 0.3089096248149872,
119
- "learning_rate": 0.0009362292051756007,
120
- "loss": 0.0668,
121
  "step": 400
122
  },
123
  {
124
  "epoch": 0.78,
125
- "grad_norm": 0.343288779258728,
126
- "learning_rate": 0.0009316081330868762,
127
- "loss": 0.0481,
128
  "step": 425
129
  },
130
  {
131
  "epoch": 0.82,
132
- "grad_norm": 0.1959904432296753,
133
- "learning_rate": 0.0009269870609981515,
134
- "loss": 0.0554,
135
  "step": 450
136
  },
137
  {
138
  "epoch": 0.87,
139
- "grad_norm": 0.5835228562355042,
140
- "learning_rate": 0.000922365988909427,
141
- "loss": 0.079,
142
  "step": 475
143
  },
144
  {
145
  "epoch": 0.92,
146
- "grad_norm": 0.1368131786584854,
147
- "learning_rate": 0.0009177449168207024,
148
- "loss": 0.0575,
149
  "step": 500
150
  },
151
  {
152
  "epoch": 0.96,
153
- "grad_norm": 0.24407006800174713,
154
- "learning_rate": 0.0009131238447319779,
155
- "loss": 0.0608,
156
  "step": 525
157
  },
158
  {
159
  "epoch": 1.0,
160
- "eval_loss": 0.2074204683303833,
161
- "eval_runtime": 457.9814,
162
- "eval_samples_per_second": 1.775,
163
- "eval_steps_per_second": 0.297,
164
  "step": 546
165
  },
166
  {
167
  "epoch": 1.01,
168
- "grad_norm": 0.12714482843875885,
169
- "learning_rate": 0.0009085027726432532,
170
- "loss": 0.0468,
171
  "step": 550
172
  },
173
  {
174
  "epoch": 1.05,
175
- "grad_norm": 0.3958694636821747,
176
- "learning_rate": 0.0009038817005545287,
177
- "loss": 0.0486,
178
  "step": 575
179
  },
180
  {
181
  "epoch": 1.1,
182
- "grad_norm": 0.0621350072324276,
183
- "learning_rate": 0.0008992606284658041,
184
- "loss": 0.0531,
185
  "step": 600
186
  },
187
  {
188
  "epoch": 1.14,
189
- "grad_norm": 0.019190674647688866,
190
- "learning_rate": 0.0008946395563770795,
191
- "loss": 0.0404,
192
  "step": 625
193
  },
194
  {
195
  "epoch": 1.19,
196
- "grad_norm": 0.10872188955545425,
197
- "learning_rate": 0.000890018484288355,
198
- "loss": 0.0516,
199
  "step": 650
200
  },
201
  {
202
  "epoch": 1.24,
203
- "grad_norm": 0.21620219945907593,
204
- "learning_rate": 0.0008853974121996303,
205
- "loss": 0.0352,
206
  "step": 675
207
  },
208
  {
209
  "epoch": 1.28,
210
- "grad_norm": 0.09226030856370926,
211
- "learning_rate": 0.0008807763401109058,
212
- "loss": 0.046,
213
  "step": 700
214
  },
215
  {
216
  "epoch": 1.33,
217
- "grad_norm": 0.28338831663131714,
218
- "learning_rate": 0.0008761552680221812,
219
- "loss": 0.0541,
220
  "step": 725
221
  },
222
  {
223
  "epoch": 1.37,
224
- "grad_norm": 0.29886457324028015,
225
- "learning_rate": 0.0008715341959334566,
226
- "loss": 0.0469,
227
  "step": 750
228
  },
229
  {
230
  "epoch": 1.42,
231
- "grad_norm": 0.18679702281951904,
232
- "learning_rate": 0.000866913123844732,
233
- "loss": 0.0479,
234
  "step": 775
235
  },
236
  {
237
  "epoch": 1.47,
238
- "grad_norm": 0.37105533480644226,
239
- "learning_rate": 0.0008622920517560074,
240
- "loss": 0.0482,
241
  "step": 800
242
  },
243
  {
244
  "epoch": 1.51,
245
- "grad_norm": 0.2705371081829071,
246
- "learning_rate": 0.0008576709796672828,
247
- "loss": 0.0339,
248
  "step": 825
249
  },
250
  {
251
  "epoch": 1.56,
252
- "grad_norm": 0.23306304216384888,
253
- "learning_rate": 0.0008530499075785582,
254
- "loss": 0.0333,
255
  "step": 850
256
  },
257
  {
258
  "epoch": 1.6,
259
- "grad_norm": 0.32292288541793823,
260
- "learning_rate": 0.0008484288354898336,
261
- "loss": 0.0631,
262
  "step": 875
263
  },
264
  {
265
  "epoch": 1.65,
266
- "grad_norm": 1.583330512046814,
267
- "learning_rate": 0.0008438077634011091,
268
- "loss": 0.0554,
269
  "step": 900
270
  },
271
  {
272
  "epoch": 1.69,
273
- "grad_norm": 0.2681499421596527,
274
- "learning_rate": 0.0008391866913123844,
275
- "loss": 0.0752,
276
  "step": 925
277
  },
278
  {
279
  "epoch": 1.74,
280
- "grad_norm": 0.49241504073143005,
281
- "learning_rate": 0.0008345656192236599,
282
- "loss": 0.0393,
283
  "step": 950
284
  },
285
  {
286
  "epoch": 1.79,
287
- "grad_norm": 0.3337648808956146,
288
- "learning_rate": 0.0008299445471349352,
289
- "loss": 0.0774,
290
  "step": 975
291
  },
292
  {
293
  "epoch": 1.83,
294
- "grad_norm": 0.3321288228034973,
295
- "learning_rate": 0.0008253234750462108,
296
- "loss": 0.0513,
297
  "step": 1000
298
  },
299
  {
300
  "epoch": 1.88,
301
- "grad_norm": 0.03459502011537552,
302
- "learning_rate": 0.0008207024029574861,
303
- "loss": 0.0389,
304
  "step": 1025
305
  },
306
  {
307
  "epoch": 1.92,
308
- "grad_norm": 0.3316815197467804,
309
- "learning_rate": 0.0008160813308687616,
310
- "loss": 0.0556,
311
  "step": 1050
312
  },
313
  {
314
  "epoch": 1.97,
315
- "grad_norm": 0.3402951657772064,
316
- "learning_rate": 0.000811460258780037,
317
- "loss": 0.0508,
318
  "step": 1075
319
  },
320
  {
321
  "epoch": 2.0,
322
- "eval_loss": 0.22112932801246643,
323
- "eval_runtime": 457.2021,
324
- "eval_samples_per_second": 1.778,
325
- "eval_steps_per_second": 0.297,
326
  "step": 1092
327
  },
328
  {
329
  "epoch": 2.01,
330
- "grad_norm": 0.23782561719417572,
331
- "learning_rate": 0.0008068391866913124,
332
- "loss": 0.0493,
333
  "step": 1100
334
  },
335
  {
336
  "epoch": 2.06,
337
- "grad_norm": 1.8971781730651855,
338
- "learning_rate": 0.0008022181146025879,
339
- "loss": 0.0332,
340
  "step": 1125
341
  },
342
  {
343
  "epoch": 2.11,
344
- "grad_norm": 0.3028140664100647,
345
- "learning_rate": 0.0007975970425138632,
346
- "loss": 0.0718,
347
  "step": 1150
348
  },
349
  {
350
  "epoch": 2.15,
351
- "grad_norm": 0.9446136355400085,
352
- "learning_rate": 0.0007929759704251387,
353
- "loss": 0.0541,
354
  "step": 1175
355
  },
356
  {
357
  "epoch": 2.2,
358
- "grad_norm": 0.1498022973537445,
359
- "learning_rate": 0.000788354898336414,
360
- "loss": 0.0383,
361
  "step": 1200
362
  },
363
  {
364
  "epoch": 2.24,
365
- "grad_norm": 0.2323082983493805,
366
- "learning_rate": 0.0007837338262476895,
367
- "loss": 0.0326,
368
  "step": 1225
369
  },
370
  {
371
  "epoch": 2.29,
372
- "grad_norm": 0.15694022178649902,
373
- "learning_rate": 0.0007791127541589649,
374
- "loss": 0.0743,
375
  "step": 1250
376
  },
377
  {
378
  "epoch": 2.34,
379
- "grad_norm": 0.18881258368492126,
380
- "learning_rate": 0.0007744916820702403,
381
- "loss": 0.0398,
382
  "step": 1275
383
  },
384
  {
385
  "epoch": 2.38,
386
- "grad_norm": 0.5623793005943298,
387
- "learning_rate": 0.0007698706099815157,
388
- "loss": 0.0375,
389
  "step": 1300
390
  },
391
  {
392
  "epoch": 2.43,
393
- "grad_norm": 0.5120436549186707,
394
- "learning_rate": 0.0007652495378927911,
395
- "loss": 0.0303,
396
  "step": 1325
397
  },
398
  {
399
  "epoch": 2.47,
400
- "grad_norm": 0.22715122997760773,
401
- "learning_rate": 0.0007606284658040665,
402
- "loss": 0.0324,
403
  "step": 1350
404
  },
405
  {
406
  "epoch": 2.52,
407
- "grad_norm": 0.235545352101326,
408
- "learning_rate": 0.000756007393715342,
409
- "loss": 0.0303,
410
  "step": 1375
411
  },
412
  {
413
  "epoch": 2.56,
414
- "grad_norm": 0.29885298013687134,
415
- "learning_rate": 0.0007513863216266173,
416
- "loss": 0.0302,
417
  "step": 1400
418
  },
419
  {
420
  "epoch": 2.61,
421
- "grad_norm": 0.30983462929725647,
422
- "learning_rate": 0.0007467652495378928,
423
- "loss": 0.0306,
424
  "step": 1425
425
  },
426
  {
427
  "epoch": 2.66,
428
- "grad_norm": 0.3211610019207001,
429
- "learning_rate": 0.0007421441774491681,
430
- "loss": 0.0223,
431
  "step": 1450
432
  },
433
  {
434
  "epoch": 2.7,
435
- "grad_norm": 0.020555827766656876,
436
- "learning_rate": 0.0007375231053604437,
437
- "loss": 0.0236,
438
  "step": 1475
439
  },
440
  {
441
  "epoch": 2.75,
442
- "grad_norm": 0.3003218472003937,
443
- "learning_rate": 0.000732902033271719,
444
- "loss": 0.0292,
445
  "step": 1500
446
  },
447
  {
448
  "epoch": 2.79,
449
- "grad_norm": 0.23368410766124725,
450
- "learning_rate": 0.0007282809611829945,
451
- "loss": 0.035,
452
  "step": 1525
453
  },
454
  {
455
  "epoch": 2.84,
456
- "grad_norm": 0.24805304408073425,
457
- "learning_rate": 0.00072365988909427,
458
- "loss": 0.0168,
459
  "step": 1550
460
  },
461
  {
462
  "epoch": 2.88,
463
- "grad_norm": 0.1878635734319687,
464
- "learning_rate": 0.0007190388170055453,
465
- "loss": 0.0277,
466
  "step": 1575
467
  },
468
  {
469
  "epoch": 2.93,
470
- "grad_norm": 0.3910519480705261,
471
- "learning_rate": 0.0007144177449168208,
472
- "loss": 0.0317,
473
  "step": 1600
474
  },
475
  {
476
  "epoch": 2.98,
477
- "grad_norm": 0.12426433712244034,
478
- "learning_rate": 0.0007097966728280961,
479
- "loss": 0.0287,
480
  "step": 1625
481
  },
482
  {
483
  "epoch": 3.0,
484
- "eval_loss": 0.16812074184417725,
485
- "eval_runtime": 461.1656,
486
- "eval_samples_per_second": 1.763,
487
- "eval_steps_per_second": 0.295,
488
  "step": 1638
489
  },
490
  {
491
  "epoch": 3.02,
492
- "grad_norm": 0.009334838949143887,
493
- "learning_rate": 0.0007051756007393716,
494
- "loss": 0.0203,
495
  "step": 1650
496
  },
497
  {
498
  "epoch": 3.07,
499
- "grad_norm": 0.15624983608722687,
500
- "learning_rate": 0.000700554528650647,
501
- "loss": 0.0222,
502
  "step": 1675
503
  },
504
  {
505
  "epoch": 3.11,
506
- "grad_norm": 0.016200415790081024,
507
- "learning_rate": 0.0006959334565619224,
508
- "loss": 0.0157,
509
  "step": 1700
510
  },
511
  {
512
  "epoch": 3.16,
513
- "grad_norm": 0.23733393847942352,
514
- "learning_rate": 0.0006913123844731978,
515
- "loss": 0.0291,
516
  "step": 1725
517
  },
518
  {
519
  "epoch": 3.21,
520
- "grad_norm": 0.3168778419494629,
521
- "learning_rate": 0.0006866913123844732,
522
- "loss": 0.0256,
523
  "step": 1750
524
  },
525
  {
526
  "epoch": 3.25,
527
- "grad_norm": 0.22910478711128235,
528
- "learning_rate": 0.0006820702402957486,
529
- "loss": 0.0211,
530
  "step": 1775
531
  },
532
  {
533
  "epoch": 3.3,
534
- "grad_norm": 0.17075732350349426,
535
- "learning_rate": 0.000677449168207024,
536
- "loss": 0.0267,
537
  "step": 1800
538
  },
539
  {
540
  "epoch": 3.34,
541
- "grad_norm": 0.17666079103946686,
542
- "learning_rate": 0.0006728280961182994,
543
- "loss": 0.0187,
544
  "step": 1825
545
  },
546
  {
547
  "epoch": 3.39,
548
- "grad_norm": 0.020759843289852142,
549
- "learning_rate": 0.0006682070240295749,
550
- "loss": 0.0265,
551
  "step": 1850
552
  },
553
  {
554
  "epoch": 3.43,
555
- "grad_norm": 0.04912843555212021,
556
- "learning_rate": 0.0006635859519408502,
557
- "loss": 0.0359,
558
  "step": 1875
559
  },
560
  {
561
  "epoch": 3.48,
562
- "grad_norm": 0.32245051860809326,
563
- "learning_rate": 0.0006589648798521257,
564
- "loss": 0.0175,
565
  "step": 1900
566
  },
567
  {
568
  "epoch": 3.53,
569
- "grad_norm": 0.11074571311473846,
570
- "learning_rate": 0.000654343807763401,
571
- "loss": 0.0178,
572
  "step": 1925
573
  },
574
  {
575
  "epoch": 3.57,
576
- "grad_norm": 0.03214950114488602,
577
- "learning_rate": 0.0006497227356746766,
578
- "loss": 0.0192,
579
  "step": 1950
580
  },
581
  {
582
  "epoch": 3.62,
583
- "grad_norm": 0.008081772364675999,
584
- "learning_rate": 0.000645101663585952,
585
- "loss": 0.0317,
586
  "step": 1975
587
  },
588
  {
589
  "epoch": 3.66,
590
- "grad_norm": 0.1419508308172226,
591
- "learning_rate": 0.0006404805914972274,
592
- "loss": 0.0297,
593
  "step": 2000
594
  },
595
  {
596
  "epoch": 3.71,
597
- "grad_norm": 0.01866302080452442,
598
- "learning_rate": 0.0006358595194085029,
599
- "loss": 0.0264,
600
  "step": 2025
601
  },
602
  {
603
  "epoch": 3.75,
604
- "grad_norm": 0.0233779214322567,
605
- "learning_rate": 0.0006312384473197782,
606
- "loss": 0.0285,
607
  "step": 2050
608
  },
609
  {
610
  "epoch": 3.8,
611
- "grad_norm": 0.023354342207312584,
612
- "learning_rate": 0.0006266173752310537,
613
- "loss": 0.021,
614
  "step": 2075
615
  },
616
  {
617
  "epoch": 3.85,
618
- "grad_norm": 0.2687942087650299,
619
- "learning_rate": 0.000621996303142329,
620
  "loss": 0.0201,
621
  "step": 2100
622
  },
623
  {
624
  "epoch": 3.89,
625
- "grad_norm": 0.15729880332946777,
626
- "learning_rate": 0.0006173752310536045,
627
- "loss": 0.0433,
628
  "step": 2125
629
  },
630
  {
631
  "epoch": 3.94,
632
- "grad_norm": 0.13736210763454437,
633
- "learning_rate": 0.0006127541589648799,
634
- "loss": 0.0269,
635
  "step": 2150
636
  },
637
  {
638
  "epoch": 3.98,
639
- "grad_norm": 0.17346683144569397,
640
- "learning_rate": 0.0006081330868761553,
641
- "loss": 0.0148,
642
  "step": 2175
643
  },
644
  {
645
  "epoch": 4.0,
646
- "eval_loss": 0.1938144713640213,
647
- "eval_runtime": 460.937,
648
- "eval_samples_per_second": 1.764,
649
- "eval_steps_per_second": 0.295,
650
  "step": 2184
651
  },
652
  {
653
  "epoch": 4.03,
654
- "grad_norm": 0.1308988779783249,
655
- "learning_rate": 0.0006035120147874307,
656
- "loss": 0.0195,
657
  "step": 2200
658
  },
659
  {
660
  "epoch": 4.08,
661
- "grad_norm": 0.21798363327980042,
662
- "learning_rate": 0.0005988909426987061,
663
- "loss": 0.018,
664
  "step": 2225
665
  },
666
  {
667
  "epoch": 4.12,
668
- "grad_norm": 0.059930045157670975,
669
- "learning_rate": 0.0005942698706099815,
670
- "loss": 0.0157,
671
  "step": 2250
672
  },
673
  {
674
  "epoch": 4.17,
675
- "grad_norm": 1.1142582893371582,
676
- "learning_rate": 0.000589648798521257,
677
- "loss": 0.0252,
678
  "step": 2275
679
  },
680
  {
681
  "epoch": 4.21,
682
- "grad_norm": 0.17724983394145966,
683
- "learning_rate": 0.0005850277264325323,
684
- "loss": 0.0251,
685
  "step": 2300
686
  },
687
  {
688
  "epoch": 4.26,
689
- "grad_norm": 0.7539493441581726,
690
- "learning_rate": 0.0005804066543438078,
691
- "loss": 0.0226,
692
  "step": 2325
693
  },
694
  {
695
  "epoch": 4.3,
696
- "grad_norm": 0.15379472076892853,
697
- "learning_rate": 0.0005757855822550831,
698
- "loss": 0.0154,
699
  "step": 2350
700
  },
701
  {
702
  "epoch": 4.35,
703
- "grad_norm": 0.11480142921209335,
704
- "learning_rate": 0.0005711645101663586,
705
- "loss": 0.0302,
706
  "step": 2375
707
  },
708
  {
709
  "epoch": 4.4,
710
- "grad_norm": 0.29920852184295654,
711
- "learning_rate": 0.0005665434380776339,
712
- "loss": 0.0223,
713
  "step": 2400
714
  },
715
  {
716
  "epoch": 4.44,
717
- "grad_norm": 0.2625471353530884,
718
- "learning_rate": 0.0005619223659889095,
719
- "loss": 0.0126,
720
  "step": 2425
721
  },
722
  {
723
  "epoch": 4.49,
724
- "grad_norm": 0.2014468014240265,
725
- "learning_rate": 0.0005573012939001849,
726
- "loss": 0.0262,
727
  "step": 2450
728
  },
729
  {
730
  "epoch": 4.53,
731
- "grad_norm": 0.20631028711795807,
732
- "learning_rate": 0.0005526802218114603,
733
- "loss": 0.0128,
734
  "step": 2475
735
  },
736
  {
737
  "epoch": 4.58,
738
- "grad_norm": 0.1370575875043869,
739
- "learning_rate": 0.0005480591497227358,
740
  "loss": 0.0156,
741
  "step": 2500
742
  },
743
  {
744
  "epoch": 4.62,
745
- "grad_norm": 0.058717742562294006,
746
- "learning_rate": 0.0005434380776340111,
747
- "loss": 0.0162,
748
  "step": 2525
749
  },
750
  {
751
  "epoch": 4.67,
752
- "grad_norm": 0.010219058953225613,
753
- "learning_rate": 0.0005388170055452866,
754
- "loss": 0.0169,
755
  "step": 2550
756
  },
757
  {
758
  "epoch": 4.72,
759
- "grad_norm": 0.1966046839952469,
760
- "learning_rate": 0.0005341959334565619,
761
- "loss": 0.0138,
762
  "step": 2575
763
  },
764
  {
765
  "epoch": 4.76,
766
- "grad_norm": 0.4009633958339691,
767
- "learning_rate": 0.0005295748613678374,
768
- "loss": 0.0219,
769
  "step": 2600
770
  },
771
  {
772
  "epoch": 4.81,
773
- "grad_norm": 0.002215989399701357,
774
- "learning_rate": 0.0005249537892791128,
775
- "loss": 0.0141,
776
  "step": 2625
777
  },
778
  {
779
  "epoch": 4.85,
780
- "grad_norm": 0.31632617115974426,
781
- "learning_rate": 0.0005203327171903882,
782
- "loss": 0.0121,
783
  "step": 2650
784
  },
785
  {
786
  "epoch": 4.9,
787
- "grad_norm": 0.003695544321089983,
788
- "learning_rate": 0.0005157116451016636,
789
- "loss": 0.0152,
790
  "step": 2675
791
  },
792
  {
793
  "epoch": 4.95,
794
- "grad_norm": 0.03792522847652435,
795
- "learning_rate": 0.000511090573012939,
796
- "loss": 0.0248,
797
  "step": 2700
798
  },
799
  {
800
  "epoch": 4.99,
801
- "grad_norm": 0.22871936857700348,
802
- "learning_rate": 0.0005064695009242144,
803
- "loss": 0.0263,
804
  "step": 2725
805
  },
806
  {
807
  "epoch": 5.0,
808
- "eval_loss": 0.1845603585243225,
809
- "eval_runtime": 463.3179,
810
- "eval_samples_per_second": 1.755,
811
- "eval_steps_per_second": 0.294,
812
  "step": 2730
813
  },
814
  {
815
  "epoch": 5.04,
816
- "grad_norm": 0.2171017825603485,
817
- "learning_rate": 0.0005018484288354898,
818
- "loss": 0.0141,
819
  "step": 2750
820
  },
821
  {
822
  "epoch": 5.08,
823
- "grad_norm": 0.0038620266132056713,
824
- "learning_rate": 0.0004972273567467653,
825
- "loss": 0.0058,
826
  "step": 2775
827
  },
828
  {
829
  "epoch": 5.13,
830
- "grad_norm": 0.2298993319272995,
831
- "learning_rate": 0.0004926062846580407,
832
- "loss": 0.0109,
833
  "step": 2800
834
  },
835
  {
836
  "epoch": 5.17,
837
- "grad_norm": 0.11024118214845657,
838
- "learning_rate": 0.0004879852125693161,
839
- "loss": 0.0189,
840
  "step": 2825
841
  },
842
  {
843
  "epoch": 5.22,
844
- "grad_norm": 0.0567074678838253,
845
- "learning_rate": 0.00048336414048059153,
846
- "loss": 0.0089,
847
  "step": 2850
848
  },
849
  {
850
  "epoch": 5.27,
851
- "grad_norm": 0.004749275743961334,
852
- "learning_rate": 0.00047874306839186694,
853
- "loss": 0.0114,
854
  "step": 2875
855
  },
856
  {
857
  "epoch": 5.31,
858
- "grad_norm": 0.14577604830265045,
859
- "learning_rate": 0.00047412199630314235,
860
- "loss": 0.0129,
861
  "step": 2900
862
  },
863
  {
864
  "epoch": 5.36,
865
- "grad_norm": 0.11412041634321213,
866
- "learning_rate": 0.00046950092421441775,
867
- "loss": 0.0127,
868
  "step": 2925
869
  },
870
  {
871
  "epoch": 5.4,
872
- "grad_norm": 0.29708778858184814,
873
- "learning_rate": 0.00046487985212569316,
874
- "loss": 0.0334,
875
  "step": 2950
876
  },
877
  {
878
  "epoch": 5.45,
879
- "grad_norm": 0.12024960666894913,
880
- "learning_rate": 0.00046025878003696857,
881
- "loss": 0.0153,
882
  "step": 2975
883
  },
884
  {
885
  "epoch": 5.49,
886
- "grad_norm": 0.12928707897663116,
887
- "learning_rate": 0.000455637707948244,
888
- "loss": 0.0153,
889
  "step": 3000
890
  },
891
  {
892
  "epoch": 5.54,
893
- "grad_norm": 0.1562725156545639,
894
- "learning_rate": 0.0004510166358595194,
895
- "loss": 0.0174,
896
  "step": 3025
897
  },
898
  {
899
  "epoch": 5.59,
900
- "grad_norm": 0.270773321390152,
901
- "learning_rate": 0.00044639556377079484,
902
- "loss": 0.0221,
903
  "step": 3050
904
  },
905
  {
906
  "epoch": 5.63,
907
- "grad_norm": 0.36300143599510193,
908
- "learning_rate": 0.00044177449168207025,
909
- "loss": 0.0219,
910
  "step": 3075
911
  },
912
  {
913
  "epoch": 5.68,
914
- "grad_norm": 0.1869664192199707,
915
- "learning_rate": 0.00043715341959334566,
916
- "loss": 0.0127,
917
  "step": 3100
918
  },
919
  {
920
  "epoch": 5.72,
921
- "grad_norm": 0.23992718756198883,
922
- "learning_rate": 0.00043253234750462107,
923
- "loss": 0.0144,
924
  "step": 3125
925
  },
926
  {
927
  "epoch": 5.77,
928
- "grad_norm": 0.0021614329889416695,
929
- "learning_rate": 0.00042791127541589647,
930
- "loss": 0.0115,
931
  "step": 3150
932
  },
933
  {
934
  "epoch": 5.82,
935
- "grad_norm": 0.10629579424858093,
936
- "learning_rate": 0.0004232902033271719,
937
- "loss": 0.0097,
938
  "step": 3175
939
  },
940
  {
941
  "epoch": 5.86,
942
- "grad_norm": 0.15990929305553436,
943
- "learning_rate": 0.00041866913123844734,
944
- "loss": 0.0118,
945
  "step": 3200
946
  },
947
  {
948
  "epoch": 5.91,
949
- "grad_norm": 0.19984115660190582,
950
- "learning_rate": 0.00041404805914972275,
951
- "loss": 0.014,
952
  "step": 3225
953
  },
954
  {
955
  "epoch": 5.95,
956
- "grad_norm": 0.06720598042011261,
957
- "learning_rate": 0.00040942698706099816,
958
- "loss": 0.0182,
959
  "step": 3250
960
  },
961
  {
962
  "epoch": 6.0,
963
- "grad_norm": 0.006452410947531462,
964
- "learning_rate": 0.0004048059149722736,
965
- "loss": 0.0168,
966
  "step": 3275
967
  },
968
  {
969
  "epoch": 6.0,
970
- "eval_loss": 0.18991737067699432,
971
- "eval_runtime": 463.6922,
972
- "eval_samples_per_second": 1.753,
973
- "eval_steps_per_second": 0.293,
974
  "step": 3276
975
  },
976
  {
977
  "epoch": 6.04,
978
- "grad_norm": 0.003035791451111436,
979
- "learning_rate": 0.000400184842883549,
980
- "loss": 0.0103,
981
  "step": 3300
982
  },
983
  {
984
  "epoch": 6.09,
985
- "grad_norm": 0.20400433242321014,
986
- "learning_rate": 0.00039556377079482443,
987
- "loss": 0.0094,
988
  "step": 3325
989
  },
990
  {
991
  "epoch": 6.14,
992
- "grad_norm": 0.05333884805440903,
993
- "learning_rate": 0.00039094269870609984,
994
- "loss": 0.0135,
995
  "step": 3350
996
  },
997
  {
998
  "epoch": 6.18,
999
- "grad_norm": 0.0005341291544027627,
1000
- "learning_rate": 0.00038632162661737525,
1001
- "loss": 0.0042,
1002
  "step": 3375
1003
  },
1004
  {
1005
  "epoch": 6.23,
1006
- "grad_norm": 0.008850090205669403,
1007
- "learning_rate": 0.00038170055452865065,
1008
- "loss": 0.012,
1009
  "step": 3400
1010
  },
1011
  {
1012
  "epoch": 6.27,
1013
- "grad_norm": 0.013096541166305542,
1014
- "learning_rate": 0.00037707948243992606,
1015
- "loss": 0.0121,
1016
  "step": 3425
1017
  },
1018
  {
1019
  "epoch": 6.32,
1020
- "grad_norm": 0.04941894859075546,
1021
- "learning_rate": 0.00037245841035120147,
1022
- "loss": 0.0086,
1023
  "step": 3450
1024
  },
1025
  {
1026
  "epoch": 6.36,
1027
- "grad_norm": 0.02113133668899536,
1028
- "learning_rate": 0.0003678373382624769,
1029
- "loss": 0.0109,
1030
  "step": 3475
1031
  },
1032
  {
1033
  "epoch": 6.41,
1034
- "grad_norm": 0.03568890690803528,
1035
- "learning_rate": 0.0003632162661737523,
1036
- "loss": 0.0072,
1037
  "step": 3500
1038
  },
1039
  {
1040
  "epoch": 6.46,
1041
- "grad_norm": 0.0471993163228035,
1042
- "learning_rate": 0.00035859519408502774,
1043
- "loss": 0.0074,
1044
  "step": 3525
1045
  },
1046
  {
1047
  "epoch": 6.5,
1048
- "grad_norm": 0.13575506210327148,
1049
- "learning_rate": 0.00035397412199630315,
1050
- "loss": 0.0084,
1051
  "step": 3550
1052
  },
1053
  {
1054
  "epoch": 6.55,
1055
- "grad_norm": 0.14595919847488403,
1056
- "learning_rate": 0.00034935304990757856,
1057
- "loss": 0.0063,
1058
  "step": 3575
1059
  },
1060
  {
1061
  "epoch": 6.59,
1062
- "grad_norm": 0.012155482545495033,
1063
- "learning_rate": 0.00034473197781885397,
1064
- "loss": 0.0092,
1065
  "step": 3600
1066
  },
1067
  {
1068
  "epoch": 6.64,
1069
- "grad_norm": 0.010629130527377129,
1070
- "learning_rate": 0.0003401109057301294,
1071
- "loss": 0.0134,
1072
  "step": 3625
1073
  },
1074
  {
1075
  "epoch": 6.68,
1076
- "grad_norm": 0.19480323791503906,
1077
- "learning_rate": 0.00033548983364140483,
1078
- "loss": 0.0141,
1079
  "step": 3650
1080
  },
1081
  {
1082
  "epoch": 6.73,
1083
- "grad_norm": 0.0660039409995079,
1084
- "learning_rate": 0.00033086876155268024,
1085
- "loss": 0.0077,
1086
  "step": 3675
1087
  },
1088
  {
1089
  "epoch": 6.78,
1090
- "grad_norm": 0.0346703939139843,
1091
- "learning_rate": 0.00032624768946395565,
1092
- "loss": 0.0117,
1093
  "step": 3700
1094
  },
1095
  {
1096
  "epoch": 6.82,
1097
- "grad_norm": 0.22081167995929718,
1098
- "learning_rate": 0.00032162661737523106,
1099
- "loss": 0.0099,
1100
  "step": 3725
1101
  },
1102
  {
1103
  "epoch": 6.87,
1104
- "grad_norm": 0.06806311756372452,
1105
- "learning_rate": 0.0003170055452865065,
1106
- "loss": 0.0115,
1107
  "step": 3750
1108
  },
1109
  {
1110
  "epoch": 6.91,
1111
- "grad_norm": 0.0030449284240603447,
1112
- "learning_rate": 0.0003123844731977819,
1113
- "loss": 0.0136,
1114
  "step": 3775
1115
  },
1116
  {
1117
  "epoch": 6.96,
1118
- "grad_norm": 0.07848404347896576,
1119
- "learning_rate": 0.00030776340110905733,
1120
- "loss": 0.0086,
1121
  "step": 3800
1122
  },
1123
  {
1124
  "epoch": 7.0,
1125
- "eval_loss": 0.19748586416244507,
1126
- "eval_runtime": 457.6513,
1127
- "eval_samples_per_second": 1.776,
1128
- "eval_steps_per_second": 0.297,
1129
  "step": 3822
1130
  },
1131
  {
1132
  "epoch": 7.01,
1133
- "grad_norm": 0.04702220484614372,
1134
- "learning_rate": 0.00030314232902033274,
1135
- "loss": 0.009,
1136
  "step": 3825
1137
  },
1138
  {
1139
  "epoch": 7.05,
1140
- "grad_norm": 0.026063207536935806,
1141
- "learning_rate": 0.00029852125693160815,
1142
- "loss": 0.0079,
1143
  "step": 3850
1144
  },
1145
  {
1146
  "epoch": 7.1,
1147
- "grad_norm": 0.021074611693620682,
1148
- "learning_rate": 0.00029390018484288355,
1149
- "loss": 0.0042,
1150
  "step": 3875
1151
  },
1152
  {
1153
  "epoch": 7.14,
1154
- "grad_norm": 0.15062950551509857,
1155
- "learning_rate": 0.00028927911275415896,
1156
- "loss": 0.0073,
1157
  "step": 3900
1158
  },
1159
  {
1160
  "epoch": 7.19,
1161
- "grad_norm": 0.12703749537467957,
1162
- "learning_rate": 0.00028465804066543437,
1163
- "loss": 0.0047,
1164
  "step": 3925
1165
  },
1166
  {
1167
  "epoch": 7.23,
1168
- "grad_norm": 0.0032193493098020554,
1169
- "learning_rate": 0.0002800369685767098,
1170
- "loss": 0.0071,
1171
  "step": 3950
1172
  },
1173
  {
1174
  "epoch": 7.28,
1175
- "grad_norm": 0.046458516269922256,
1176
- "learning_rate": 0.0002754158964879852,
1177
- "loss": 0.0074,
1178
  "step": 3975
1179
  },
1180
  {
1181
  "epoch": 7.33,
1182
- "grad_norm": 0.0037984629161655903,
1183
- "learning_rate": 0.00027079482439926065,
1184
- "loss": 0.0059,
1185
  "step": 4000
1186
  },
1187
  {
1188
  "epoch": 7.37,
1189
- "grad_norm": 0.14948821067810059,
1190
- "learning_rate": 0.00026617375231053605,
1191
- "loss": 0.0058,
1192
  "step": 4025
1193
  },
1194
  {
1195
  "epoch": 7.42,
1196
- "grad_norm": 0.07740973681211472,
1197
- "learning_rate": 0.00026155268022181146,
1198
- "loss": 0.0069,
1199
  "step": 4050
1200
  },
1201
  {
1202
  "epoch": 7.46,
1203
- "grad_norm": 0.0008731162524782121,
1204
- "learning_rate": 0.00025693160813308687,
1205
- "loss": 0.0077,
1206
  "step": 4075
1207
  },
1208
  {
1209
  "epoch": 7.51,
1210
- "grad_norm": 0.001257123309187591,
1211
- "learning_rate": 0.0002523105360443623,
1212
- "loss": 0.0046,
1213
  "step": 4100
1214
  },
1215
  {
1216
  "epoch": 7.55,
1217
- "grad_norm": 0.042853593826293945,
1218
- "learning_rate": 0.00024768946395563774,
1219
- "loss": 0.0065,
1220
  "step": 4125
1221
  },
1222
  {
1223
  "epoch": 7.6,
1224
- "grad_norm": 0.0009361489792354405,
1225
- "learning_rate": 0.00024306839186691312,
1226
- "loss": 0.0073,
1227
  "step": 4150
1228
  },
1229
  {
1230
  "epoch": 7.65,
1231
- "grad_norm": 0.04179251566529274,
1232
- "learning_rate": 0.00023844731977818855,
1233
- "loss": 0.0043,
1234
  "step": 4175
1235
  },
1236
  {
1237
  "epoch": 7.69,
1238
- "grad_norm": 0.008879727683961391,
1239
- "learning_rate": 0.00023382624768946396,
1240
- "loss": 0.0095,
1241
  "step": 4200
1242
  },
1243
  {
1244
  "epoch": 7.74,
1245
- "grad_norm": 0.12861858308315277,
1246
- "learning_rate": 0.00022920517560073937,
1247
- "loss": 0.008,
1248
  "step": 4225
1249
  },
1250
  {
1251
  "epoch": 7.78,
1252
- "grad_norm": 0.01530044712126255,
1253
- "learning_rate": 0.0002245841035120148,
1254
- "loss": 0.0032,
1255
  "step": 4250
1256
  },
1257
  {
1258
  "epoch": 7.83,
1259
- "grad_norm": 0.02794441021978855,
1260
- "learning_rate": 0.0002199630314232902,
1261
- "loss": 0.0087,
1262
  "step": 4275
1263
  },
1264
  {
1265
  "epoch": 7.88,
1266
- "grad_norm": 0.16127441823482513,
1267
- "learning_rate": 0.00021534195933456564,
1268
- "loss": 0.0063,
1269
  "step": 4300
1270
  },
1271
  {
1272
  "epoch": 7.92,
1273
- "grad_norm": 0.09033193439245224,
1274
- "learning_rate": 0.00021072088724584105,
1275
- "loss": 0.0086,
1276
  "step": 4325
1277
  },
1278
  {
1279
  "epoch": 7.97,
1280
- "grad_norm": 0.016767054796218872,
1281
- "learning_rate": 0.00020609981515711646,
1282
- "loss": 0.0102,
1283
  "step": 4350
1284
  },
1285
  {
1286
  "epoch": 8.0,
1287
- "eval_loss": 0.21702823042869568,
1288
- "eval_runtime": 460.637,
1289
- "eval_samples_per_second": 1.765,
1290
- "eval_steps_per_second": 0.295,
1291
  "step": 4368
1292
  },
1293
  {
1294
  "epoch": 8.01,
1295
- "grad_norm": 0.10574544966220856,
1296
- "learning_rate": 0.00020147874306839186,
1297
- "loss": 0.0047,
1298
  "step": 4375
1299
  },
1300
  {
1301
  "epoch": 8.06,
1302
- "grad_norm": 0.0006509744562208652,
1303
- "learning_rate": 0.00019685767097966727,
1304
- "loss": 0.005,
1305
  "step": 4400
1306
  },
1307
  {
1308
  "epoch": 8.1,
1309
- "grad_norm": 0.0009672873420640826,
1310
- "learning_rate": 0.0001922365988909427,
1311
- "loss": 0.0046,
1312
  "step": 4425
1313
  },
1314
  {
1315
  "epoch": 8.15,
1316
- "grad_norm": 0.07224971055984497,
1317
- "learning_rate": 0.0001876155268022181,
1318
- "loss": 0.0043,
1319
  "step": 4450
1320
  },
1321
  {
1322
  "epoch": 8.2,
1323
- "grad_norm": 0.12703950703144073,
1324
- "learning_rate": 0.00018299445471349355,
1325
- "loss": 0.0058,
1326
  "step": 4475
1327
  },
1328
  {
1329
  "epoch": 8.24,
1330
- "grad_norm": 0.0013393750414252281,
1331
- "learning_rate": 0.00017837338262476895,
1332
- "loss": 0.0041,
1333
  "step": 4500
1334
  },
1335
  {
1336
  "epoch": 8.29,
1337
- "grad_norm": 0.03772176802158356,
1338
- "learning_rate": 0.00017375231053604436,
1339
- "loss": 0.0043,
1340
  "step": 4525
1341
  },
1342
  {
1343
  "epoch": 8.33,
1344
- "grad_norm": 0.0009993729181587696,
1345
- "learning_rate": 0.0001691312384473198,
1346
- "loss": 0.0051,
1347
  "step": 4550
1348
  },
1349
  {
1350
  "epoch": 8.38,
1351
- "grad_norm": 0.0068801455199718475,
1352
- "learning_rate": 0.0001645101663585952,
1353
- "loss": 0.0042,
1354
  "step": 4575
1355
  },
1356
  {
1357
  "epoch": 8.42,
1358
- "grad_norm": 0.05337873101234436,
1359
- "learning_rate": 0.0001598890942698706,
1360
- "loss": 0.0048,
1361
  "step": 4600
1362
  },
1363
  {
1364
  "epoch": 8.47,
1365
- "grad_norm": 0.022360146045684814,
1366
- "learning_rate": 0.00015526802218114602,
1367
- "loss": 0.0058,
1368
  "step": 4625
1369
  },
1370
  {
1371
  "epoch": 8.52,
1372
- "grad_norm": 0.13477857410907745,
1373
- "learning_rate": 0.00015064695009242142,
1374
- "loss": 0.0029,
1375
  "step": 4650
1376
  },
1377
  {
1378
  "epoch": 8.56,
1379
- "grad_norm": 0.23147088289260864,
1380
- "learning_rate": 0.00014602587800369686,
1381
- "loss": 0.0057,
1382
  "step": 4675
1383
  },
1384
  {
1385
  "epoch": 8.61,
1386
- "grad_norm": 0.0034095763694494963,
1387
- "learning_rate": 0.0001414048059149723,
1388
- "loss": 0.0043,
1389
  "step": 4700
1390
  },
1391
  {
1392
  "epoch": 8.65,
1393
- "grad_norm": 0.024832753464579582,
1394
- "learning_rate": 0.0001367837338262477,
1395
- "loss": 0.0043,
1396
  "step": 4725
1397
  },
1398
  {
1399
  "epoch": 8.7,
1400
- "grad_norm": 0.0007142982794903219,
1401
- "learning_rate": 0.0001321626617375231,
1402
- "loss": 0.0052,
1403
  "step": 4750
1404
  },
1405
  {
1406
  "epoch": 8.75,
1407
- "grad_norm": 0.02869781292974949,
1408
- "learning_rate": 0.00012754158964879852,
1409
- "loss": 0.0027,
1410
  "step": 4775
1411
  },
1412
  {
1413
  "epoch": 8.79,
1414
- "grad_norm": 0.0004513516614679247,
1415
- "learning_rate": 0.00012292051756007395,
1416
- "loss": 0.0036,
1417
  "step": 4800
1418
  },
1419
  {
1420
  "epoch": 8.84,
1421
- "grad_norm": 0.0008141061407513916,
1422
- "learning_rate": 0.00011829944547134936,
1423
- "loss": 0.0063,
1424
  "step": 4825
1425
  },
1426
  {
1427
  "epoch": 8.88,
1428
- "grad_norm": 0.054795410484075546,
1429
- "learning_rate": 0.00011367837338262476,
1430
- "loss": 0.0021,
1431
  "step": 4850
1432
  },
1433
  {
1434
  "epoch": 8.93,
1435
- "grad_norm": 0.02664073184132576,
1436
- "learning_rate": 0.0001090573012939002,
1437
- "loss": 0.0047,
1438
  "step": 4875
1439
  },
1440
  {
1441
  "epoch": 8.97,
1442
- "grad_norm": 0.0008485654252581298,
1443
- "learning_rate": 0.0001044362292051756,
1444
- "loss": 0.0023,
1445
  "step": 4900
1446
  },
1447
  {
1448
  "epoch": 9.0,
1449
- "eval_loss": 0.22940348088741302,
1450
- "eval_runtime": 460.0411,
1451
- "eval_samples_per_second": 1.767,
1452
- "eval_steps_per_second": 0.296,
1453
  "step": 4914
1454
  },
1455
  {
1456
  "epoch": 9.02,
1457
- "grad_norm": 0.03195321932435036,
1458
- "learning_rate": 9.981515711645101e-05,
1459
- "loss": 0.0034,
1460
  "step": 4925
1461
  },
1462
  {
1463
  "epoch": 9.07,
1464
- "grad_norm": 0.0012715512420982122,
1465
- "learning_rate": 9.519408502772643e-05,
1466
- "loss": 0.0022,
1467
  "step": 4950
1468
  },
1469
  {
1470
  "epoch": 9.11,
1471
- "grad_norm": 0.010319654829800129,
1472
- "learning_rate": 9.057301293900184e-05,
1473
- "loss": 0.0024,
1474
  "step": 4975
1475
  },
1476
  {
1477
  "epoch": 9.16,
1478
- "grad_norm": 0.003004108089953661,
1479
- "learning_rate": 8.595194085027728e-05,
1480
- "loss": 0.0043,
1481
  "step": 5000
1482
  },
1483
  {
1484
  "epoch": 9.2,
1485
- "grad_norm": 0.0019885245710611343,
1486
- "learning_rate": 8.133086876155268e-05,
1487
- "loss": 0.0022,
1488
  "step": 5025
1489
  },
1490
  {
1491
  "epoch": 9.25,
1492
- "grad_norm": 0.06616940349340439,
1493
- "learning_rate": 7.67097966728281e-05,
1494
- "loss": 0.0022,
1495
  "step": 5050
1496
  },
1497
  {
1498
  "epoch": 9.29,
1499
- "grad_norm": 0.044188376516103745,
1500
- "learning_rate": 7.208872458410351e-05,
1501
- "loss": 0.0021,
1502
  "step": 5075
1503
  },
1504
  {
1505
  "epoch": 9.34,
1506
- "grad_norm": 0.07102042436599731,
1507
- "learning_rate": 6.746765249537892e-05,
1508
- "loss": 0.0023,
1509
  "step": 5100
1510
  },
1511
  {
1512
  "epoch": 9.39,
1513
- "grad_norm": 0.018956031650304794,
1514
- "learning_rate": 6.284658040665435e-05,
1515
- "loss": 0.002,
1516
  "step": 5125
1517
  },
1518
  {
1519
  "epoch": 9.43,
1520
- "grad_norm": 0.007081813644617796,
1521
- "learning_rate": 5.822550831792976e-05,
1522
- "loss": 0.0012,
1523
  "step": 5150
1524
  },
1525
  {
1526
  "epoch": 9.48,
1527
- "grad_norm": 0.004433480557054281,
1528
- "learning_rate": 5.3604436229205174e-05,
1529
- "loss": 0.002,
1530
  "step": 5175
1531
  },
1532
  {
1533
  "epoch": 9.52,
1534
- "grad_norm": 0.0015681196236982942,
1535
- "learning_rate": 4.8983364140480595e-05,
1536
- "loss": 0.002,
1537
  "step": 5200
1538
  },
1539
  {
1540
  "epoch": 9.57,
1541
- "grad_norm": 0.003421030705794692,
1542
- "learning_rate": 4.436229205175601e-05,
1543
- "loss": 0.0018,
1544
  "step": 5225
1545
  },
1546
  {
1547
  "epoch": 9.62,
1548
- "grad_norm": 0.13036024570465088,
1549
- "learning_rate": 3.974121996303143e-05,
1550
- "loss": 0.0016,
1551
  "step": 5250
1552
  },
1553
  {
1554
  "epoch": 9.66,
1555
- "grad_norm": 0.05646170675754547,
1556
- "learning_rate": 3.5120147874306844e-05,
1557
- "loss": 0.0017,
1558
  "step": 5275
1559
  },
1560
  {
1561
  "epoch": 9.71,
1562
- "grad_norm": 0.11519595235586166,
1563
- "learning_rate": 3.0499075785582258e-05,
1564
- "loss": 0.0041,
1565
  "step": 5300
1566
  },
1567
  {
1568
  "epoch": 9.75,
1569
- "grad_norm": 0.08969979733228683,
1570
- "learning_rate": 2.5878003696857672e-05,
1571
- "loss": 0.0022,
1572
  "step": 5325
1573
  },
1574
  {
1575
  "epoch": 9.8,
1576
- "grad_norm": 0.13669085502624512,
1577
- "learning_rate": 2.1256931608133086e-05,
1578
- "loss": 0.0014,
1579
  "step": 5350
1580
  },
1581
  {
1582
  "epoch": 9.84,
1583
- "grad_norm": 0.13668496906757355,
1584
- "learning_rate": 1.6635859519408503e-05,
1585
- "loss": 0.0021,
1586
  "step": 5375
1587
  },
1588
  {
1589
  "epoch": 9.89,
1590
- "grad_norm": 0.018298327922821045,
1591
- "learning_rate": 1.2014787430683919e-05,
1592
- "loss": 0.0023,
1593
  "step": 5400
1594
  },
1595
  {
1596
  "epoch": 9.94,
1597
- "grad_norm": 0.007483182940632105,
1598
- "learning_rate": 7.393715341959335e-06,
1599
- "loss": 0.0024,
1600
  "step": 5425
1601
  },
1602
  {
1603
  "epoch": 9.98,
1604
- "grad_norm": 0.03663533180952072,
1605
- "learning_rate": 2.7726432532347505e-06,
1606
- "loss": 0.0024,
1607
  "step": 5450
1608
  },
1609
  {
1610
  "epoch": 10.0,
1611
- "eval_loss": 0.23311161994934082,
1612
- "eval_runtime": 467.6697,
1613
- "eval_samples_per_second": 1.738,
1614
- "eval_steps_per_second": 0.291,
1615
  "step": 5460
1616
  },
1617
  {
1618
  "epoch": 10.0,
1619
  "step": 5460,
1620
  "total_flos": 1.135723105419264e+20,
1621
- "train_loss": 0.04421832180674096,
1622
- "train_runtime": 28971.5279,
1623
- "train_samples_per_second": 1.13,
1624
  "train_steps_per_second": 0.188
1625
  }
1626
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.05,
13
+ "grad_norm": 0.0019010665128007531,
14
  "learning_rate": 0.0005,
15
+ "loss": 0.0009,
16
  "step": 25
17
  },
18
  {
19
  "epoch": 0.09,
20
+ "grad_norm": 0.33505979180336,
21
  "learning_rate": 0.001,
22
+ "loss": 0.0059,
23
  "step": 50
24
  },
25
  {
26
  "epoch": 0.14,
27
+ "grad_norm": 2.521899700164795,
28
+ "learning_rate": 0.0009953789279112755,
29
+ "loss": 0.2569,
30
  "step": 75
31
  },
32
  {
33
  "epoch": 0.18,
34
+ "grad_norm": 1.3200310468673706,
35
+ "learning_rate": 0.000990757855822551,
36
+ "loss": 2.1666,
37
  "step": 100
38
  },
39
  {
40
  "epoch": 0.23,
41
+ "grad_norm": 1.1342906951904297,
42
+ "learning_rate": 0.0009861367837338264,
43
+ "loss": 0.3677,
44
  "step": 125
45
  },
46
  {
47
  "epoch": 0.27,
48
+ "grad_norm": 0.6839067935943604,
49
+ "learning_rate": 0.0009815157116451016,
50
+ "loss": 0.2573,
51
  "step": 150
52
  },
53
  {
54
  "epoch": 0.32,
55
+ "grad_norm": 1.120778203010559,
56
+ "learning_rate": 0.0009768946395563771,
57
+ "loss": 0.1784,
58
  "step": 175
59
  },
60
  {
61
  "epoch": 0.37,
62
+ "grad_norm": 0.39630964398384094,
63
+ "learning_rate": 0.0009722735674676525,
64
+ "loss": 0.1007,
65
  "step": 200
66
  },
67
  {
68
  "epoch": 0.41,
69
+ "grad_norm": 0.8115565776824951,
70
+ "learning_rate": 0.0009676524953789279,
71
+ "loss": 0.1009,
72
  "step": 225
73
  },
74
  {
75
  "epoch": 0.46,
76
+ "grad_norm": 0.4256753921508789,
77
+ "learning_rate": 0.0009630314232902033,
78
+ "loss": 0.0613,
79
  "step": 250
80
  },
81
  {
82
  "epoch": 0.5,
83
+ "grad_norm": 1.3296879529953003,
84
+ "learning_rate": 0.0009584103512014787,
85
+ "loss": 0.0656,
86
  "step": 275
87
  },
88
  {
89
  "epoch": 0.55,
90
+ "grad_norm": 0.2332513928413391,
91
+ "learning_rate": 0.0009537892791127542,
92
+ "loss": 0.1056,
93
  "step": 300
94
  },
95
  {
96
  "epoch": 0.6,
97
+ "grad_norm": 0.361102819442749,
98
+ "learning_rate": 0.0009491682070240297,
99
+ "loss": 0.0714,
100
  "step": 325
101
  },
102
  {
103
  "epoch": 0.64,
104
+ "grad_norm": 0.3176443576812744,
105
+ "learning_rate": 0.000944547134935305,
106
+ "loss": 0.051,
107
  "step": 350
108
  },
109
  {
110
  "epoch": 0.69,
111
+ "grad_norm": 0.5561681389808655,
112
+ "learning_rate": 0.0009399260628465805,
113
+ "loss": 0.0987,
114
  "step": 375
115
  },
116
  {
117
  "epoch": 0.73,
118
+ "grad_norm": 0.17937970161437988,
119
+ "learning_rate": 0.0009353049907578558,
120
+ "loss": 0.0705,
121
  "step": 400
122
  },
123
  {
124
  "epoch": 0.78,
125
+ "grad_norm": 0.11134567111730576,
126
+ "learning_rate": 0.0009306839186691313,
127
+ "loss": 0.0449,
128
  "step": 425
129
  },
130
  {
131
  "epoch": 0.82,
132
+ "grad_norm": 0.1100844293832779,
133
+ "learning_rate": 0.0009260628465804066,
134
+ "loss": 0.0605,
135
  "step": 450
136
  },
137
  {
138
  "epoch": 0.87,
139
+ "grad_norm": 0.5044831037521362,
140
+ "learning_rate": 0.0009214417744916821,
141
+ "loss": 0.0682,
142
  "step": 475
143
  },
144
  {
145
  "epoch": 0.92,
146
+ "grad_norm": 0.1506507396697998,
147
+ "learning_rate": 0.0009168207024029575,
148
+ "loss": 0.0382,
149
  "step": 500
150
  },
151
  {
152
  "epoch": 0.96,
153
+ "grad_norm": 0.358964741230011,
154
+ "learning_rate": 0.0009121996303142329,
155
+ "loss": 0.0416,
156
  "step": 525
157
  },
158
  {
159
  "epoch": 1.0,
160
+ "eval_loss": 0.22685140371322632,
161
+ "eval_runtime": 467.2357,
162
+ "eval_samples_per_second": 1.74,
163
+ "eval_steps_per_second": 0.291,
164
  "step": 546
165
  },
166
  {
167
  "epoch": 1.01,
168
+ "grad_norm": 0.21843843162059784,
169
+ "learning_rate": 0.0009075785582255084,
170
+ "loss": 0.0283,
171
  "step": 550
172
  },
173
  {
174
  "epoch": 1.05,
175
+ "grad_norm": 0.3907661437988281,
176
+ "learning_rate": 0.0009029574861367837,
177
+ "loss": 0.0322,
178
  "step": 575
179
  },
180
  {
181
  "epoch": 1.1,
182
+ "grad_norm": 0.026330502703785896,
183
+ "learning_rate": 0.0008983364140480592,
184
+ "loss": 0.021,
185
  "step": 600
186
  },
187
  {
188
  "epoch": 1.14,
189
+ "grad_norm": 0.015783503651618958,
190
+ "learning_rate": 0.0008937153419593346,
191
+ "loss": 0.0237,
192
  "step": 625
193
  },
194
  {
195
  "epoch": 1.19,
196
+ "grad_norm": 0.07014349848031998,
197
+ "learning_rate": 0.00088909426987061,
198
+ "loss": 0.0307,
199
  "step": 650
200
  },
201
  {
202
  "epoch": 1.24,
203
+ "grad_norm": 0.05321989953517914,
204
+ "learning_rate": 0.0008844731977818854,
205
+ "loss": 0.0217,
206
  "step": 675
207
  },
208
  {
209
  "epoch": 1.28,
210
+ "grad_norm": 0.07509706914424896,
211
+ "learning_rate": 0.0008798521256931608,
212
+ "loss": 0.0319,
213
  "step": 700
214
  },
215
  {
216
  "epoch": 1.33,
217
+ "grad_norm": 0.239598348736763,
218
+ "learning_rate": 0.0008752310536044362,
219
+ "loss": 0.0373,
220
  "step": 725
221
  },
222
  {
223
  "epoch": 1.37,
224
+ "grad_norm": 0.1276847869157791,
225
+ "learning_rate": 0.0008706099815157116,
226
+ "loss": 0.0308,
227
  "step": 750
228
  },
229
  {
230
  "epoch": 1.42,
231
+ "grad_norm": 0.17856040596961975,
232
+ "learning_rate": 0.000865988909426987,
233
+ "loss": 0.0326,
234
  "step": 775
235
  },
236
  {
237
  "epoch": 1.47,
238
+ "grad_norm": 0.5299984216690063,
239
+ "learning_rate": 0.0008613678373382626,
240
+ "loss": 0.034,
241
  "step": 800
242
  },
243
  {
244
  "epoch": 1.51,
245
+ "grad_norm": 0.18570055067539215,
246
+ "learning_rate": 0.0008567467652495379,
247
+ "loss": 0.0255,
248
  "step": 825
249
  },
250
  {
251
  "epoch": 1.56,
252
+ "grad_norm": 0.012400169856846333,
253
+ "learning_rate": 0.0008521256931608134,
254
+ "loss": 0.0133,
255
  "step": 850
256
  },
257
  {
258
  "epoch": 1.6,
259
+ "grad_norm": 0.18435439467430115,
260
+ "learning_rate": 0.0008475046210720887,
261
+ "loss": 0.0392,
262
  "step": 875
263
  },
264
  {
265
  "epoch": 1.65,
266
+ "grad_norm": 0.20227985084056854,
267
+ "learning_rate": 0.0008428835489833642,
268
+ "loss": 0.0339,
269
  "step": 900
270
  },
271
  {
272
  "epoch": 1.69,
273
+ "grad_norm": 0.09818145632743835,
274
+ "learning_rate": 0.0008382624768946395,
275
+ "loss": 0.039,
276
  "step": 925
277
  },
278
  {
279
  "epoch": 1.74,
280
+ "grad_norm": 0.17143449187278748,
281
+ "learning_rate": 0.000833641404805915,
282
+ "loss": 0.0256,
283
  "step": 950
284
  },
285
  {
286
  "epoch": 1.79,
287
+ "grad_norm": 0.3052353858947754,
288
+ "learning_rate": 0.0008290203327171904,
289
+ "loss": 0.0279,
290
  "step": 975
291
  },
292
  {
293
  "epoch": 1.83,
294
+ "grad_norm": 0.09069275110960007,
295
+ "learning_rate": 0.0008243992606284658,
296
+ "loss": 0.0253,
297
  "step": 1000
298
  },
299
  {
300
  "epoch": 1.88,
301
+ "grad_norm": 0.7721070647239685,
302
+ "learning_rate": 0.0008197781885397413,
303
+ "loss": 0.0222,
304
  "step": 1025
305
  },
306
  {
307
  "epoch": 1.92,
308
+ "grad_norm": 0.2378959357738495,
309
+ "learning_rate": 0.0008151571164510166,
310
+ "loss": 0.0386,
311
  "step": 1050
312
  },
313
  {
314
  "epoch": 1.97,
315
+ "grad_norm": 0.004904525820165873,
316
+ "learning_rate": 0.0008105360443622921,
317
+ "loss": 0.0243,
318
  "step": 1075
319
  },
320
  {
321
  "epoch": 2.0,
322
+ "eval_loss": 0.20544852316379547,
323
+ "eval_runtime": 463.5714,
324
+ "eval_samples_per_second": 1.754,
325
+ "eval_steps_per_second": 0.293,
326
  "step": 1092
327
  },
328
  {
329
  "epoch": 2.01,
330
+ "grad_norm": 0.13851934671401978,
331
+ "learning_rate": 0.0008059149722735675,
332
+ "loss": 0.0267,
333
  "step": 1100
334
  },
335
  {
336
  "epoch": 2.06,
337
+ "grad_norm": 0.01875193975865841,
338
+ "learning_rate": 0.0008012939001848429,
339
+ "loss": 0.0117,
340
  "step": 1125
341
  },
342
  {
343
  "epoch": 2.11,
344
+ "grad_norm": 0.0016854548593983054,
345
+ "learning_rate": 0.0007966728280961183,
346
+ "loss": 0.0317,
347
  "step": 1150
348
  },
349
  {
350
  "epoch": 2.15,
351
+ "grad_norm": 0.10259977728128433,
352
+ "learning_rate": 0.0007920517560073937,
353
+ "loss": 0.0253,
354
  "step": 1175
355
  },
356
  {
357
  "epoch": 2.2,
358
+ "grad_norm": 0.054936520755290985,
359
+ "learning_rate": 0.0007874306839186691,
360
+ "loss": 0.0228,
361
  "step": 1200
362
  },
363
  {
364
  "epoch": 2.24,
365
+ "grad_norm": 0.08183781057596207,
366
+ "learning_rate": 0.0007828096118299445,
367
+ "loss": 0.0163,
368
  "step": 1225
369
  },
370
  {
371
  "epoch": 2.29,
372
+ "grad_norm": 0.03448671102523804,
373
+ "learning_rate": 0.0007781885397412199,
374
+ "loss": 0.0252,
375
  "step": 1250
376
  },
377
  {
378
  "epoch": 2.34,
379
+ "grad_norm": 0.026254719123244286,
380
+ "learning_rate": 0.0007735674676524955,
381
+ "loss": 0.0263,
382
  "step": 1275
383
  },
384
  {
385
  "epoch": 2.38,
386
+ "grad_norm": 0.07833431661128998,
387
+ "learning_rate": 0.0007689463955637708,
388
+ "loss": 0.0268,
389
  "step": 1300
390
  },
391
  {
392
  "epoch": 2.43,
393
+ "grad_norm": 0.3215916156768799,
394
+ "learning_rate": 0.0007643253234750463,
395
+ "loss": 0.0184,
396
  "step": 1325
397
  },
398
  {
399
  "epoch": 2.47,
400
+ "grad_norm": 0.1949949562549591,
401
+ "learning_rate": 0.0007597042513863216,
402
+ "loss": 0.0234,
403
  "step": 1350
404
  },
405
  {
406
  "epoch": 2.52,
407
+ "grad_norm": 0.10761301964521408,
408
+ "learning_rate": 0.0007550831792975971,
409
+ "loss": 0.0214,
410
  "step": 1375
411
  },
412
  {
413
  "epoch": 2.56,
414
+ "grad_norm": 0.23488566279411316,
415
+ "learning_rate": 0.0007504621072088724,
416
+ "loss": 0.0321,
417
  "step": 1400
418
  },
419
  {
420
  "epoch": 2.61,
421
+ "grad_norm": 0.16079489886760712,
422
+ "learning_rate": 0.0007458410351201479,
423
+ "loss": 0.041,
424
  "step": 1425
425
  },
426
  {
427
  "epoch": 2.66,
428
+ "grad_norm": 0.3500367999076843,
429
+ "learning_rate": 0.0007412199630314234,
430
+ "loss": 0.0218,
431
  "step": 1450
432
  },
433
  {
434
  "epoch": 2.7,
435
+ "grad_norm": 0.01966880075633526,
436
+ "learning_rate": 0.0007365988909426987,
437
+ "loss": 0.0185,
438
  "step": 1475
439
  },
440
  {
441
  "epoch": 2.75,
442
+ "grad_norm": 0.10084854066371918,
443
+ "learning_rate": 0.0007319778188539742,
444
+ "loss": 0.0158,
445
  "step": 1500
446
  },
447
  {
448
  "epoch": 2.79,
449
+ "grad_norm": 0.045843809843063354,
450
+ "learning_rate": 0.0007273567467652495,
451
+ "loss": 0.0193,
452
  "step": 1525
453
  },
454
  {
455
  "epoch": 2.84,
456
+ "grad_norm": 0.19230197370052338,
457
+ "learning_rate": 0.000722735674676525,
458
+ "loss": 0.0115,
459
  "step": 1550
460
  },
461
  {
462
  "epoch": 2.88,
463
+ "grad_norm": 0.10168833285570145,
464
+ "learning_rate": 0.0007181146025878004,
465
+ "loss": 0.0173,
466
  "step": 1575
467
  },
468
  {
469
  "epoch": 2.93,
470
+ "grad_norm": 0.24770613014698029,
471
+ "learning_rate": 0.0007134935304990758,
472
+ "loss": 0.019,
473
  "step": 1600
474
  },
475
  {
476
  "epoch": 2.98,
477
+ "grad_norm": 0.04277370125055313,
478
+ "learning_rate": 0.0007088724584103512,
479
+ "loss": 0.0262,
480
  "step": 1625
481
  },
482
  {
483
  "epoch": 3.0,
484
+ "eval_loss": 0.18655328452587128,
485
+ "eval_runtime": 463.8687,
486
+ "eval_samples_per_second": 1.753,
487
+ "eval_steps_per_second": 0.293,
488
  "step": 1638
489
  },
490
  {
491
  "epoch": 3.02,
492
+ "grad_norm": 0.03143952414393425,
493
+ "learning_rate": 0.0007042513863216266,
494
+ "loss": 0.0185,
495
  "step": 1650
496
  },
497
  {
498
  "epoch": 3.07,
499
+ "grad_norm": 0.061480745673179626,
500
+ "learning_rate": 0.000699630314232902,
501
+ "loss": 0.0158,
502
  "step": 1675
503
  },
504
  {
505
  "epoch": 3.11,
506
+ "grad_norm": 0.05645143985748291,
507
+ "learning_rate": 0.0006950092421441774,
508
+ "loss": 0.0163,
509
  "step": 1700
510
  },
511
  {
512
  "epoch": 3.16,
513
+ "grad_norm": 0.3927539885044098,
514
+ "learning_rate": 0.0006903881700554528,
515
+ "loss": 0.0257,
516
  "step": 1725
517
  },
518
  {
519
  "epoch": 3.21,
520
+ "grad_norm": 0.1579461544752121,
521
+ "learning_rate": 0.0006857670979667284,
522
+ "loss": 0.0203,
523
  "step": 1750
524
  },
525
  {
526
  "epoch": 3.25,
527
+ "grad_norm": 0.003284105099737644,
528
+ "learning_rate": 0.0006811460258780037,
529
+ "loss": 0.012,
530
  "step": 1775
531
  },
532
  {
533
  "epoch": 3.3,
534
+ "grad_norm": 0.0939943715929985,
535
+ "learning_rate": 0.0006765249537892792,
536
+ "loss": 0.0139,
537
  "step": 1800
538
  },
539
  {
540
  "epoch": 3.34,
541
+ "grad_norm": 0.08114974200725555,
542
+ "learning_rate": 0.0006719038817005545,
543
+ "loss": 0.0134,
544
  "step": 1825
545
  },
546
  {
547
  "epoch": 3.39,
548
+ "grad_norm": 0.008277042768895626,
549
+ "learning_rate": 0.00066728280961183,
550
+ "loss": 0.0203,
551
  "step": 1850
552
  },
553
  {
554
  "epoch": 3.43,
555
+ "grad_norm": 0.014137201942503452,
556
+ "learning_rate": 0.0006626617375231053,
557
+ "loss": 0.018,
558
  "step": 1875
559
  },
560
  {
561
  "epoch": 3.48,
562
+ "grad_norm": 0.04209378361701965,
563
+ "learning_rate": 0.0006580406654343808,
564
+ "loss": 0.0107,
565
  "step": 1900
566
  },
567
  {
568
  "epoch": 3.53,
569
+ "grad_norm": 0.015557551756501198,
570
+ "learning_rate": 0.0006534195933456563,
571
+ "loss": 0.0104,
572
  "step": 1925
573
  },
574
  {
575
  "epoch": 3.57,
576
+ "grad_norm": 0.021405475214123726,
577
+ "learning_rate": 0.0006487985212569316,
578
+ "loss": 0.0117,
579
  "step": 1950
580
  },
581
  {
582
  "epoch": 3.62,
583
+ "grad_norm": 0.0015239958884194493,
584
+ "learning_rate": 0.0006441774491682071,
585
+ "loss": 0.0176,
586
  "step": 1975
587
  },
588
  {
589
  "epoch": 3.66,
590
+ "grad_norm": 0.0997876301407814,
591
+ "learning_rate": 0.0006395563770794824,
592
+ "loss": 0.0183,
593
  "step": 2000
594
  },
595
  {
596
  "epoch": 3.71,
597
+ "grad_norm": 0.004715020768344402,
598
+ "learning_rate": 0.0006349353049907579,
599
+ "loss": 0.0199,
600
  "step": 2025
601
  },
602
  {
603
  "epoch": 3.75,
604
+ "grad_norm": 0.1075858548283577,
605
+ "learning_rate": 0.0006303142329020333,
606
+ "loss": 0.0201,
607
  "step": 2050
608
  },
609
  {
610
  "epoch": 3.8,
611
+ "grad_norm": 0.020496558398008347,
612
+ "learning_rate": 0.0006256931608133087,
613
+ "loss": 0.0145,
614
  "step": 2075
615
  },
616
  {
617
  "epoch": 3.85,
618
+ "grad_norm": 0.11063025891780853,
619
+ "learning_rate": 0.0006210720887245841,
620
  "loss": 0.0201,
621
  "step": 2100
622
  },
623
  {
624
  "epoch": 3.89,
625
+ "grad_norm": 0.1012192815542221,
626
+ "learning_rate": 0.0006164510166358595,
627
+ "loss": 0.0223,
628
  "step": 2125
629
  },
630
  {
631
  "epoch": 3.94,
632
+ "grad_norm": 0.04694315418601036,
633
+ "learning_rate": 0.0006118299445471349,
634
+ "loss": 0.0163,
635
  "step": 2150
636
  },
637
  {
638
  "epoch": 3.98,
639
+ "grad_norm": 0.05395512282848358,
640
+ "learning_rate": 0.0006072088724584103,
641
+ "loss": 0.009,
642
  "step": 2175
643
  },
644
  {
645
  "epoch": 4.0,
646
+ "eval_loss": 0.20004291832447052,
647
+ "eval_runtime": 464.3668,
648
+ "eval_samples_per_second": 1.751,
649
+ "eval_steps_per_second": 0.293,
650
  "step": 2184
651
  },
652
  {
653
  "epoch": 4.03,
654
+ "grad_norm": 0.08517912030220032,
655
+ "learning_rate": 0.0006025878003696857,
656
+ "loss": 0.0152,
657
  "step": 2200
658
  },
659
  {
660
  "epoch": 4.08,
661
+ "grad_norm": 0.23693686723709106,
662
+ "learning_rate": 0.0005979667282809613,
663
+ "loss": 0.0123,
664
  "step": 2225
665
  },
666
  {
667
  "epoch": 4.12,
668
+ "grad_norm": 0.04390133172273636,
669
+ "learning_rate": 0.0005933456561922366,
670
+ "loss": 0.0104,
671
  "step": 2250
672
  },
673
  {
674
  "epoch": 4.17,
675
+ "grad_norm": 0.048480305820703506,
676
+ "learning_rate": 0.0005887245841035121,
677
+ "loss": 0.0191,
678
  "step": 2275
679
  },
680
  {
681
  "epoch": 4.21,
682
+ "grad_norm": 0.07334431260824203,
683
+ "learning_rate": 0.0005841035120147874,
684
+ "loss": 0.0079,
685
  "step": 2300
686
  },
687
  {
688
  "epoch": 4.26,
689
+ "grad_norm": 0.26686009764671326,
690
+ "learning_rate": 0.0005794824399260629,
691
+ "loss": 0.0134,
692
  "step": 2325
693
  },
694
  {
695
  "epoch": 4.3,
696
+ "grad_norm": 0.18834412097930908,
697
+ "learning_rate": 0.0005748613678373382,
698
+ "loss": 0.0108,
699
  "step": 2350
700
  },
701
  {
702
  "epoch": 4.35,
703
+ "grad_norm": 0.11365604400634766,
704
+ "learning_rate": 0.0005702402957486137,
705
+ "loss": 0.0116,
706
  "step": 2375
707
  },
708
  {
709
  "epoch": 4.4,
710
+ "grad_norm": 0.21077445149421692,
711
+ "learning_rate": 0.0005656192236598892,
712
+ "loss": 0.017,
713
  "step": 2400
714
  },
715
  {
716
  "epoch": 4.44,
717
+ "grad_norm": 0.14450936019420624,
718
+ "learning_rate": 0.0005609981515711645,
719
+ "loss": 0.0056,
720
  "step": 2425
721
  },
722
  {
723
  "epoch": 4.49,
724
+ "grad_norm": 0.07659462839365005,
725
+ "learning_rate": 0.00055637707948244,
726
+ "loss": 0.0128,
727
  "step": 2450
728
  },
729
  {
730
  "epoch": 4.53,
731
+ "grad_norm": 0.07819797843694687,
732
+ "learning_rate": 0.0005517560073937153,
733
+ "loss": 0.0085,
734
  "step": 2475
735
  },
736
  {
737
  "epoch": 4.58,
738
+ "grad_norm": 0.10529200732707977,
739
+ "learning_rate": 0.0005471349353049908,
740
  "loss": 0.0156,
741
  "step": 2500
742
  },
743
  {
744
  "epoch": 4.62,
745
+ "grad_norm": 0.034541305154561996,
746
+ "learning_rate": 0.0005425138632162662,
747
+ "loss": 0.0114,
748
  "step": 2525
749
  },
750
  {
751
  "epoch": 4.67,
752
+ "grad_norm": 0.0043388293124735355,
753
+ "learning_rate": 0.0005378927911275416,
754
+ "loss": 0.0114,
755
  "step": 2550
756
  },
757
  {
758
  "epoch": 4.72,
759
+ "grad_norm": 0.09843795001506805,
760
+ "learning_rate": 0.000533271719038817,
761
+ "loss": 0.0097,
762
  "step": 2575
763
  },
764
  {
765
  "epoch": 4.76,
766
+ "grad_norm": 0.1924191564321518,
767
+ "learning_rate": 0.0005286506469500924,
768
+ "loss": 0.0138,
769
  "step": 2600
770
  },
771
  {
772
  "epoch": 4.81,
773
+ "grad_norm": 0.0032940045930445194,
774
+ "learning_rate": 0.0005240295748613678,
775
+ "loss": 0.009,
776
  "step": 2625
777
  },
778
  {
779
  "epoch": 4.85,
780
+ "grad_norm": 0.17411276698112488,
781
+ "learning_rate": 0.0005194085027726432,
782
+ "loss": 0.005,
783
  "step": 2650
784
  },
785
  {
786
  "epoch": 4.9,
787
+ "grad_norm": 0.0008068850729614496,
788
+ "learning_rate": 0.0005147874306839186,
789
+ "loss": 0.0091,
790
  "step": 2675
791
  },
792
  {
793
  "epoch": 4.95,
794
+ "grad_norm": 0.013785873539745808,
795
+ "learning_rate": 0.0005101663585951941,
796
+ "loss": 0.0174,
797
  "step": 2700
798
  },
799
  {
800
  "epoch": 4.99,
801
+ "grad_norm": 0.06957102566957474,
802
+ "learning_rate": 0.0005055452865064695,
803
+ "loss": 0.0196,
804
  "step": 2725
805
  },
806
  {
807
  "epoch": 5.0,
808
+ "eval_loss": 0.1927657425403595,
809
+ "eval_runtime": 464.5332,
810
+ "eval_samples_per_second": 1.75,
811
+ "eval_steps_per_second": 0.293,
812
  "step": 2730
813
  },
814
  {
815
  "epoch": 5.04,
816
+ "grad_norm": 0.1873362511396408,
817
+ "learning_rate": 0.000500924214417745,
818
+ "loss": 0.0114,
819
  "step": 2750
820
  },
821
  {
822
  "epoch": 5.08,
823
+ "grad_norm": 0.013944294303655624,
824
+ "learning_rate": 0.0004963031423290203,
825
+ "loss": 0.0047,
826
  "step": 2775
827
  },
828
  {
829
  "epoch": 5.13,
830
+ "grad_norm": 0.14739681780338287,
831
+ "learning_rate": 0.0004916820702402958,
832
+ "loss": 0.0064,
833
  "step": 2800
834
  },
835
  {
836
  "epoch": 5.17,
837
+ "grad_norm": 0.039295587688684464,
838
+ "learning_rate": 0.00048706099815157115,
839
+ "loss": 0.0061,
840
  "step": 2825
841
  },
842
  {
843
  "epoch": 5.22,
844
+ "grad_norm": 0.009731476195156574,
845
+ "learning_rate": 0.0004824399260628466,
846
+ "loss": 0.0064,
847
  "step": 2850
848
  },
849
  {
850
  "epoch": 5.27,
851
+ "grad_norm": 0.009130421094596386,
852
+ "learning_rate": 0.000477818853974122,
853
+ "loss": 0.0056,
854
  "step": 2875
855
  },
856
  {
857
  "epoch": 5.31,
858
+ "grad_norm": 0.10517439246177673,
859
+ "learning_rate": 0.0004731977818853974,
860
+ "loss": 0.0095,
861
  "step": 2900
862
  },
863
  {
864
  "epoch": 5.36,
865
+ "grad_norm": 0.03147244080901146,
866
+ "learning_rate": 0.00046857670979667283,
867
+ "loss": 0.0069,
868
  "step": 2925
869
  },
870
  {
871
  "epoch": 5.4,
872
+ "grad_norm": 0.07550155371427536,
873
+ "learning_rate": 0.00046395563770794824,
874
+ "loss": 0.0084,
875
  "step": 2950
876
  },
877
  {
878
  "epoch": 5.45,
879
+ "grad_norm": 0.09899873286485672,
880
+ "learning_rate": 0.00045933456561922365,
881
+ "loss": 0.0087,
882
  "step": 2975
883
  },
884
  {
885
  "epoch": 5.49,
886
+ "grad_norm": 0.062454238533973694,
887
+ "learning_rate": 0.00045471349353049906,
888
+ "loss": 0.0114,
889
  "step": 3000
890
  },
891
  {
892
  "epoch": 5.54,
893
+ "grad_norm": 0.14996998012065887,
894
+ "learning_rate": 0.00045009242144177446,
895
+ "loss": 0.0091,
896
  "step": 3025
897
  },
898
  {
899
  "epoch": 5.59,
900
+ "grad_norm": 0.19108814001083374,
901
+ "learning_rate": 0.00044547134935304987,
902
+ "loss": 0.0147,
903
  "step": 3050
904
  },
905
  {
906
  "epoch": 5.63,
907
+ "grad_norm": 0.14450325071811676,
908
+ "learning_rate": 0.00044085027726432533,
909
+ "loss": 0.0152,
910
  "step": 3075
911
  },
912
  {
913
  "epoch": 5.68,
914
+ "grad_norm": 0.04423892870545387,
915
+ "learning_rate": 0.0004362292051756008,
916
+ "loss": 0.006,
917
  "step": 3100
918
  },
919
  {
920
  "epoch": 5.72,
921
+ "grad_norm": 0.13844439387321472,
922
+ "learning_rate": 0.0004316081330868762,
923
+ "loss": 0.009,
924
  "step": 3125
925
  },
926
  {
927
  "epoch": 5.77,
928
+ "grad_norm": 0.0006735218339599669,
929
+ "learning_rate": 0.0004269870609981516,
930
+ "loss": 0.0058,
931
  "step": 3150
932
  },
933
  {
934
  "epoch": 5.82,
935
+ "grad_norm": 0.011760660447180271,
936
+ "learning_rate": 0.000422365988909427,
937
+ "loss": 0.0049,
938
  "step": 3175
939
  },
940
  {
941
  "epoch": 5.86,
942
+ "grad_norm": 0.08969856053590775,
943
+ "learning_rate": 0.0004177449168207024,
944
+ "loss": 0.0065,
945
  "step": 3200
946
  },
947
  {
948
  "epoch": 5.91,
949
+ "grad_norm": 0.12556907534599304,
950
+ "learning_rate": 0.00041312384473197783,
951
+ "loss": 0.0089,
952
  "step": 3225
953
  },
954
  {
955
  "epoch": 5.95,
956
+ "grad_norm": 0.017725255340337753,
957
+ "learning_rate": 0.00040850277264325324,
958
+ "loss": 0.0088,
959
  "step": 3250
960
  },
961
  {
962
  "epoch": 6.0,
963
+ "grad_norm": 0.009897828102111816,
964
+ "learning_rate": 0.00040388170055452864,
965
+ "loss": 0.0071,
966
  "step": 3275
967
  },
968
  {
969
  "epoch": 6.0,
970
+ "eval_loss": 0.20994354784488678,
971
+ "eval_runtime": 463.0078,
972
+ "eval_samples_per_second": 1.756,
973
+ "eval_steps_per_second": 0.294,
974
  "step": 3276
975
  },
976
  {
977
  "epoch": 6.04,
978
+ "grad_norm": 0.0028004159685224295,
979
+ "learning_rate": 0.00039926062846580405,
980
+ "loss": 0.0093,
981
  "step": 3300
982
  },
983
  {
984
  "epoch": 6.09,
985
+ "grad_norm": 0.10490375012159348,
986
+ "learning_rate": 0.0003946395563770795,
987
+ "loss": 0.0053,
988
  "step": 3325
989
  },
990
  {
991
  "epoch": 6.14,
992
+ "grad_norm": 0.019779745489358902,
993
+ "learning_rate": 0.0003900184842883549,
994
+ "loss": 0.0084,
995
  "step": 3350
996
  },
997
  {
998
  "epoch": 6.18,
999
+ "grad_norm": 0.00020589173072949052,
1000
+ "learning_rate": 0.00038539741219963033,
1001
+ "loss": 0.0029,
1002
  "step": 3375
1003
  },
1004
  {
1005
  "epoch": 6.23,
1006
+ "grad_norm": 0.003221085062250495,
1007
+ "learning_rate": 0.00038077634011090574,
1008
+ "loss": 0.0051,
1009
  "step": 3400
1010
  },
1011
  {
1012
  "epoch": 6.27,
1013
+ "grad_norm": 0.00455264188349247,
1014
+ "learning_rate": 0.00037615526802218114,
1015
+ "loss": 0.0063,
1016
  "step": 3425
1017
  },
1018
  {
1019
  "epoch": 6.32,
1020
+ "grad_norm": 0.00967650581151247,
1021
+ "learning_rate": 0.00037153419593345655,
1022
+ "loss": 0.0035,
1023
  "step": 3450
1024
  },
1025
  {
1026
  "epoch": 6.36,
1027
+ "grad_norm": 0.009352604858577251,
1028
+ "learning_rate": 0.00036691312384473196,
1029
+ "loss": 0.0065,
1030
  "step": 3475
1031
  },
1032
  {
1033
  "epoch": 6.41,
1034
+ "grad_norm": 0.002876508515328169,
1035
+ "learning_rate": 0.00036229205175600736,
1036
+ "loss": 0.0049,
1037
  "step": 3500
1038
  },
1039
  {
1040
  "epoch": 6.46,
1041
+ "grad_norm": 0.002914861775934696,
1042
+ "learning_rate": 0.00035767097966728277,
1043
+ "loss": 0.0043,
1044
  "step": 3525
1045
  },
1046
  {
1047
  "epoch": 6.5,
1048
+ "grad_norm": 0.021481545642018318,
1049
+ "learning_rate": 0.0003530499075785583,
1050
+ "loss": 0.0072,
1051
  "step": 3550
1052
  },
1053
  {
1054
  "epoch": 6.55,
1055
+ "grad_norm": 0.08110266923904419,
1056
+ "learning_rate": 0.0003484288354898337,
1057
+ "loss": 0.0044,
1058
  "step": 3575
1059
  },
1060
  {
1061
  "epoch": 6.59,
1062
+ "grad_norm": 0.020943278446793556,
1063
+ "learning_rate": 0.0003438077634011091,
1064
+ "loss": 0.0072,
1065
  "step": 3600
1066
  },
1067
  {
1068
  "epoch": 6.64,
1069
+ "grad_norm": 0.005692564882338047,
1070
+ "learning_rate": 0.0003391866913123845,
1071
+ "loss": 0.0078,
1072
  "step": 3625
1073
  },
1074
  {
1075
  "epoch": 6.68,
1076
+ "grad_norm": 0.11609622091054916,
1077
+ "learning_rate": 0.0003345656192236599,
1078
+ "loss": 0.008,
1079
  "step": 3650
1080
  },
1081
  {
1082
  "epoch": 6.73,
1083
+ "grad_norm": 0.05904560536146164,
1084
+ "learning_rate": 0.0003299445471349353,
1085
+ "loss": 0.0061,
1086
  "step": 3675
1087
  },
1088
  {
1089
  "epoch": 6.78,
1090
+ "grad_norm": 0.03346557542681694,
1091
+ "learning_rate": 0.00032532347504621073,
1092
+ "loss": 0.0069,
1093
  "step": 3700
1094
  },
1095
  {
1096
  "epoch": 6.82,
1097
+ "grad_norm": 0.04848520830273628,
1098
+ "learning_rate": 0.00032070240295748614,
1099
+ "loss": 0.0078,
1100
  "step": 3725
1101
  },
1102
  {
1103
  "epoch": 6.87,
1104
+ "grad_norm": 0.11064545810222626,
1105
+ "learning_rate": 0.00031608133086876155,
1106
+ "loss": 0.0083,
1107
  "step": 3750
1108
  },
1109
  {
1110
  "epoch": 6.91,
1111
+ "grad_norm": 0.001821186626330018,
1112
+ "learning_rate": 0.00031146025878003695,
1113
+ "loss": 0.0084,
1114
  "step": 3775
1115
  },
1116
  {
1117
  "epoch": 6.96,
1118
+ "grad_norm": 0.03919747844338417,
1119
+ "learning_rate": 0.0003068391866913124,
1120
+ "loss": 0.0054,
1121
  "step": 3800
1122
  },
1123
  {
1124
  "epoch": 7.0,
1125
+ "eval_loss": 0.20703129470348358,
1126
+ "eval_runtime": 463.5868,
1127
+ "eval_samples_per_second": 1.754,
1128
+ "eval_steps_per_second": 0.293,
1129
  "step": 3822
1130
  },
1131
  {
1132
  "epoch": 7.01,
1133
+ "grad_norm": 0.008144889958202839,
1134
+ "learning_rate": 0.0003022181146025878,
1135
+ "loss": 0.0057,
1136
  "step": 3825
1137
  },
1138
  {
1139
  "epoch": 7.05,
1140
+ "grad_norm": 0.005378293804824352,
1141
+ "learning_rate": 0.00029759704251386323,
1142
+ "loss": 0.004,
1143
  "step": 3850
1144
  },
1145
  {
1146
  "epoch": 7.1,
1147
+ "grad_norm": 0.03501349315047264,
1148
+ "learning_rate": 0.00029297597042513864,
1149
+ "loss": 0.003,
1150
  "step": 3875
1151
  },
1152
  {
1153
  "epoch": 7.14,
1154
+ "grad_norm": 0.07073014974594116,
1155
+ "learning_rate": 0.00028835489833641404,
1156
+ "loss": 0.0029,
1157
  "step": 3900
1158
  },
1159
  {
1160
  "epoch": 7.19,
1161
+ "grad_norm": 0.09017951786518097,
1162
+ "learning_rate": 0.00028373382624768945,
1163
+ "loss": 0.0027,
1164
  "step": 3925
1165
  },
1166
  {
1167
  "epoch": 7.23,
1168
+ "grad_norm": 0.009881277568638325,
1169
+ "learning_rate": 0.00027911275415896486,
1170
+ "loss": 0.0044,
1171
  "step": 3950
1172
  },
1173
  {
1174
  "epoch": 7.28,
1175
+ "grad_norm": 0.0018990118987858295,
1176
+ "learning_rate": 0.00027449168207024027,
1177
+ "loss": 0.0031,
1178
  "step": 3975
1179
  },
1180
  {
1181
  "epoch": 7.33,
1182
+ "grad_norm": 0.004116680007427931,
1183
+ "learning_rate": 0.00026987060998151567,
1184
+ "loss": 0.0026,
1185
  "step": 4000
1186
  },
1187
  {
1188
  "epoch": 7.37,
1189
+ "grad_norm": 0.03917807340621948,
1190
+ "learning_rate": 0.00026524953789279113,
1191
+ "loss": 0.0038,
1192
  "step": 4025
1193
  },
1194
  {
1195
  "epoch": 7.42,
1196
+ "grad_norm": 0.0030583201441913843,
1197
+ "learning_rate": 0.0002606284658040666,
1198
+ "loss": 0.0032,
1199
  "step": 4050
1200
  },
1201
  {
1202
  "epoch": 7.46,
1203
+ "grad_norm": 0.0014874553307890892,
1204
+ "learning_rate": 0.000256007393715342,
1205
+ "loss": 0.0054,
1206
  "step": 4075
1207
  },
1208
  {
1209
  "epoch": 7.51,
1210
+ "grad_norm": 0.0008628646028228104,
1211
+ "learning_rate": 0.0002513863216266174,
1212
+ "loss": 0.0019,
1213
  "step": 4100
1214
  },
1215
  {
1216
  "epoch": 7.55,
1217
+ "grad_norm": 0.02715575322508812,
1218
+ "learning_rate": 0.00024676524953789276,
1219
+ "loss": 0.0037,
1220
  "step": 4125
1221
  },
1222
  {
1223
  "epoch": 7.6,
1224
+ "grad_norm": 0.0031906655058264732,
1225
+ "learning_rate": 0.00024214417744916822,
1226
+ "loss": 0.0058,
1227
  "step": 4150
1228
  },
1229
  {
1230
  "epoch": 7.65,
1231
+ "grad_norm": 0.011863148771226406,
1232
+ "learning_rate": 0.00023752310536044363,
1233
+ "loss": 0.0022,
1234
  "step": 4175
1235
  },
1236
  {
1237
  "epoch": 7.69,
1238
+ "grad_norm": 0.0015202141366899014,
1239
+ "learning_rate": 0.00023290203327171904,
1240
+ "loss": 0.0045,
1241
  "step": 4200
1242
  },
1243
  {
1244
  "epoch": 7.74,
1245
+ "grad_norm": 0.02240474335849285,
1246
+ "learning_rate": 0.00022828096118299447,
1247
+ "loss": 0.0039,
1248
  "step": 4225
1249
  },
1250
  {
1251
  "epoch": 7.78,
1252
+ "grad_norm": 0.00918908603489399,
1253
+ "learning_rate": 0.00022365988909426988,
1254
+ "loss": 0.0014,
1255
  "step": 4250
1256
  },
1257
  {
1258
  "epoch": 7.83,
1259
+ "grad_norm": 0.005950120277702808,
1260
+ "learning_rate": 0.0002190388170055453,
1261
+ "loss": 0.0031,
1262
  "step": 4275
1263
  },
1264
  {
1265
  "epoch": 7.88,
1266
+ "grad_norm": 0.07433830946683884,
1267
+ "learning_rate": 0.0002144177449168207,
1268
+ "loss": 0.002,
1269
  "step": 4300
1270
  },
1271
  {
1272
  "epoch": 7.92,
1273
+ "grad_norm": 0.09878811240196228,
1274
+ "learning_rate": 0.0002097966728280961,
1275
+ "loss": 0.0039,
1276
  "step": 4325
1277
  },
1278
  {
1279
  "epoch": 7.97,
1280
+ "grad_norm": 0.004627088084816933,
1281
+ "learning_rate": 0.00020517560073937154,
1282
+ "loss": 0.0066,
1283
  "step": 4350
1284
  },
1285
  {
1286
  "epoch": 8.0,
1287
+ "eval_loss": 0.21887589991092682,
1288
+ "eval_runtime": 464.2458,
1289
+ "eval_samples_per_second": 1.751,
1290
+ "eval_steps_per_second": 0.293,
1291
  "step": 4368
1292
  },
1293
  {
1294
  "epoch": 8.01,
1295
+ "grad_norm": 0.016953645274043083,
1296
+ "learning_rate": 0.00020055452865064697,
1297
+ "loss": 0.002,
1298
  "step": 4375
1299
  },
1300
  {
1301
  "epoch": 8.06,
1302
+ "grad_norm": 0.00016232863708864897,
1303
+ "learning_rate": 0.00019593345656192238,
1304
+ "loss": 0.0022,
1305
  "step": 4400
1306
  },
1307
  {
1308
  "epoch": 8.1,
1309
+ "grad_norm": 0.00045125139877200127,
1310
+ "learning_rate": 0.00019131238447319779,
1311
+ "loss": 0.0016,
1312
  "step": 4425
1313
  },
1314
  {
1315
  "epoch": 8.15,
1316
+ "grad_norm": 0.02065761759877205,
1317
+ "learning_rate": 0.0001866913123844732,
1318
+ "loss": 0.0017,
1319
  "step": 4450
1320
  },
1321
  {
1322
  "epoch": 8.2,
1323
+ "grad_norm": 0.042185261845588684,
1324
+ "learning_rate": 0.00018207024029574863,
1325
+ "loss": 0.0027,
1326
  "step": 4475
1327
  },
1328
  {
1329
  "epoch": 8.24,
1330
+ "grad_norm": 0.003087196499109268,
1331
+ "learning_rate": 0.00017744916820702404,
1332
+ "loss": 0.0018,
1333
  "step": 4500
1334
  },
1335
  {
1336
  "epoch": 8.29,
1337
+ "grad_norm": 0.02859407104551792,
1338
+ "learning_rate": 0.00017282809611829944,
1339
+ "loss": 0.0015,
1340
  "step": 4525
1341
  },
1342
  {
1343
  "epoch": 8.33,
1344
+ "grad_norm": 0.00041793755372054875,
1345
+ "learning_rate": 0.00016820702402957485,
1346
+ "loss": 0.0035,
1347
  "step": 4550
1348
  },
1349
  {
1350
  "epoch": 8.38,
1351
+ "grad_norm": 0.0037734461948275566,
1352
+ "learning_rate": 0.00016358595194085026,
1353
+ "loss": 0.002,
1354
  "step": 4575
1355
  },
1356
  {
1357
  "epoch": 8.42,
1358
+ "grad_norm": 0.0030207443051040173,
1359
+ "learning_rate": 0.00015896487985212572,
1360
+ "loss": 0.0022,
1361
  "step": 4600
1362
  },
1363
  {
1364
  "epoch": 8.47,
1365
+ "grad_norm": 0.0026946039870381355,
1366
+ "learning_rate": 0.00015434380776340113,
1367
+ "loss": 0.0028,
1368
  "step": 4625
1369
  },
1370
  {
1371
  "epoch": 8.52,
1372
+ "grad_norm": 0.041892848908901215,
1373
+ "learning_rate": 0.00014972273567467653,
1374
+ "loss": 0.001,
1375
  "step": 4650
1376
  },
1377
  {
1378
  "epoch": 8.56,
1379
+ "grad_norm": 0.06906843930482864,
1380
+ "learning_rate": 0.00014510166358595194,
1381
+ "loss": 0.0013,
1382
  "step": 4675
1383
  },
1384
  {
1385
  "epoch": 8.61,
1386
+ "grad_norm": 0.00067297019995749,
1387
+ "learning_rate": 0.00014048059149722737,
1388
+ "loss": 0.0029,
1389
  "step": 4700
1390
  },
1391
  {
1392
  "epoch": 8.65,
1393
+ "grad_norm": 0.011746911332011223,
1394
+ "learning_rate": 0.00013585951940850278,
1395
+ "loss": 0.0012,
1396
  "step": 4725
1397
  },
1398
  {
1399
  "epoch": 8.7,
1400
+ "grad_norm": 0.0013995037879794836,
1401
+ "learning_rate": 0.0001312384473197782,
1402
+ "loss": 0.0017,
1403
  "step": 4750
1404
  },
1405
  {
1406
  "epoch": 8.75,
1407
+ "grad_norm": 0.009580204263329506,
1408
+ "learning_rate": 0.0001266173752310536,
1409
+ "loss": 0.001,
1410
  "step": 4775
1411
  },
1412
  {
1413
  "epoch": 8.79,
1414
+ "grad_norm": 0.0008843488758429885,
1415
+ "learning_rate": 0.00012199630314232903,
1416
+ "loss": 0.0015,
1417
  "step": 4800
1418
  },
1419
  {
1420
  "epoch": 8.84,
1421
+ "grad_norm": 0.0013571062590926886,
1422
+ "learning_rate": 0.00011737523105360444,
1423
+ "loss": 0.0024,
1424
  "step": 4825
1425
  },
1426
  {
1427
  "epoch": 8.88,
1428
+ "grad_norm": 0.01475840900093317,
1429
+ "learning_rate": 0.00011275415896487985,
1430
+ "loss": 0.0009,
1431
  "step": 4850
1432
  },
1433
  {
1434
  "epoch": 8.93,
1435
+ "grad_norm": 0.008486463688313961,
1436
+ "learning_rate": 0.00010813308687615527,
1437
+ "loss": 0.0022,
1438
  "step": 4875
1439
  },
1440
  {
1441
  "epoch": 8.97,
1442
+ "grad_norm": 0.0005981879075989127,
1443
+ "learning_rate": 0.00010351201478743069,
1444
+ "loss": 0.0006,
1445
  "step": 4900
1446
  },
1447
  {
1448
  "epoch": 9.0,
1449
+ "eval_loss": 0.23254649341106415,
1450
+ "eval_runtime": 463.7576,
1451
+ "eval_samples_per_second": 1.753,
1452
+ "eval_steps_per_second": 0.293,
1453
  "step": 4914
1454
  },
1455
  {
1456
  "epoch": 9.02,
1457
+ "grad_norm": 0.011784604750573635,
1458
+ "learning_rate": 9.889094269870611e-05,
1459
+ "loss": 0.0028,
1460
  "step": 4925
1461
  },
1462
  {
1463
  "epoch": 9.07,
1464
+ "grad_norm": 0.0005488657625392079,
1465
+ "learning_rate": 9.426987060998152e-05,
1466
+ "loss": 0.0016,
1467
  "step": 4950
1468
  },
1469
  {
1470
  "epoch": 9.11,
1471
+ "grad_norm": 0.0024228901602327824,
1472
+ "learning_rate": 8.964879852125694e-05,
1473
+ "loss": 0.001,
1474
  "step": 4975
1475
  },
1476
  {
1477
  "epoch": 9.16,
1478
+ "grad_norm": 0.0021140037570148706,
1479
+ "learning_rate": 8.502772643253234e-05,
1480
+ "loss": 0.0014,
1481
  "step": 5000
1482
  },
1483
  {
1484
  "epoch": 9.2,
1485
+ "grad_norm": 0.0011844311375170946,
1486
+ "learning_rate": 8.040665434380776e-05,
1487
+ "loss": 0.001,
1488
  "step": 5025
1489
  },
1490
  {
1491
  "epoch": 9.25,
1492
+ "grad_norm": 0.011841055937111378,
1493
+ "learning_rate": 7.578558225508319e-05,
1494
+ "loss": 0.0009,
1495
  "step": 5050
1496
  },
1497
  {
1498
  "epoch": 9.29,
1499
+ "grad_norm": 0.013395372778177261,
1500
+ "learning_rate": 7.116451016635859e-05,
1501
+ "loss": 0.0009,
1502
  "step": 5075
1503
  },
1504
  {
1505
  "epoch": 9.34,
1506
+ "grad_norm": 0.05545121058821678,
1507
+ "learning_rate": 6.654343807763401e-05,
1508
+ "loss": 0.0012,
1509
  "step": 5100
1510
  },
1511
  {
1512
  "epoch": 9.39,
1513
+ "grad_norm": 0.01891588233411312,
1514
+ "learning_rate": 6.192236598890943e-05,
1515
+ "loss": 0.0006,
1516
  "step": 5125
1517
  },
1518
  {
1519
  "epoch": 9.43,
1520
+ "grad_norm": 0.0025335114914923906,
1521
+ "learning_rate": 5.730129390018484e-05,
1522
+ "loss": 0.0006,
1523
  "step": 5150
1524
  },
1525
  {
1526
  "epoch": 9.48,
1527
+ "grad_norm": 0.0021167423110455275,
1528
+ "learning_rate": 5.268022181146026e-05,
1529
+ "loss": 0.0007,
1530
  "step": 5175
1531
  },
1532
  {
1533
  "epoch": 9.52,
1534
+ "grad_norm": 0.0011415353510528803,
1535
+ "learning_rate": 4.8059149722735676e-05,
1536
+ "loss": 0.0014,
1537
  "step": 5200
1538
  },
1539
  {
1540
  "epoch": 9.57,
1541
+ "grad_norm": 0.00026013093884103,
1542
+ "learning_rate": 4.343807763401109e-05,
1543
+ "loss": 0.0007,
1544
  "step": 5225
1545
  },
1546
  {
1547
  "epoch": 9.62,
1548
+ "grad_norm": 0.03879648819565773,
1549
+ "learning_rate": 3.8817005545286504e-05,
1550
+ "loss": 0.0007,
1551
  "step": 5250
1552
  },
1553
  {
1554
  "epoch": 9.66,
1555
+ "grad_norm": 0.006720875855535269,
1556
+ "learning_rate": 3.4195933456561925e-05,
1557
+ "loss": 0.0009,
1558
  "step": 5275
1559
  },
1560
  {
1561
  "epoch": 9.71,
1562
+ "grad_norm": 0.006371485069394112,
1563
+ "learning_rate": 2.957486136783734e-05,
1564
+ "loss": 0.0009,
1565
  "step": 5300
1566
  },
1567
  {
1568
  "epoch": 9.75,
1569
+ "grad_norm": 0.012291524559259415,
1570
+ "learning_rate": 2.4953789279112753e-05,
1571
+ "loss": 0.0012,
1572
  "step": 5325
1573
  },
1574
  {
1575
  "epoch": 9.8,
1576
+ "grad_norm": 0.012388636358082294,
1577
+ "learning_rate": 2.033271719038817e-05,
1578
+ "loss": 0.0006,
1579
  "step": 5350
1580
  },
1581
  {
1582
  "epoch": 9.84,
1583
+ "grad_norm": 0.0905984491109848,
1584
+ "learning_rate": 1.5711645101663588e-05,
1585
+ "loss": 0.0011,
1586
  "step": 5375
1587
  },
1588
  {
1589
  "epoch": 9.89,
1590
+ "grad_norm": 0.0024207117967307568,
1591
+ "learning_rate": 1.1090573012939002e-05,
1592
+ "loss": 0.001,
1593
  "step": 5400
1594
  },
1595
  {
1596
  "epoch": 9.94,
1597
+ "grad_norm": 0.003070174716413021,
1598
+ "learning_rate": 6.469500924214418e-06,
1599
+ "loss": 0.0008,
1600
  "step": 5425
1601
  },
1602
  {
1603
  "epoch": 9.98,
1604
+ "grad_norm": 0.012533812783658504,
1605
+ "learning_rate": 1.8484288354898337e-06,
1606
+ "loss": 0.001,
1607
  "step": 5450
1608
  },
1609
  {
1610
  "epoch": 10.0,
1611
+ "eval_loss": 0.23037216067314148,
1612
+ "eval_runtime": 463.1996,
1613
+ "eval_samples_per_second": 1.755,
1614
+ "eval_steps_per_second": 0.294,
1615
  "step": 5460
1616
  },
1617
  {
1618
  "epoch": 10.0,
1619
  "step": 5460,
1620
  "total_flos": 1.135723105419264e+20,
1621
+ "train_loss": 0.029495533068592733,
1622
+ "train_runtime": 29108.855,
1623
+ "train_samples_per_second": 1.124,
1624
  "train_steps_per_second": 0.188
1625
  }
1626
  ],