MohamedAhmedAE commited on
Commit
e25e764
1 Parent(s): 1d624d1

Training in progress, step 14000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2172da14deb5ca9f0e6f14d8f6ea79f495f134705213512eb214acc738ae1a2b
3
  size 1715561468
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba212990d194dca4f44ced4ca63ccf9ef184764cc7f8172c878fbac719c06d49
3
  size 1715561468
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:98ea6b956ce4901179558c59f0fb06e1bc9747de38c7f9ca931ce9b196acb81d
3
  size 3431474364
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aaf08772128513652918e0bba4e572e2c66384e3a11c4c8c40940608c79e2518
3
  size 3431474364
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6d8bcb19c39d66027f3d7d9fbbeb3905ddad266891b5c1549b07d8b38abb6d1
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:811402e95375709703215e6c97e1c03c4e9ff165e81e964713fe6305f44ed804
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fdbc17eea2364605baa35c3a731dcb44b37215b8ffcd0664293786485aa92f95
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f2d0d921e9513c8f152071073b4ae78b683c89f70f41659678cef15c9b0b508
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.1257974660796118,
5
  "eval_steps": 1000,
6
- "global_step": 2800,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -121,6 +121,494 @@
121
  "learning_rate": 4.999922185102915e-05,
122
  "loss": 1.0514,
123
  "step": 2800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  }
125
  ],
126
  "logging_steps": 200,
@@ -140,7 +628,7 @@
140
  "attributes": {}
141
  }
142
  },
143
- "total_flos": 4.01409263517696e+16,
144
  "train_batch_size": 6,
145
  "trial_name": null,
146
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.6289873303980591,
5
  "eval_steps": 1000,
6
+ "global_step": 14000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
121
  "learning_rate": 4.999922185102915e-05,
122
  "loss": 1.0514,
123
  "step": 2800
124
+ },
125
+ {
126
+ "epoch": 0.13478299937101268,
127
+ "grad_norm": 2.352109432220459,
128
+ "learning_rate": 4.9999106504350065e-05,
129
+ "loss": 1.0327,
130
+ "step": 3000
131
+ },
132
+ {
133
+ "epoch": 0.13478299937101268,
134
+ "eval_loss": 3.369852066040039,
135
+ "eval_runtime": 1064.8233,
136
+ "eval_samples_per_second": 9.301,
137
+ "eval_steps_per_second": 0.146,
138
+ "step": 3000
139
+ },
140
+ {
141
+ "epoch": 0.14376853266241352,
142
+ "grad_norm": 0.7272612452507019,
143
+ "learning_rate": 4.999898318918469e-05,
144
+ "loss": 1.0379,
145
+ "step": 3200
146
+ },
147
+ {
148
+ "epoch": 0.15275406595381436,
149
+ "grad_norm": 1.021616816520691,
150
+ "learning_rate": 4.999885190557234e-05,
151
+ "loss": 1.0416,
152
+ "step": 3400
153
+ },
154
+ {
155
+ "epoch": 0.1617395992452152,
156
+ "grad_norm": 2.4565377235412598,
157
+ "learning_rate": 4.999871265355485e-05,
158
+ "loss": 1.0212,
159
+ "step": 3600
160
+ },
161
+ {
162
+ "epoch": 0.17072513253661606,
163
+ "grad_norm": 20.56285858154297,
164
+ "learning_rate": 4.9998565433176624e-05,
165
+ "loss": 1.0219,
166
+ "step": 3800
167
+ },
168
+ {
169
+ "epoch": 0.1797106658280169,
170
+ "grad_norm": 0.7909038662910461,
171
+ "learning_rate": 4.9998410244484574e-05,
172
+ "loss": 1.0075,
173
+ "step": 4000
174
+ },
175
+ {
176
+ "epoch": 0.1797106658280169,
177
+ "eval_loss": 3.339078903198242,
178
+ "eval_runtime": 1066.4833,
179
+ "eval_samples_per_second": 9.287,
180
+ "eval_steps_per_second": 0.145,
181
+ "step": 4000
182
+ },
183
+ {
184
+ "epoch": 0.18869619911941773,
185
+ "grad_norm": 2.09454607963562,
186
+ "learning_rate": 4.999824708752817e-05,
187
+ "loss": 0.9825,
188
+ "step": 4200
189
+ },
190
+ {
191
+ "epoch": 0.19768173241081857,
192
+ "grad_norm": 2.223658323287964,
193
+ "learning_rate": 4.999807596235943e-05,
194
+ "loss": 0.9851,
195
+ "step": 4400
196
+ },
197
+ {
198
+ "epoch": 0.20666726570221944,
199
+ "grad_norm": 1.121969223022461,
200
+ "learning_rate": 4.999789686903289e-05,
201
+ "loss": 1.0041,
202
+ "step": 4600
203
+ },
204
+ {
205
+ "epoch": 0.21565279899362028,
206
+ "grad_norm": 4.0251312255859375,
207
+ "learning_rate": 4.9997709807605626e-05,
208
+ "loss": 0.9841,
209
+ "step": 4800
210
+ },
211
+ {
212
+ "epoch": 0.22463833228502111,
213
+ "grad_norm": 1.6437472105026245,
214
+ "learning_rate": 4.9997514778137275e-05,
215
+ "loss": 0.9483,
216
+ "step": 5000
217
+ },
218
+ {
219
+ "epoch": 0.22463833228502111,
220
+ "eval_loss": 3.2980644702911377,
221
+ "eval_runtime": 1067.9785,
222
+ "eval_samples_per_second": 9.274,
223
+ "eval_steps_per_second": 0.145,
224
+ "step": 5000
225
+ },
226
+ {
227
+ "epoch": 0.23362386557642195,
228
+ "grad_norm": 0.8991021513938904,
229
+ "learning_rate": 4.999731178069001e-05,
230
+ "loss": 0.9541,
231
+ "step": 5200
232
+ },
233
+ {
234
+ "epoch": 0.24260939886782282,
235
+ "grad_norm": 3.1451597213745117,
236
+ "learning_rate": 4.999710081532853e-05,
237
+ "loss": 0.9589,
238
+ "step": 5400
239
+ },
240
+ {
241
+ "epoch": 0.2515949321592236,
242
+ "grad_norm": 2.142390489578247,
243
+ "learning_rate": 4.999688188212007e-05,
244
+ "loss": 0.9677,
245
+ "step": 5600
246
+ },
247
+ {
248
+ "epoch": 0.2605804654506245,
249
+ "grad_norm": 2.2872331142425537,
250
+ "learning_rate": 4.999665498113444e-05,
251
+ "loss": 0.962,
252
+ "step": 5800
253
+ },
254
+ {
255
+ "epoch": 0.26956599874202536,
256
+ "grad_norm": 2.730259418487549,
257
+ "learning_rate": 4.999642011244394e-05,
258
+ "loss": 0.9581,
259
+ "step": 6000
260
+ },
261
+ {
262
+ "epoch": 0.26956599874202536,
263
+ "eval_loss": 3.3341598510742188,
264
+ "eval_runtime": 1066.5406,
265
+ "eval_samples_per_second": 9.286,
266
+ "eval_steps_per_second": 0.145,
267
+ "step": 6000
268
+ },
269
+ {
270
+ "epoch": 0.2785515320334262,
271
+ "grad_norm": 2.8416945934295654,
272
+ "learning_rate": 4.999617727612344e-05,
273
+ "loss": 0.9675,
274
+ "step": 6200
275
+ },
276
+ {
277
+ "epoch": 0.28753706532482703,
278
+ "grad_norm": 2.8148677349090576,
279
+ "learning_rate": 4.9995926472250356e-05,
280
+ "loss": 0.9411,
281
+ "step": 6400
282
+ },
283
+ {
284
+ "epoch": 0.2965225986162279,
285
+ "grad_norm": 1.3317234516143799,
286
+ "learning_rate": 4.999566770090462e-05,
287
+ "loss": 0.9279,
288
+ "step": 6600
289
+ },
290
+ {
291
+ "epoch": 0.3055081319076287,
292
+ "grad_norm": 3.403902053833008,
293
+ "learning_rate": 4.999540096216872e-05,
294
+ "loss": 0.9293,
295
+ "step": 6800
296
+ },
297
+ {
298
+ "epoch": 0.31449366519902955,
299
+ "grad_norm": 1.70892333984375,
300
+ "learning_rate": 4.9995126256127675e-05,
301
+ "loss": 0.9475,
302
+ "step": 7000
303
+ },
304
+ {
305
+ "epoch": 0.31449366519902955,
306
+ "eval_loss": 3.238970994949341,
307
+ "eval_runtime": 1068.527,
308
+ "eval_samples_per_second": 9.269,
309
+ "eval_steps_per_second": 0.145,
310
+ "step": 7000
311
+ },
312
+ {
313
+ "epoch": 0.3234791984904304,
314
+ "grad_norm": 3.11971378326416,
315
+ "learning_rate": 4.999484358286907e-05,
316
+ "loss": 0.9465,
317
+ "step": 7200
318
+ },
319
+ {
320
+ "epoch": 0.3324647317818312,
321
+ "grad_norm": 1.395370364189148,
322
+ "learning_rate": 4.9994552942482975e-05,
323
+ "loss": 0.9445,
324
+ "step": 7400
325
+ },
326
+ {
327
+ "epoch": 0.3414502650732321,
328
+ "grad_norm": 6.5639424324035645,
329
+ "learning_rate": 4.999425433506204e-05,
330
+ "loss": 0.9263,
331
+ "step": 7600
332
+ },
333
+ {
334
+ "epoch": 0.35043579836463296,
335
+ "grad_norm": 2.2011075019836426,
336
+ "learning_rate": 4.999394776070146e-05,
337
+ "loss": 0.9193,
338
+ "step": 7800
339
+ },
340
+ {
341
+ "epoch": 0.3594213316560338,
342
+ "grad_norm": 2.9525458812713623,
343
+ "learning_rate": 4.999363321949895e-05,
344
+ "loss": 0.9405,
345
+ "step": 8000
346
+ },
347
+ {
348
+ "epoch": 0.3594213316560338,
349
+ "eval_loss": 3.2370519638061523,
350
+ "eval_runtime": 1068.6545,
351
+ "eval_samples_per_second": 9.268,
352
+ "eval_steps_per_second": 0.145,
353
+ "step": 8000
354
+ },
355
+ {
356
+ "epoch": 0.36840686494743463,
357
+ "grad_norm": 4.726866245269775,
358
+ "learning_rate": 4.999331071155477e-05,
359
+ "loss": 0.9391,
360
+ "step": 8200
361
+ },
362
+ {
363
+ "epoch": 0.37739239823883547,
364
+ "grad_norm": 2.23179292678833,
365
+ "learning_rate": 4.9992980236971723e-05,
366
+ "loss": 0.9352,
367
+ "step": 8400
368
+ },
369
+ {
370
+ "epoch": 0.3863779315302363,
371
+ "grad_norm": 2.175626516342163,
372
+ "learning_rate": 4.9992641795855134e-05,
373
+ "loss": 0.9359,
374
+ "step": 8600
375
+ },
376
+ {
377
+ "epoch": 0.39536346482163714,
378
+ "grad_norm": 5.489994525909424,
379
+ "learning_rate": 4.9992295388312895e-05,
380
+ "loss": 0.918,
381
+ "step": 8800
382
+ },
383
+ {
384
+ "epoch": 0.404348998113038,
385
+ "grad_norm": 1.484823226928711,
386
+ "learning_rate": 4.9991941014455414e-05,
387
+ "loss": 0.9075,
388
+ "step": 9000
389
+ },
390
+ {
391
+ "epoch": 0.404348998113038,
392
+ "eval_loss": 3.1722910404205322,
393
+ "eval_runtime": 1070.0307,
394
+ "eval_samples_per_second": 9.256,
395
+ "eval_steps_per_second": 0.145,
396
+ "step": 9000
397
+ },
398
+ {
399
+ "epoch": 0.4133345314044389,
400
+ "grad_norm": 1.1743195056915283,
401
+ "learning_rate": 4.9991578674395656e-05,
402
+ "loss": 0.9116,
403
+ "step": 9200
404
+ },
405
+ {
406
+ "epoch": 0.4223200646958397,
407
+ "grad_norm": 4.027889728546143,
408
+ "learning_rate": 4.999120836824912e-05,
409
+ "loss": 0.9023,
410
+ "step": 9400
411
+ },
412
+ {
413
+ "epoch": 0.43130559798724055,
414
+ "grad_norm": 3.1647088527679443,
415
+ "learning_rate": 4.9990830096133826e-05,
416
+ "loss": 0.8992,
417
+ "step": 9600
418
+ },
419
+ {
420
+ "epoch": 0.4402911312786414,
421
+ "grad_norm": 1.6494026184082031,
422
+ "learning_rate": 4.9990443858170366e-05,
423
+ "loss": 0.8881,
424
+ "step": 9800
425
+ },
426
+ {
427
+ "epoch": 0.44927666457004223,
428
+ "grad_norm": 2.5967679023742676,
429
+ "learning_rate": 4.999004965448184e-05,
430
+ "loss": 0.8889,
431
+ "step": 10000
432
+ },
433
+ {
434
+ "epoch": 0.44927666457004223,
435
+ "eval_loss": 3.1767914295196533,
436
+ "eval_runtime": 1067.4091,
437
+ "eval_samples_per_second": 9.279,
438
+ "eval_steps_per_second": 0.145,
439
+ "step": 10000
440
+ },
441
+ {
442
+ "epoch": 0.45826219786144307,
443
+ "grad_norm": 2.703774929046631,
444
+ "learning_rate": 4.998964748519391e-05,
445
+ "loss": 0.8845,
446
+ "step": 10200
447
+ },
448
+ {
449
+ "epoch": 0.4672477311528439,
450
+ "grad_norm": 5.934618949890137,
451
+ "learning_rate": 4.998923735043477e-05,
452
+ "loss": 0.899,
453
+ "step": 10400
454
+ },
455
+ {
456
+ "epoch": 0.47623326444424474,
457
+ "grad_norm": 7.952963352203369,
458
+ "learning_rate": 4.9988819250335136e-05,
459
+ "loss": 0.8968,
460
+ "step": 10600
461
+ },
462
+ {
463
+ "epoch": 0.48521879773564563,
464
+ "grad_norm": 3.2846908569335938,
465
+ "learning_rate": 4.99883931850283e-05,
466
+ "loss": 0.8687,
467
+ "step": 10800
468
+ },
469
+ {
470
+ "epoch": 0.4942043310270465,
471
+ "grad_norm": 1.9633086919784546,
472
+ "learning_rate": 4.998795915465005e-05,
473
+ "loss": 0.8537,
474
+ "step": 11000
475
+ },
476
+ {
477
+ "epoch": 0.4942043310270465,
478
+ "eval_loss": 3.1828198432922363,
479
+ "eval_runtime": 1068.8128,
480
+ "eval_samples_per_second": 9.266,
481
+ "eval_steps_per_second": 0.145,
482
+ "step": 11000
483
+ },
484
+ {
485
+ "epoch": 0.5031898643184473,
486
+ "grad_norm": 6.807458400726318,
487
+ "learning_rate": 4.9987517159338744e-05,
488
+ "loss": 0.8482,
489
+ "step": 11200
490
+ },
491
+ {
492
+ "epoch": 0.5121753976098481,
493
+ "grad_norm": 2.9921388626098633,
494
+ "learning_rate": 4.998706719923526e-05,
495
+ "loss": 0.8662,
496
+ "step": 11400
497
+ },
498
+ {
499
+ "epoch": 0.521160930901249,
500
+ "grad_norm": 0.7828212380409241,
501
+ "learning_rate": 4.998660927448304e-05,
502
+ "loss": 0.88,
503
+ "step": 11600
504
+ },
505
+ {
506
+ "epoch": 0.5301464641926499,
507
+ "grad_norm": 3.1086294651031494,
508
+ "learning_rate": 4.9986143385228026e-05,
509
+ "loss": 0.8536,
510
+ "step": 11800
511
+ },
512
+ {
513
+ "epoch": 0.5391319974840507,
514
+ "grad_norm": 3.759007453918457,
515
+ "learning_rate": 4.998566953161874e-05,
516
+ "loss": 0.8321,
517
+ "step": 12000
518
+ },
519
+ {
520
+ "epoch": 0.5391319974840507,
521
+ "eval_loss": 3.1765565872192383,
522
+ "eval_runtime": 1069.9445,
523
+ "eval_samples_per_second": 9.257,
524
+ "eval_steps_per_second": 0.145,
525
+ "step": 12000
526
+ },
527
+ {
528
+ "epoch": 0.5481175307754516,
529
+ "grad_norm": 4.347619533538818,
530
+ "learning_rate": 4.9985187713806206e-05,
531
+ "loss": 0.8713,
532
+ "step": 12200
533
+ },
534
+ {
535
+ "epoch": 0.5571030640668524,
536
+ "grad_norm": 2.748655080795288,
537
+ "learning_rate": 4.9984697931944024e-05,
538
+ "loss": 0.8457,
539
+ "step": 12400
540
+ },
541
+ {
542
+ "epoch": 0.5660885973582532,
543
+ "grad_norm": 2.891540288925171,
544
+ "learning_rate": 4.998420018618829e-05,
545
+ "loss": 0.8212,
546
+ "step": 12600
547
+ },
548
+ {
549
+ "epoch": 0.5750741306496541,
550
+ "grad_norm": 4.089766025543213,
551
+ "learning_rate": 4.998369447669768e-05,
552
+ "loss": 0.8288,
553
+ "step": 12800
554
+ },
555
+ {
556
+ "epoch": 0.5840596639410549,
557
+ "grad_norm": 4.722995758056641,
558
+ "learning_rate": 4.9983180803633376e-05,
559
+ "loss": 0.8757,
560
+ "step": 13000
561
+ },
562
+ {
563
+ "epoch": 0.5840596639410549,
564
+ "eval_loss": 3.168459892272949,
565
+ "eval_runtime": 1070.7464,
566
+ "eval_samples_per_second": 9.25,
567
+ "eval_steps_per_second": 0.145,
568
+ "step": 13000
569
+ },
570
+ {
571
+ "epoch": 0.5930451972324557,
572
+ "grad_norm": 7.390491008758545,
573
+ "learning_rate": 4.998265916715912e-05,
574
+ "loss": 0.8477,
575
+ "step": 13200
576
+ },
577
+ {
578
+ "epoch": 0.6020307305238566,
579
+ "grad_norm": 2.4633262157440186,
580
+ "learning_rate": 4.9982129567441185e-05,
581
+ "loss": 0.8415,
582
+ "step": 13400
583
+ },
584
+ {
585
+ "epoch": 0.6110162638152574,
586
+ "grad_norm": 5.4892473220825195,
587
+ "learning_rate": 4.998159200464837e-05,
588
+ "loss": 0.8176,
589
+ "step": 13600
590
+ },
591
+ {
592
+ "epoch": 0.6200017971066583,
593
+ "grad_norm": 4.862381458282471,
594
+ "learning_rate": 4.998104647895203e-05,
595
+ "loss": 0.8336,
596
+ "step": 13800
597
+ },
598
+ {
599
+ "epoch": 0.6289873303980591,
600
+ "grad_norm": 8.079172134399414,
601
+ "learning_rate": 4.998049299052606e-05,
602
+ "loss": 0.8147,
603
+ "step": 14000
604
+ },
605
+ {
606
+ "epoch": 0.6289873303980591,
607
+ "eval_loss": 3.1354148387908936,
608
+ "eval_runtime": 1070.1274,
609
+ "eval_samples_per_second": 9.255,
610
+ "eval_steps_per_second": 0.145,
611
+ "step": 14000
612
  }
613
  ],
614
  "logging_steps": 200,
 
628
  "attributes": {}
629
  }
630
  },
631
+ "total_flos": 2.00704631758848e+17,
632
  "train_batch_size": 6,
633
  "trial_name": null,
634
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1541949d48e8cbfda4e239106fbb1c4580dc3f4206861a71b9bbccdf63200f9c
3
  size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:befd02aaf8966ad3b2e0325fdff20577e54e5843141cd14048ed8a8fb00ce681
3
  size 5240