barbaroo commited on
Commit
7a2154d
1 Parent(s): 0bb3135

Upload 13 files

Browse files
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:75062e60de22a5d873b6ce8e79ae9db4e533f4767b4f66218660a005800858a8
3
  size 4999863872
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36f4082807a4a91ad2133d752fd587814a29439888c9a3db8d97583619044e9f
3
  size 4999863872
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ef9b77c4c19d82f30dafb4f2cb9dd20f7459d18ca029b5770272dcca5a9718a
3
  size 482809696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fb02e929fe4112ec14b7703127a33f20068fba2973643efa9ba4ddf3fce3aea
3
  size 482809696
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3cd1ba6163965f93c56f9d9d3074e781525d5b3b78f81275ac09db230b4c4827
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:529fe7f11f242c337c0f1a103ac077595ea1499207d09d52224640ca121e2a94
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd5acb8703ce3553c1eb28840ca61a21bf710c4b95062c0ce822e72326bdacac
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e165e1f46e8a630f28d55072c8f17768fb98c886bdc1313ee5c85479b03c11ab
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.6402289867401123,
3
- "best_model_checkpoint": "nllb_200_distilled_1.3B_ENtoFO_bsz_64_epochs_10lr0.0001/checkpoint-7500",
4
- "epoch": 2.635885382917266,
5
  "eval_steps": 500,
6
- "global_step": 7500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -45,13 +45,13 @@
45
  },
46
  {
47
  "epoch": 0.17572569219448442,
48
- "eval_bleu": 39.6289,
49
- "eval_chrf++": 58.7253,
50
- "eval_gen_len": 17.6964,
51
- "eval_loss": 0.8153083920478821,
52
- "eval_runtime": 3428.0218,
53
- "eval_samples_per_second": 2.136,
54
- "eval_steps_per_second": 1.068,
55
  "step": 500
56
  },
57
  {
@@ -91,13 +91,13 @@
91
  },
92
  {
93
  "epoch": 0.35145138438896883,
94
- "eval_bleu": 41.3868,
95
- "eval_chrf++": 60.2285,
96
- "eval_gen_len": 17.8309,
97
- "eval_loss": 0.7573392987251282,
98
- "eval_runtime": 3438.4799,
99
- "eval_samples_per_second": 2.129,
100
- "eval_steps_per_second": 1.065,
101
  "step": 1000
102
  },
103
  {
@@ -137,13 +137,13 @@
137
  },
138
  {
139
  "epoch": 0.5271770765834533,
140
- "eval_bleu": 41.8145,
141
- "eval_chrf++": 60.6355,
142
- "eval_gen_len": 17.7981,
143
- "eval_loss": 0.7231032252311707,
144
- "eval_runtime": 3422.1307,
145
- "eval_samples_per_second": 2.139,
146
- "eval_steps_per_second": 1.07,
147
  "step": 1500
148
  },
149
  {
@@ -183,13 +183,13 @@
183
  },
184
  {
185
  "epoch": 0.7029027687779377,
186
- "eval_bleu": 42.7477,
187
- "eval_chrf++": 61.3889,
188
- "eval_gen_len": 17.7144,
189
- "eval_loss": 0.6981974244117737,
190
- "eval_runtime": 3442.4579,
191
- "eval_samples_per_second": 2.127,
192
- "eval_steps_per_second": 1.063,
193
  "step": 2000
194
  },
195
  {
@@ -229,13 +229,13 @@
229
  },
230
  {
231
  "epoch": 0.8786284609724221,
232
- "eval_bleu": 43.5398,
233
- "eval_chrf++": 61.9091,
234
- "eval_gen_len": 17.8354,
235
- "eval_loss": 0.6773364543914795,
236
- "eval_runtime": 3438.2409,
237
- "eval_samples_per_second": 2.129,
238
- "eval_steps_per_second": 1.065,
239
  "step": 2500
240
  },
241
  {
@@ -275,13 +275,13 @@
275
  },
276
  {
277
  "epoch": 1.0543541531669065,
278
- "eval_bleu": 43.8509,
279
- "eval_chrf++": 62.1772,
280
- "eval_gen_len": 17.8905,
281
- "eval_loss": 0.6679165959358215,
282
- "eval_runtime": 3455.7087,
283
- "eval_samples_per_second": 2.119,
284
- "eval_steps_per_second": 1.059,
285
  "step": 3000
286
  },
287
  {
@@ -321,13 +321,13 @@
321
  },
322
  {
323
  "epoch": 1.2300798453613908,
324
- "eval_bleu": 44.0794,
325
- "eval_chrf++": 62.3925,
326
- "eval_gen_len": 17.7716,
327
- "eval_loss": 0.6606557369232178,
328
- "eval_runtime": 3446.6004,
329
- "eval_samples_per_second": 2.124,
330
- "eval_steps_per_second": 1.062,
331
  "step": 3500
332
  },
333
  {
@@ -367,13 +367,13 @@
367
  },
368
  {
369
  "epoch": 1.4058055375558753,
370
- "eval_bleu": 44.0555,
371
- "eval_chrf++": 62.3227,
372
- "eval_gen_len": 17.7384,
373
- "eval_loss": 0.6558951139450073,
374
- "eval_runtime": 3448.9332,
375
- "eval_samples_per_second": 2.123,
376
- "eval_steps_per_second": 1.061,
377
  "step": 4000
378
  },
379
  {
@@ -413,13 +413,13 @@
413
  },
414
  {
415
  "epoch": 1.5815312297503596,
416
- "eval_bleu": 43.6701,
417
- "eval_chrf++": 62.1631,
418
- "eval_gen_len": 17.6893,
419
- "eval_loss": 0.6474220156669617,
420
- "eval_runtime": 3434.705,
421
- "eval_samples_per_second": 2.131,
422
- "eval_steps_per_second": 1.066,
423
  "step": 4500
424
  },
425
  {
@@ -459,13 +459,13 @@
459
  },
460
  {
461
  "epoch": 1.757256921944844,
462
- "eval_bleu": 44.2084,
463
- "eval_chrf++": 62.587,
464
- "eval_gen_len": 17.7677,
465
- "eval_loss": 0.6390019059181213,
466
- "eval_runtime": 3462.9771,
467
- "eval_samples_per_second": 2.114,
468
- "eval_steps_per_second": 1.057,
469
  "step": 5000
470
  },
471
  {
@@ -505,205 +505,21 @@
505
  },
506
  {
507
  "epoch": 1.9329826141393285,
508
- "eval_bleu": 44.5351,
509
- "eval_chrf++": 62.758,
510
- "eval_gen_len": 17.7998,
511
- "eval_loss": 0.6315435767173767,
512
- "eval_runtime": 3432.7408,
513
- "eval_samples_per_second": 2.133,
514
- "eval_steps_per_second": 1.066,
515
  "step": 5500
516
- },
517
- {
518
- "epoch": 1.9681277525782255,
519
- "grad_norm": 0.5488588213920593,
520
- "learning_rate": 8.17531305903399e-05,
521
- "loss": 0.5845,
522
- "step": 5600
523
- },
524
- {
525
- "epoch": 2.0032728910171222,
526
- "grad_norm": 0.49017634987831116,
527
- "learning_rate": 8.139534883720931e-05,
528
- "loss": 0.59,
529
- "step": 5700
530
- },
531
- {
532
- "epoch": 2.038418029456019,
533
- "grad_norm": 0.5274912118911743,
534
- "learning_rate": 8.103756708407871e-05,
535
- "loss": 0.4667,
536
- "step": 5800
537
- },
538
- {
539
- "epoch": 2.0735631678949162,
540
- "grad_norm": 1.4353556632995605,
541
- "learning_rate": 8.067978533094812e-05,
542
- "loss": 0.4706,
543
- "step": 5900
544
- },
545
- {
546
- "epoch": 2.108708306333813,
547
- "grad_norm": 0.5296390056610107,
548
- "learning_rate": 8.032200357781753e-05,
549
- "loss": 0.4697,
550
- "step": 6000
551
- },
552
- {
553
- "epoch": 2.108708306333813,
554
- "eval_bleu": 44.1342,
555
- "eval_chrf++": 62.5401,
556
- "eval_gen_len": 17.7677,
557
- "eval_loss": 0.6582108736038208,
558
- "eval_runtime": 3425.6987,
559
- "eval_samples_per_second": 2.137,
560
- "eval_steps_per_second": 1.069,
561
- "step": 6000
562
- },
563
- {
564
- "epoch": 2.14385344477271,
565
- "grad_norm": 0.6416345238685608,
566
- "learning_rate": 7.996422182468695e-05,
567
- "loss": 0.4882,
568
- "step": 6100
569
- },
570
- {
571
- "epoch": 2.1789985832116066,
572
- "grad_norm": 0.5234227180480957,
573
- "learning_rate": 7.960644007155635e-05,
574
- "loss": 0.4835,
575
- "step": 6200
576
- },
577
- {
578
- "epoch": 2.2141437216505038,
579
- "grad_norm": 0.4757489860057831,
580
- "learning_rate": 7.924865831842576e-05,
581
- "loss": 0.4771,
582
- "step": 6300
583
- },
584
- {
585
- "epoch": 2.2492888600894005,
586
- "grad_norm": 0.5438205599784851,
587
- "learning_rate": 7.889087656529517e-05,
588
- "loss": 0.4829,
589
- "step": 6400
590
- },
591
- {
592
- "epoch": 2.2844339985282973,
593
- "grad_norm": 0.5392005443572998,
594
- "learning_rate": 7.853309481216459e-05,
595
- "loss": 0.474,
596
- "step": 6500
597
- },
598
- {
599
- "epoch": 2.2844339985282973,
600
- "eval_bleu": 44.2923,
601
- "eval_chrf++": 62.5586,
602
- "eval_gen_len": 17.7492,
603
- "eval_loss": 0.6481789350509644,
604
- "eval_runtime": 3422.6338,
605
- "eval_samples_per_second": 2.139,
606
- "eval_steps_per_second": 1.07,
607
- "step": 6500
608
- },
609
- {
610
- "epoch": 2.319579136967194,
611
- "grad_norm": 0.5046759843826294,
612
- "learning_rate": 7.8175313059034e-05,
613
- "loss": 0.4802,
614
- "step": 6600
615
- },
616
- {
617
- "epoch": 2.354724275406091,
618
- "grad_norm": 0.49111250042915344,
619
- "learning_rate": 7.78175313059034e-05,
620
- "loss": 0.4916,
621
- "step": 6700
622
- },
623
- {
624
- "epoch": 2.389869413844988,
625
- "grad_norm": 0.5712496638298035,
626
- "learning_rate": 7.745974955277281e-05,
627
- "loss": 0.4845,
628
- "step": 6800
629
- },
630
- {
631
- "epoch": 2.425014552283885,
632
- "grad_norm": 0.6314510703086853,
633
- "learning_rate": 7.710196779964223e-05,
634
- "loss": 0.484,
635
- "step": 6900
636
- },
637
- {
638
- "epoch": 2.4601596907227816,
639
- "grad_norm": 0.6166778802871704,
640
- "learning_rate": 7.674418604651163e-05,
641
- "loss": 0.488,
642
- "step": 7000
643
- },
644
- {
645
- "epoch": 2.4601596907227816,
646
- "eval_bleu": 44.7709,
647
- "eval_chrf++": 62.9298,
648
- "eval_gen_len": 17.7547,
649
- "eval_loss": 0.6452430486679077,
650
- "eval_runtime": 3438.1607,
651
- "eval_samples_per_second": 2.129,
652
- "eval_steps_per_second": 1.065,
653
- "step": 7000
654
- },
655
- {
656
- "epoch": 2.495304829161679,
657
- "grad_norm": 0.5143587589263916,
658
- "learning_rate": 7.638640429338104e-05,
659
- "loss": 0.4875,
660
- "step": 7100
661
- },
662
- {
663
- "epoch": 2.5304499676005756,
664
- "grad_norm": 0.5172815322875977,
665
- "learning_rate": 7.602862254025045e-05,
666
- "loss": 0.4805,
667
- "step": 7200
668
- },
669
- {
670
- "epoch": 2.5655951060394724,
671
- "grad_norm": 0.49376818537712097,
672
- "learning_rate": 7.567084078711986e-05,
673
- "loss": 0.488,
674
- "step": 7300
675
- },
676
- {
677
- "epoch": 2.600740244478369,
678
- "grad_norm": 0.5714296102523804,
679
- "learning_rate": 7.531305903398927e-05,
680
- "loss": 0.4893,
681
- "step": 7400
682
- },
683
- {
684
- "epoch": 2.635885382917266,
685
- "grad_norm": 0.47455132007598877,
686
- "learning_rate": 7.495527728085868e-05,
687
- "loss": 0.4767,
688
- "step": 7500
689
- },
690
- {
691
- "epoch": 2.635885382917266,
692
- "eval_bleu": 44.8961,
693
- "eval_chrf++": 63.0641,
694
- "eval_gen_len": 17.7427,
695
- "eval_loss": 0.6402289867401123,
696
- "eval_runtime": 3447.7653,
697
- "eval_samples_per_second": 2.123,
698
- "eval_steps_per_second": 1.062,
699
- "step": 7500
700
  }
701
  ],
702
  "logging_steps": 100,
703
  "max_steps": 28450,
704
  "num_input_tokens_seen": 0,
705
  "num_train_epochs": 10,
706
- "save_steps": 1500,
707
  "stateful_callbacks": {
708
  "TrainerControl": {
709
  "args": {
@@ -716,7 +532,7 @@
716
  "attributes": {}
717
  }
718
  },
719
- "total_flos": 3.268447098428719e+18,
720
  "train_batch_size": 2,
721
  "trial_name": null,
722
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.6349581480026245,
3
+ "best_model_checkpoint": "nllb_200_distilled_1.3B_ENtoFO_bsz_64_epochs_10lr0.0001/checkpoint-5500",
4
+ "epoch": 1.9329826141393285,
5
  "eval_steps": 500,
6
+ "global_step": 5500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
45
  },
46
  {
47
  "epoch": 0.17572569219448442,
48
+ "eval_bleu": 39.5852,
49
+ "eval_chrf++": 58.6112,
50
+ "eval_gen_len": 17.7742,
51
+ "eval_loss": 0.8167479634284973,
52
+ "eval_runtime": 3590.7671,
53
+ "eval_samples_per_second": 2.039,
54
+ "eval_steps_per_second": 1.02,
55
  "step": 500
56
  },
57
  {
 
91
  },
92
  {
93
  "epoch": 0.35145138438896883,
94
+ "eval_bleu": 41.4406,
95
+ "eval_chrf++": 60.2627,
96
+ "eval_gen_len": 17.9384,
97
+ "eval_loss": 0.7587867975234985,
98
+ "eval_runtime": 3432.3677,
99
+ "eval_samples_per_second": 2.133,
100
+ "eval_steps_per_second": 1.067,
101
  "step": 1000
102
  },
103
  {
 
137
  },
138
  {
139
  "epoch": 0.5271770765834533,
140
+ "eval_bleu": 41.9609,
141
+ "eval_chrf++": 60.633,
142
+ "eval_gen_len": 17.8868,
143
+ "eval_loss": 0.7227240800857544,
144
+ "eval_runtime": 3519.7091,
145
+ "eval_samples_per_second": 2.08,
146
+ "eval_steps_per_second": 1.04,
147
  "step": 1500
148
  },
149
  {
 
183
  },
184
  {
185
  "epoch": 0.7029027687779377,
186
+ "eval_bleu": 42.6374,
187
+ "eval_chrf++": 61.2613,
188
+ "eval_gen_len": 17.7972,
189
+ "eval_loss": 0.7011950612068176,
190
+ "eval_runtime": 3436.4901,
191
+ "eval_samples_per_second": 2.13,
192
+ "eval_steps_per_second": 1.065,
193
  "step": 2000
194
  },
195
  {
 
229
  },
230
  {
231
  "epoch": 0.8786284609724221,
232
+ "eval_bleu": 43.3601,
233
+ "eval_chrf++": 61.7425,
234
+ "eval_gen_len": 17.935,
235
+ "eval_loss": 0.6797980666160583,
236
+ "eval_runtime": 3463.8249,
237
+ "eval_samples_per_second": 2.114,
238
+ "eval_steps_per_second": 1.057,
239
  "step": 2500
240
  },
241
  {
 
275
  },
276
  {
277
  "epoch": 1.0543541531669065,
278
+ "eval_bleu": 43.6802,
279
+ "eval_chrf++": 62.0773,
280
+ "eval_gen_len": 17.9802,
281
+ "eval_loss": 0.6721383929252625,
282
+ "eval_runtime": 3607.5103,
283
+ "eval_samples_per_second": 2.029,
284
+ "eval_steps_per_second": 1.015,
285
  "step": 3000
286
  },
287
  {
 
321
  },
322
  {
323
  "epoch": 1.2300798453613908,
324
+ "eval_bleu": 43.9272,
325
+ "eval_chrf++": 62.2949,
326
+ "eval_gen_len": 17.8805,
327
+ "eval_loss": 0.6639961004257202,
328
+ "eval_runtime": 3511.7691,
329
+ "eval_samples_per_second": 2.085,
330
+ "eval_steps_per_second": 1.042,
331
  "step": 3500
332
  },
333
  {
 
367
  },
368
  {
369
  "epoch": 1.4058055375558753,
370
+ "eval_bleu": 43.7946,
371
+ "eval_chrf++": 62.1568,
372
+ "eval_gen_len": 17.8172,
373
+ "eval_loss": 0.6595008373260498,
374
+ "eval_runtime": 3539.6035,
375
+ "eval_samples_per_second": 2.068,
376
+ "eval_steps_per_second": 1.034,
377
  "step": 4000
378
  },
379
  {
 
413
  },
414
  {
415
  "epoch": 1.5815312297503596,
416
+ "eval_bleu": 43.8068,
417
+ "eval_chrf++": 62.1665,
418
+ "eval_gen_len": 17.8271,
419
+ "eval_loss": 0.64792400598526,
420
+ "eval_runtime": 3653.4406,
421
+ "eval_samples_per_second": 2.004,
422
+ "eval_steps_per_second": 1.002,
423
  "step": 4500
424
  },
425
  {
 
459
  },
460
  {
461
  "epoch": 1.757256921944844,
462
+ "eval_bleu": 44.0163,
463
+ "eval_chrf++": 62.4374,
464
+ "eval_gen_len": 17.8788,
465
+ "eval_loss": 0.6403423547744751,
466
+ "eval_runtime": 3440.4393,
467
+ "eval_samples_per_second": 2.128,
468
+ "eval_steps_per_second": 1.064,
469
  "step": 5000
470
  },
471
  {
 
505
  },
506
  {
507
  "epoch": 1.9329826141393285,
508
+ "eval_bleu": 44.5286,
509
+ "eval_chrf++": 62.728,
510
+ "eval_gen_len": 17.8899,
511
+ "eval_loss": 0.6349581480026245,
512
+ "eval_runtime": 3524.4692,
513
+ "eval_samples_per_second": 2.077,
514
+ "eval_steps_per_second": 1.039,
515
  "step": 5500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
516
  }
517
  ],
518
  "logging_steps": 100,
519
  "max_steps": 28450,
520
  "num_input_tokens_seen": 0,
521
  "num_train_epochs": 10,
522
+ "save_steps": 500,
523
  "stateful_callbacks": {
524
  "TrainerControl": {
525
  "args": {
 
532
  "attributes": {}
533
  }
534
  },
535
+ "total_flos": 2.3968643831845356e+18,
536
  "train_batch_size": 2,
537
  "trial_name": null,
538
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb6596237356fc3a8b7aecd2f98c6065ef55e3bf7a37e9b82308f262d8e9d03a
3
  size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8cf5c96d2df8bf58fd891f9907effe03685c6ca5d8073324d1011c8629e1259
3
  size 5304