File size: 31,262 Bytes
98f260a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 50,
  "global_step": 478,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02092050209205021,
      "grad_norm": 6.4084427221095295,
      "learning_rate": 1.0416666666666667e-07,
      "logits/chosen": -2.630444049835205,
      "logits/rejected": -2.576719045639038,
      "logps/chosen": -288.65911865234375,
      "logps/rejected": -275.90252685546875,
      "loss": 0.6931,
      "rewards/accuracies": 0.4312500059604645,
      "rewards/chosen": 0.00032657841802574694,
      "rewards/margins": 0.0008425033884122968,
      "rewards/rejected": -0.0005159247666597366,
      "step": 10
    },
    {
      "epoch": 0.04184100418410042,
      "grad_norm": 5.7047484713043755,
      "learning_rate": 2.0833333333333333e-07,
      "logits/chosen": -2.644824981689453,
      "logits/rejected": -2.6137185096740723,
      "logps/chosen": -293.5597839355469,
      "logps/rejected": -259.2336730957031,
      "loss": 0.6926,
      "rewards/accuracies": 0.606249988079071,
      "rewards/chosen": 0.002616675803437829,
      "rewards/margins": 0.001477306941524148,
      "rewards/rejected": 0.0011393685126677155,
      "step": 20
    },
    {
      "epoch": 0.06276150627615062,
      "grad_norm": 5.335002677652247,
      "learning_rate": 3.1249999999999997e-07,
      "logits/chosen": -2.666045665740967,
      "logits/rejected": -2.5890631675720215,
      "logps/chosen": -294.64007568359375,
      "logps/rejected": -287.18695068359375,
      "loss": 0.6901,
      "rewards/accuracies": 0.59375,
      "rewards/chosen": 0.015856895595788956,
      "rewards/margins": 0.008159220218658447,
      "rewards/rejected": 0.0076976739801466465,
      "step": 30
    },
    {
      "epoch": 0.08368200836820083,
      "grad_norm": 5.361991577232885,
      "learning_rate": 4.1666666666666667e-07,
      "logits/chosen": -2.6365890502929688,
      "logits/rejected": -2.5537407398223877,
      "logps/chosen": -270.41375732421875,
      "logps/rejected": -240.17959594726562,
      "loss": 0.6825,
      "rewards/accuracies": 0.65625,
      "rewards/chosen": 0.03482341766357422,
      "rewards/margins": 0.02616509422659874,
      "rewards/rejected": 0.008658323436975479,
      "step": 40
    },
    {
      "epoch": 0.10460251046025104,
      "grad_norm": 6.225003725839748,
      "learning_rate": 4.999733114418725e-07,
      "logits/chosen": -2.5800583362579346,
      "logits/rejected": -2.571406364440918,
      "logps/chosen": -264.10205078125,
      "logps/rejected": -246.74868774414062,
      "loss": 0.6687,
      "rewards/accuracies": 0.6937500238418579,
      "rewards/chosen": 0.013260206207633018,
      "rewards/margins": 0.07922474294900894,
      "rewards/rejected": -0.06596453487873077,
      "step": 50
    },
    {
      "epoch": 0.10460251046025104,
      "eval_logits/chosen": -2.615793228149414,
      "eval_logits/rejected": -2.5782690048217773,
      "eval_logps/chosen": -260.0484313964844,
      "eval_logps/rejected": -271.3634033203125,
      "eval_loss": 0.649348258972168,
      "eval_rewards/accuracies": 0.70703125,
      "eval_rewards/chosen": 0.025814848020672798,
      "eval_rewards/margins": 0.11282772570848465,
      "eval_rewards/rejected": -0.0870128720998764,
      "eval_runtime": 103.335,
      "eval_samples_per_second": 19.355,
      "eval_steps_per_second": 0.31,
      "step": 50
    },
    {
      "epoch": 0.12552301255230125,
      "grad_norm": 9.898033994957384,
      "learning_rate": 4.990398100856366e-07,
      "logits/chosen": -2.543462038040161,
      "logits/rejected": -2.50410795211792,
      "logps/chosen": -268.82574462890625,
      "logps/rejected": -262.0675964355469,
      "loss": 0.6502,
      "rewards/accuracies": 0.6875,
      "rewards/chosen": -0.010042434558272362,
      "rewards/margins": 0.14874781668186188,
      "rewards/rejected": -0.15879027545452118,
      "step": 60
    },
    {
      "epoch": 0.14644351464435146,
      "grad_norm": 9.743629526270734,
      "learning_rate": 4.967775735898179e-07,
      "logits/chosen": -2.6071839332580566,
      "logits/rejected": -2.5465664863586426,
      "logps/chosen": -304.88092041015625,
      "logps/rejected": -310.2099609375,
      "loss": 0.6315,
      "rewards/accuracies": 0.6625000238418579,
      "rewards/chosen": -0.198007270693779,
      "rewards/margins": 0.2037646323442459,
      "rewards/rejected": -0.4017719328403473,
      "step": 70
    },
    {
      "epoch": 0.16736401673640167,
      "grad_norm": 15.248138814448389,
      "learning_rate": 4.931986719649298e-07,
      "logits/chosen": -1.8189195394515991,
      "logits/rejected": -1.8181097507476807,
      "logps/chosen": -303.1712951660156,
      "logps/rejected": -337.1569519042969,
      "loss": 0.6003,
      "rewards/accuracies": 0.612500011920929,
      "rewards/chosen": -0.4402276873588562,
      "rewards/margins": 0.30723443627357483,
      "rewards/rejected": -0.7474621534347534,
      "step": 80
    },
    {
      "epoch": 0.18828451882845187,
      "grad_norm": 14.823998474977419,
      "learning_rate": 4.883222001996351e-07,
      "logits/chosen": -0.9793803095817566,
      "logits/rejected": -0.8350278735160828,
      "logps/chosen": -324.31463623046875,
      "logps/rejected": -369.8716735839844,
      "loss": 0.5849,
      "rewards/accuracies": 0.65625,
      "rewards/chosen": -0.5321702361106873,
      "rewards/margins": 0.5584505796432495,
      "rewards/rejected": -1.090620756149292,
      "step": 90
    },
    {
      "epoch": 0.20920502092050208,
      "grad_norm": 22.816637306761073,
      "learning_rate": 4.821741763807186e-07,
      "logits/chosen": -0.4131926894187927,
      "logits/rejected": 0.20026779174804688,
      "logps/chosen": -381.84002685546875,
      "logps/rejected": -387.60943603515625,
      "loss": 0.5614,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": -0.8785870671272278,
      "rewards/margins": 0.7347825765609741,
      "rewards/rejected": -1.6133695840835571,
      "step": 100
    },
    {
      "epoch": 0.20920502092050208,
      "eval_logits/chosen": -0.5290488004684448,
      "eval_logits/rejected": -0.22265684604644775,
      "eval_logps/chosen": -343.2120361328125,
      "eval_logps/rejected": -421.86474609375,
      "eval_loss": 0.5806925892829895,
      "eval_rewards/accuracies": 0.7109375,
      "eval_rewards/chosen": -0.8058211803436279,
      "eval_rewards/margins": 0.7862052321434021,
      "eval_rewards/rejected": -1.5920264720916748,
      "eval_runtime": 104.2044,
      "eval_samples_per_second": 19.193,
      "eval_steps_per_second": 0.307,
      "step": 100
    },
    {
      "epoch": 0.2301255230125523,
      "grad_norm": 19.02164952454875,
      "learning_rate": 4.747874028753375e-07,
      "logits/chosen": -0.43088096380233765,
      "logits/rejected": -0.01379423774778843,
      "logps/chosen": -391.91552734375,
      "logps/rejected": -446.66961669921875,
      "loss": 0.5589,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": -0.9717921018600464,
      "rewards/margins": 0.7386372089385986,
      "rewards/rejected": -1.7104294300079346,
      "step": 110
    },
    {
      "epoch": 0.2510460251046025,
      "grad_norm": 23.262987016748436,
      "learning_rate": 4.662012913161997e-07,
      "logits/chosen": 0.030251333490014076,
      "logits/rejected": 0.5855879187583923,
      "logps/chosen": -378.17999267578125,
      "logps/rejected": -441.85406494140625,
      "loss": 0.5556,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": -1.0160324573516846,
      "rewards/margins": 1.018049955368042,
      "rewards/rejected": -2.0340826511383057,
      "step": 120
    },
    {
      "epoch": 0.2719665271966527,
      "grad_norm": 21.032912122052533,
      "learning_rate": 4.5646165232345103e-07,
      "logits/chosen": -0.32387033104896545,
      "logits/rejected": 0.246691033244133,
      "logps/chosen": -401.41961669921875,
      "logps/rejected": -456.65216064453125,
      "loss": 0.542,
      "rewards/accuracies": 0.6937500238418579,
      "rewards/chosen": -1.039526343345642,
      "rewards/margins": 0.8820648193359375,
      "rewards/rejected": -1.9215911626815796,
      "step": 130
    },
    {
      "epoch": 0.2928870292887029,
      "grad_norm": 18.49329811567727,
      "learning_rate": 4.456204510851956e-07,
      "logits/chosen": -0.7103713154792786,
      "logits/rejected": -0.07352075725793839,
      "logps/chosen": -398.73785400390625,
      "logps/rejected": -456.5108337402344,
      "loss": 0.5449,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": -0.9843541979789734,
      "rewards/margins": 0.8879961967468262,
      "rewards/rejected": -1.8723503351211548,
      "step": 140
    },
    {
      "epoch": 0.3138075313807531,
      "grad_norm": 22.991798597203633,
      "learning_rate": 4.337355301007335e-07,
      "logits/chosen": -0.8071734309196472,
      "logits/rejected": 0.08783279359340668,
      "logps/chosen": -380.7442321777344,
      "logps/rejected": -434.64959716796875,
      "loss": 0.5419,
      "rewards/accuracies": 0.6937500238418579,
      "rewards/chosen": -0.8969854116439819,
      "rewards/margins": 0.7560001611709595,
      "rewards/rejected": -1.6529855728149414,
      "step": 150
    },
    {
      "epoch": 0.3138075313807531,
      "eval_logits/chosen": -0.0013768002390861511,
      "eval_logits/rejected": 0.6415377259254456,
      "eval_logps/chosen": -367.39569091796875,
      "eval_logps/rejected": -469.2164611816406,
      "eval_loss": 0.5584754347801208,
      "eval_rewards/accuracies": 0.74609375,
      "eval_rewards/chosen": -1.0476573705673218,
      "eval_rewards/margins": 1.0178861618041992,
      "eval_rewards/rejected": -2.0655436515808105,
      "eval_runtime": 102.1901,
      "eval_samples_per_second": 19.571,
      "eval_steps_per_second": 0.313,
      "step": 150
    },
    {
      "epoch": 0.33472803347280333,
      "grad_norm": 17.28842262569855,
      "learning_rate": 4.2087030056579986e-07,
      "logits/chosen": 0.42611104249954224,
      "logits/rejected": 1.240443468093872,
      "logps/chosen": -371.35699462890625,
      "logps/rejected": -441.8954162597656,
      "loss": 0.5371,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": -1.0083401203155518,
      "rewards/margins": 1.0140050649642944,
      "rewards/rejected": -2.0223450660705566,
      "step": 160
    },
    {
      "epoch": 0.35564853556485354,
      "grad_norm": 23.711859022455013,
      "learning_rate": 4.070934040463998e-07,
      "logits/chosen": 0.9712627530097961,
      "logits/rejected": 1.8879001140594482,
      "logps/chosen": -393.61029052734375,
      "logps/rejected": -474.1710510253906,
      "loss": 0.5201,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": -1.2388116121292114,
      "rewards/margins": 0.9170275926589966,
      "rewards/rejected": -2.155838966369629,
      "step": 170
    },
    {
      "epoch": 0.37656903765690375,
      "grad_norm": 20.4986556938231,
      "learning_rate": 3.9247834624635404e-07,
      "logits/chosen": 1.3909448385238647,
      "logits/rejected": 2.293593406677246,
      "logps/chosen": -461.04046630859375,
      "logps/rejected": -531.3743286132812,
      "loss": 0.534,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": -1.5673408508300781,
      "rewards/margins": 1.0537548065185547,
      "rewards/rejected": -2.621096134185791,
      "step": 180
    },
    {
      "epoch": 0.39748953974895396,
      "grad_norm": 29.28443173248426,
      "learning_rate": 3.7710310482256523e-07,
      "logits/chosen": 0.8631278276443481,
      "logits/rejected": 1.6216942071914673,
      "logps/chosen": -393.5219421386719,
      "logps/rejected": -465.5166015625,
      "loss": 0.5181,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": -1.397423505783081,
      "rewards/margins": 0.8221076726913452,
      "rewards/rejected": -2.2195310592651367,
      "step": 190
    },
    {
      "epoch": 0.41841004184100417,
      "grad_norm": 24.61839368474693,
      "learning_rate": 3.610497133404795e-07,
      "logits/chosen": 1.1675374507904053,
      "logits/rejected": 2.012094020843506,
      "logps/chosen": -408.1467590332031,
      "logps/rejected": -462.46875,
      "loss": 0.526,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": -1.4781768321990967,
      "rewards/margins": 0.8430646657943726,
      "rewards/rejected": -2.3212413787841797,
      "step": 200
    },
    {
      "epoch": 0.41841004184100417,
      "eval_logits/chosen": 0.9801958799362183,
      "eval_logits/rejected": 1.7427237033843994,
      "eval_logps/chosen": -402.51995849609375,
      "eval_logps/rejected": -517.015625,
      "eval_loss": 0.5562114715576172,
      "eval_rewards/accuracies": 0.76171875,
      "eval_rewards/chosen": -1.3989005088806152,
      "eval_rewards/margins": 1.1446349620819092,
      "eval_rewards/rejected": -2.5435354709625244,
      "eval_runtime": 106.3821,
      "eval_samples_per_second": 18.8,
      "eval_steps_per_second": 0.301,
      "step": 200
    },
    {
      "epoch": 0.4393305439330544,
      "grad_norm": 24.016781344152104,
      "learning_rate": 3.4440382358952115e-07,
      "logits/chosen": 1.11398446559906,
      "logits/rejected": 2.043394088745117,
      "logps/chosen": -427.09759521484375,
      "logps/rejected": -499.5668029785156,
      "loss": 0.5271,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": -1.5520648956298828,
      "rewards/margins": 0.8630266189575195,
      "rewards/rejected": -2.4150915145874023,
      "step": 210
    },
    {
      "epoch": 0.4602510460251046,
      "grad_norm": 19.623224068729442,
      "learning_rate": 3.272542485937368e-07,
      "logits/chosen": 0.09422092139720917,
      "logits/rejected": 0.9946798086166382,
      "logps/chosen": -387.312255859375,
      "logps/rejected": -478.73699951171875,
      "loss": 0.5347,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -1.0305876731872559,
      "rewards/margins": 1.044886827468872,
      "rewards/rejected": -2.075474262237549,
      "step": 220
    },
    {
      "epoch": 0.4811715481171548,
      "grad_norm": 20.279694061078374,
      "learning_rate": 3.096924887558854e-07,
      "logits/chosen": 0.5403207540512085,
      "logits/rejected": 1.6011781692504883,
      "logps/chosen": -414.5859375,
      "logps/rejected": -522.0372314453125,
      "loss": 0.4963,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -1.325188398361206,
      "rewards/margins": 1.3052270412445068,
      "rewards/rejected": -2.630415439605713,
      "step": 230
    },
    {
      "epoch": 0.502092050209205,
      "grad_norm": 17.600900176176182,
      "learning_rate": 2.9181224366319943e-07,
      "logits/chosen": 1.2316529750823975,
      "logits/rejected": 2.2336225509643555,
      "logps/chosen": -405.16815185546875,
      "logps/rejected": -461.0508728027344,
      "loss": 0.5208,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": -1.505174994468689,
      "rewards/margins": 0.9252998232841492,
      "rewards/rejected": -2.4304747581481934,
      "step": 240
    },
    {
      "epoch": 0.5230125523012552,
      "grad_norm": 21.388657517716165,
      "learning_rate": 2.7370891215954565e-07,
      "logits/chosen": 0.33056747913360596,
      "logits/rejected": 1.398674726486206,
      "logps/chosen": -425.12738037109375,
      "logps/rejected": -477.9986267089844,
      "loss": 0.5202,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": -1.3082549571990967,
      "rewards/margins": 1.007277011871338,
      "rewards/rejected": -2.3155319690704346,
      "step": 250
    },
    {
      "epoch": 0.5230125523012552,
      "eval_logits/chosen": 0.6489181518554688,
      "eval_logits/rejected": 1.4379549026489258,
      "eval_logps/chosen": -376.8782653808594,
      "eval_logps/rejected": -495.4537353515625,
      "eval_loss": 0.5419028401374817,
      "eval_rewards/accuracies": 0.7890625,
      "eval_rewards/chosen": -1.142483115196228,
      "eval_rewards/margins": 1.185433030128479,
      "eval_rewards/rejected": -2.327916383743286,
      "eval_runtime": 104.2974,
      "eval_samples_per_second": 19.176,
      "eval_steps_per_second": 0.307,
      "step": 250
    },
    {
      "epoch": 0.5439330543933054,
      "grad_norm": 23.92722986686011,
      "learning_rate": 2.55479083351317e-07,
      "logits/chosen": 1.0843138694763184,
      "logits/rejected": 2.0778467655181885,
      "logps/chosen": -430.15911865234375,
      "logps/rejected": -504.9864807128906,
      "loss": 0.5125,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": -1.1564133167266846,
      "rewards/margins": 1.2437217235565186,
      "rewards/rejected": -2.4001352787017822,
      "step": 260
    },
    {
      "epoch": 0.5648535564853556,
      "grad_norm": 21.06560694930431,
      "learning_rate": 2.3722002126275822e-07,
      "logits/chosen": 1.3407487869262695,
      "logits/rejected": 2.4112818241119385,
      "logps/chosen": -432.30023193359375,
      "logps/rejected": -504.80670166015625,
      "loss": 0.5096,
      "rewards/accuracies": 0.7437499761581421,
      "rewards/chosen": -1.3888903856277466,
      "rewards/margins": 1.141367793083191,
      "rewards/rejected": -2.5302579402923584,
      "step": 270
    },
    {
      "epoch": 0.5857740585774058,
      "grad_norm": 21.8560737042057,
      "learning_rate": 2.19029145890313e-07,
      "logits/chosen": 1.2237210273742676,
      "logits/rejected": 2.0341880321502686,
      "logps/chosen": -389.9634704589844,
      "logps/rejected": -492.298828125,
      "loss": 0.5114,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": -1.3260904550552368,
      "rewards/margins": 1.0742130279541016,
      "rewards/rejected": -2.400303363800049,
      "step": 280
    },
    {
      "epoch": 0.606694560669456,
      "grad_norm": 23.993372955897243,
      "learning_rate": 2.0100351342479216e-07,
      "logits/chosen": 1.5786670446395874,
      "logits/rejected": 2.3115456104278564,
      "logps/chosen": -447.48382568359375,
      "logps/rejected": -546.939697265625,
      "loss": 0.513,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": -1.5614277124404907,
      "rewards/margins": 1.1185188293457031,
      "rewards/rejected": -2.6799466609954834,
      "step": 290
    },
    {
      "epoch": 0.6276150627615062,
      "grad_norm": 22.907667503889392,
      "learning_rate": 1.8323929841460178e-07,
      "logits/chosen": 2.0128085613250732,
      "logits/rejected": 2.6608686447143555,
      "logps/chosen": -434.07830810546875,
      "logps/rejected": -551.2879028320312,
      "loss": 0.5054,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": -1.6475236415863037,
      "rewards/margins": 1.0819259881973267,
      "rewards/rejected": -2.72944974899292,
      "step": 300
    },
    {
      "epoch": 0.6276150627615062,
      "eval_logits/chosen": 1.477131962776184,
      "eval_logits/rejected": 2.255997657775879,
      "eval_logps/chosen": -402.4423522949219,
      "eval_logps/rejected": -531.4893798828125,
      "eval_loss": 0.5449927449226379,
      "eval_rewards/accuracies": 0.77734375,
      "eval_rewards/chosen": -1.3981244564056396,
      "eval_rewards/margins": 1.290148377418518,
      "eval_rewards/rejected": -2.688272714614868,
      "eval_runtime": 103.6153,
      "eval_samples_per_second": 19.302,
      "eval_steps_per_second": 0.309,
      "step": 300
    },
    {
      "epoch": 0.6485355648535565,
      "grad_norm": 20.97030138169365,
      "learning_rate": 1.6583128063291573e-07,
      "logits/chosen": 1.2844688892364502,
      "logits/rejected": 2.06947660446167,
      "logps/chosen": -437.86358642578125,
      "logps/rejected": -538.4571533203125,
      "loss": 0.5145,
      "rewards/accuracies": 0.71875,
      "rewards/chosen": -1.4390218257904053,
      "rewards/margins": 1.1747257709503174,
      "rewards/rejected": -2.6137475967407227,
      "step": 310
    },
    {
      "epoch": 0.6694560669456067,
      "grad_norm": 23.361486108617427,
      "learning_rate": 1.488723393865766e-07,
      "logits/chosen": 1.0614537000656128,
      "logits/rejected": 2.0729637145996094,
      "logps/chosen": -453.379150390625,
      "logps/rejected": -521.8544311523438,
      "loss": 0.4923,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": -1.6114938259124756,
      "rewards/margins": 1.0873934030532837,
      "rewards/rejected": -2.6988871097564697,
      "step": 320
    },
    {
      "epoch": 0.6903765690376569,
      "grad_norm": 24.156654265698716,
      "learning_rate": 1.3245295796480788e-07,
      "logits/chosen": 1.4917339086532593,
      "logits/rejected": 2.5346505641937256,
      "logps/chosen": -463.26202392578125,
      "logps/rejected": -520.6095581054688,
      "loss": 0.5059,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": -1.7963454723358154,
      "rewards/margins": 0.9912670254707336,
      "rewards/rejected": -2.7876124382019043,
      "step": 330
    },
    {
      "epoch": 0.7112970711297071,
      "grad_norm": 19.422370738594758,
      "learning_rate": 1.1666074087171627e-07,
      "logits/chosen": 1.7316112518310547,
      "logits/rejected": 2.561483144760132,
      "logps/chosen": -413.0133361816406,
      "logps/rejected": -536.4874877929688,
      "loss": 0.5184,
      "rewards/accuracies": 0.7437499761581421,
      "rewards/chosen": -1.6657421588897705,
      "rewards/margins": 1.103548288345337,
      "rewards/rejected": -2.7692904472351074,
      "step": 340
    },
    {
      "epoch": 0.7322175732217573,
      "grad_norm": 19.236548541895125,
      "learning_rate": 1.0157994641835734e-07,
      "logits/chosen": 1.52297842502594,
      "logits/rejected": 2.436295986175537,
      "logps/chosen": -450.27239990234375,
      "logps/rejected": -541.5868530273438,
      "loss": 0.497,
      "rewards/accuracies": 0.6875,
      "rewards/chosen": -1.680456519126892,
      "rewards/margins": 1.0492502450942993,
      "rewards/rejected": -2.7297065258026123,
      "step": 350
    },
    {
      "epoch": 0.7322175732217573,
      "eval_logits/chosen": 1.3703731298446655,
      "eval_logits/rejected": 2.225933074951172,
      "eval_logps/chosen": -422.6754455566406,
      "eval_logps/rejected": -549.4119873046875,
      "eval_loss": 0.5301805138587952,
      "eval_rewards/accuracies": 0.7734375,
      "eval_rewards/chosen": -1.6004550457000732,
      "eval_rewards/margins": 1.2670434713363647,
      "eval_rewards/rejected": -2.8674986362457275,
      "eval_runtime": 102.8183,
      "eval_samples_per_second": 19.452,
      "eval_steps_per_second": 0.311,
      "step": 350
    },
    {
      "epoch": 0.7531380753138075,
      "grad_norm": 19.689504951232507,
      "learning_rate": 8.729103716819111e-08,
      "logits/chosen": 1.695031762123108,
      "logits/rejected": 2.4951744079589844,
      "logps/chosen": -429.60675048828125,
      "logps/rejected": -540.1343383789062,
      "loss": 0.5127,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": -1.8274829387664795,
      "rewards/margins": 1.1173667907714844,
      "rewards/rejected": -2.9448494911193848,
      "step": 360
    },
    {
      "epoch": 0.7740585774058577,
      "grad_norm": 23.998324017815545,
      "learning_rate": 7.387025063449081e-08,
      "logits/chosen": 1.3235762119293213,
      "logits/rejected": 2.6002328395843506,
      "logps/chosen": -454.5791015625,
      "logps/rejected": -572.55078125,
      "loss": 0.495,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": -1.667346715927124,
      "rewards/margins": 1.3339135646820068,
      "rewards/rejected": -3.001260280609131,
      "step": 370
    },
    {
      "epoch": 0.7949790794979079,
      "grad_norm": 20.911379608221466,
      "learning_rate": 6.138919252022435e-08,
      "logits/chosen": 1.435396432876587,
      "logits/rejected": 2.2644729614257812,
      "logps/chosen": -481.06378173828125,
      "logps/rejected": -542.4463500976562,
      "loss": 0.5119,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": -1.6459989547729492,
      "rewards/margins": 0.9545661211013794,
      "rewards/rejected": -2.600564956665039,
      "step": 380
    },
    {
      "epoch": 0.8158995815899581,
      "grad_norm": 33.66392923940902,
      "learning_rate": 4.991445467064689e-08,
      "logits/chosen": 1.5889087915420532,
      "logits/rejected": 2.5967953205108643,
      "logps/chosen": -469.99884033203125,
      "logps/rejected": -584.1378784179688,
      "loss": 0.5084,
      "rewards/accuracies": 0.7437499761581421,
      "rewards/chosen": -1.6343872547149658,
      "rewards/margins": 1.305241346359253,
      "rewards/rejected": -2.9396286010742188,
      "step": 390
    },
    {
      "epoch": 0.8368200836820083,
      "grad_norm": 21.820601306046125,
      "learning_rate": 3.9507259776993954e-08,
      "logits/chosen": 1.4544366598129272,
      "logits/rejected": 2.2073497772216797,
      "logps/chosen": -470.62188720703125,
      "logps/rejected": -558.2474365234375,
      "loss": 0.5076,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": -1.766871452331543,
      "rewards/margins": 1.029807209968567,
      "rewards/rejected": -2.796678304672241,
      "step": 400
    },
    {
      "epoch": 0.8368200836820083,
      "eval_logits/chosen": 1.3332302570343018,
      "eval_logits/rejected": 2.278512477874756,
      "eval_logps/chosen": -423.95953369140625,
      "eval_logps/rejected": -558.9130859375,
      "eval_loss": 0.5348207950592041,
      "eval_rewards/accuracies": 0.7890625,
      "eval_rewards/chosen": -1.6132957935333252,
      "eval_rewards/margins": 1.3492140769958496,
      "eval_rewards/rejected": -2.962510108947754,
      "eval_runtime": 105.1303,
      "eval_samples_per_second": 19.024,
      "eval_steps_per_second": 0.304,
      "step": 400
    },
    {
      "epoch": 0.8577405857740585,
      "grad_norm": 23.0331791932855,
      "learning_rate": 3.022313472693447e-08,
      "logits/chosen": 1.0064998865127563,
      "logits/rejected": 1.9991016387939453,
      "logps/chosen": -479.11846923828125,
      "logps/rejected": -583.687255859375,
      "loss": 0.4977,
      "rewards/accuracies": 0.793749988079071,
      "rewards/chosen": -1.6211599111557007,
      "rewards/margins": 1.2330563068389893,
      "rewards/rejected": -2.8542163372039795,
      "step": 410
    },
    {
      "epoch": 0.8786610878661087,
      "grad_norm": 25.09829796115416,
      "learning_rate": 2.2111614344599684e-08,
      "logits/chosen": 1.3670974969863892,
      "logits/rejected": 2.6799685955047607,
      "logps/chosen": -465.97674560546875,
      "logps/rejected": -562.8436279296875,
      "loss": 0.4821,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": -1.6978868246078491,
      "rewards/margins": 1.2206120491027832,
      "rewards/rejected": -2.9184985160827637,
      "step": 420
    },
    {
      "epoch": 0.899581589958159,
      "grad_norm": 21.301017285621448,
      "learning_rate": 1.521597710086439e-08,
      "logits/chosen": 1.232881784439087,
      "logits/rejected": 2.3901188373565674,
      "logps/chosen": -466.0107421875,
      "logps/rejected": -540.5730590820312,
      "loss": 0.5114,
      "rewards/accuracies": 0.71875,
      "rewards/chosen": -1.70029616355896,
      "rewards/margins": 1.1120824813842773,
      "rewards/rejected": -2.8123791217803955,
      "step": 430
    },
    {
      "epoch": 0.9205020920502092,
      "grad_norm": 17.114528155184686,
      "learning_rate": 9.57301420397924e-09,
      "logits/chosen": 1.5404767990112305,
      "logits/rejected": 2.9349493980407715,
      "logps/chosen": -467.8946228027344,
      "logps/rejected": -551.1821899414062,
      "loss": 0.5065,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": -1.856208086013794,
      "rewards/margins": 1.1952415704727173,
      "rewards/rejected": -3.051449775695801,
      "step": 440
    },
    {
      "epoch": 0.9414225941422594,
      "grad_norm": 19.50832169815014,
      "learning_rate": 5.212833302556258e-09,
      "logits/chosen": 1.3590748310089111,
      "logits/rejected": 2.3424363136291504,
      "logps/chosen": -451.4662170410156,
      "logps/rejected": -575.4518432617188,
      "loss": 0.5092,
      "rewards/accuracies": 0.7749999761581421,
      "rewards/chosen": -1.667201042175293,
      "rewards/margins": 1.312021017074585,
      "rewards/rejected": -2.979222059249878,
      "step": 450
    },
    {
      "epoch": 0.9414225941422594,
      "eval_logits/chosen": 1.3857550621032715,
      "eval_logits/rejected": 2.3444478511810303,
      "eval_logps/chosen": -429.63800048828125,
      "eval_logps/rejected": -565.6296997070312,
      "eval_loss": 0.5340853333473206,
      "eval_rewards/accuracies": 0.78515625,
      "eval_rewards/chosen": -1.6700804233551025,
      "eval_rewards/margins": 1.3595958948135376,
      "eval_rewards/rejected": -3.0296761989593506,
      "eval_runtime": 102.8547,
      "eval_samples_per_second": 19.445,
      "eval_steps_per_second": 0.311,
      "step": 450
    },
    {
      "epoch": 0.9623430962343096,
      "grad_norm": 18.50816041515539,
      "learning_rate": 2.158697848236607e-09,
      "logits/chosen": 1.729901909828186,
      "logits/rejected": 2.7589685916900635,
      "logps/chosen": -457.56207275390625,
      "logps/rejected": -565.1722412109375,
      "loss": 0.4978,
      "rewards/accuracies": 0.706250011920929,
      "rewards/chosen": -1.942845106124878,
      "rewards/margins": 1.1519941091537476,
      "rewards/rejected": -3.094839334487915,
      "step": 460
    },
    {
      "epoch": 0.9832635983263598,
      "grad_norm": 18.70014449283038,
      "learning_rate": 4.269029751107489e-10,
      "logits/chosen": 1.6632955074310303,
      "logits/rejected": 2.187119960784912,
      "logps/chosen": -438.1412048339844,
      "logps/rejected": -556.7476806640625,
      "loss": 0.5241,
      "rewards/accuracies": 0.71875,
      "rewards/chosen": -1.7501564025878906,
      "rewards/margins": 1.1538572311401367,
      "rewards/rejected": -2.9040138721466064,
      "step": 470
    },
    {
      "epoch": 1.0,
      "step": 478,
      "total_flos": 0.0,
      "train_loss": 0.5431942655451627,
      "train_runtime": 12655.0807,
      "train_samples_per_second": 4.831,
      "train_steps_per_second": 0.038
    }
  ],
  "logging_steps": 10,
  "max_steps": 478,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}