zephyr-7b-sft-math_code / trainer_state.json
dlibf's picture
Model save
ffb2306 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1134,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 1.7543859649122808e-07,
"loss": 1.1293,
"step": 1
},
{
"epoch": 0.0,
"learning_rate": 8.771929824561404e-07,
"loss": 1.1128,
"step": 5
},
{
"epoch": 0.01,
"learning_rate": 1.7543859649122807e-06,
"loss": 1.05,
"step": 10
},
{
"epoch": 0.01,
"learning_rate": 2.631578947368421e-06,
"loss": 1.0162,
"step": 15
},
{
"epoch": 0.02,
"learning_rate": 3.5087719298245615e-06,
"loss": 0.9927,
"step": 20
},
{
"epoch": 0.02,
"learning_rate": 4.385964912280702e-06,
"loss": 0.9693,
"step": 25
},
{
"epoch": 0.03,
"learning_rate": 5.263157894736842e-06,
"loss": 0.9767,
"step": 30
},
{
"epoch": 0.03,
"learning_rate": 6.140350877192983e-06,
"loss": 0.9709,
"step": 35
},
{
"epoch": 0.04,
"learning_rate": 7.017543859649123e-06,
"loss": 0.9477,
"step": 40
},
{
"epoch": 0.04,
"learning_rate": 7.894736842105265e-06,
"loss": 0.9612,
"step": 45
},
{
"epoch": 0.04,
"learning_rate": 8.771929824561405e-06,
"loss": 0.9665,
"step": 50
},
{
"epoch": 0.05,
"learning_rate": 9.649122807017545e-06,
"loss": 0.9478,
"step": 55
},
{
"epoch": 0.05,
"learning_rate": 1.0526315789473684e-05,
"loss": 0.962,
"step": 60
},
{
"epoch": 0.06,
"learning_rate": 1.1403508771929826e-05,
"loss": 0.941,
"step": 65
},
{
"epoch": 0.06,
"learning_rate": 1.2280701754385966e-05,
"loss": 0.9651,
"step": 70
},
{
"epoch": 0.07,
"learning_rate": 1.3157894736842108e-05,
"loss": 0.9546,
"step": 75
},
{
"epoch": 0.07,
"learning_rate": 1.4035087719298246e-05,
"loss": 0.9602,
"step": 80
},
{
"epoch": 0.07,
"learning_rate": 1.4912280701754388e-05,
"loss": 0.9564,
"step": 85
},
{
"epoch": 0.08,
"learning_rate": 1.578947368421053e-05,
"loss": 0.9636,
"step": 90
},
{
"epoch": 0.08,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.9915,
"step": 95
},
{
"epoch": 0.09,
"learning_rate": 1.754385964912281e-05,
"loss": 0.9807,
"step": 100
},
{
"epoch": 0.09,
"learning_rate": 1.8421052631578947e-05,
"loss": 0.9863,
"step": 105
},
{
"epoch": 0.1,
"learning_rate": 1.929824561403509e-05,
"loss": 1.0003,
"step": 110
},
{
"epoch": 0.1,
"learning_rate": 1.9999952568259327e-05,
"loss": 0.9932,
"step": 115
},
{
"epoch": 0.11,
"learning_rate": 1.9998292504580528e-05,
"loss": 0.9928,
"step": 120
},
{
"epoch": 0.11,
"learning_rate": 1.9994261303802982e-05,
"loss": 0.9956,
"step": 125
},
{
"epoch": 0.11,
"learning_rate": 1.99878599219429e-05,
"loss": 0.9901,
"step": 130
},
{
"epoch": 0.12,
"learning_rate": 1.9979089877114905e-05,
"loss": 1.001,
"step": 135
},
{
"epoch": 0.12,
"learning_rate": 1.996795324917199e-05,
"loss": 0.9797,
"step": 140
},
{
"epoch": 0.13,
"learning_rate": 1.9954452679212297e-05,
"loss": 0.9936,
"step": 145
},
{
"epoch": 0.13,
"learning_rate": 1.993859136895274e-05,
"loss": 1.0019,
"step": 150
},
{
"epoch": 0.14,
"learning_rate": 1.9920373079969725e-05,
"loss": 0.9875,
"step": 155
},
{
"epoch": 0.14,
"learning_rate": 1.9899802132807073e-05,
"loss": 0.9972,
"step": 160
},
{
"epoch": 0.15,
"learning_rate": 1.9876883405951378e-05,
"loss": 0.9897,
"step": 165
},
{
"epoch": 0.15,
"learning_rate": 1.9851622334675065e-05,
"loss": 1.0089,
"step": 170
},
{
"epoch": 0.15,
"learning_rate": 1.9824024909747383e-05,
"loss": 0.9974,
"step": 175
},
{
"epoch": 0.16,
"learning_rate": 1.979409767601366e-05,
"loss": 0.9756,
"step": 180
},
{
"epoch": 0.16,
"learning_rate": 1.9761847730843195e-05,
"loss": 0.9808,
"step": 185
},
{
"epoch": 0.17,
"learning_rate": 1.972728272244605e-05,
"loss": 0.9849,
"step": 190
},
{
"epoch": 0.17,
"learning_rate": 1.9690410848059278e-05,
"loss": 0.9978,
"step": 195
},
{
"epoch": 0.18,
"learning_rate": 1.965124085200289e-05,
"loss": 0.9947,
"step": 200
},
{
"epoch": 0.18,
"learning_rate": 1.9609782023606124e-05,
"loss": 0.9907,
"step": 205
},
{
"epoch": 0.19,
"learning_rate": 1.956604419500441e-05,
"loss": 0.9924,
"step": 210
},
{
"epoch": 0.19,
"learning_rate": 1.9520037738807674e-05,
"loss": 0.9869,
"step": 215
},
{
"epoch": 0.19,
"learning_rate": 1.9471773565640405e-05,
"loss": 0.9895,
"step": 220
},
{
"epoch": 0.2,
"learning_rate": 1.9421263121554163e-05,
"loss": 0.9917,
"step": 225
},
{
"epoch": 0.2,
"learning_rate": 1.9368518385313108e-05,
"loss": 0.9919,
"step": 230
},
{
"epoch": 0.21,
"learning_rate": 1.9313551865553164e-05,
"loss": 1.0015,
"step": 235
},
{
"epoch": 0.21,
"learning_rate": 1.9256376597815565e-05,
"loss": 0.9698,
"step": 240
},
{
"epoch": 0.22,
"learning_rate": 1.919700614145541e-05,
"loss": 0.9783,
"step": 245
},
{
"epoch": 0.22,
"learning_rate": 1.913545457642601e-05,
"loss": 0.9932,
"step": 250
},
{
"epoch": 0.22,
"learning_rate": 1.9071736499939765e-05,
"loss": 0.988,
"step": 255
},
{
"epoch": 0.23,
"learning_rate": 1.9005867023006374e-05,
"loss": 0.9871,
"step": 260
},
{
"epoch": 0.23,
"learning_rate": 1.89378617668492e-05,
"loss": 0.9673,
"step": 265
},
{
"epoch": 0.24,
"learning_rate": 1.886773685920062e-05,
"loss": 0.9999,
"step": 270
},
{
"epoch": 0.24,
"learning_rate": 1.879550893047728e-05,
"loss": 1.0106,
"step": 275
},
{
"epoch": 0.25,
"learning_rate": 1.872119510983611e-05,
"loss": 0.9705,
"step": 280
},
{
"epoch": 0.25,
"learning_rate": 1.864481302111208e-05,
"loss": 0.9844,
"step": 285
},
{
"epoch": 0.26,
"learning_rate": 1.856638077863863e-05,
"loss": 1.0078,
"step": 290
},
{
"epoch": 0.26,
"learning_rate": 1.8485916982951777e-05,
"loss": 0.9838,
"step": 295
},
{
"epoch": 0.26,
"learning_rate": 1.840344071637893e-05,
"loss": 1.0826,
"step": 300
},
{
"epoch": 0.27,
"learning_rate": 1.831897153851342e-05,
"loss": 1.2311,
"step": 305
},
{
"epoch": 0.27,
"learning_rate": 1.8232529481575874e-05,
"loss": 5.7978,
"step": 310
},
{
"epoch": 0.28,
"learning_rate": 1.8144135045663486e-05,
"loss": 4.4919,
"step": 315
},
{
"epoch": 0.28,
"learning_rate": 1.8053809193888326e-05,
"loss": 4.4859,
"step": 320
},
{
"epoch": 0.29,
"learning_rate": 1.7961573347405864e-05,
"loss": 1.8311,
"step": 325
},
{
"epoch": 0.29,
"learning_rate": 1.7867449380334834e-05,
"loss": 1.4734,
"step": 330
},
{
"epoch": 0.3,
"learning_rate": 1.777145961456971e-05,
"loss": 1.2965,
"step": 335
},
{
"epoch": 0.3,
"learning_rate": 1.767362681448697e-05,
"loss": 1.1761,
"step": 340
},
{
"epoch": 0.3,
"learning_rate": 1.757397418154643e-05,
"loss": 1.1421,
"step": 345
},
{
"epoch": 0.31,
"learning_rate": 1.747252534878891e-05,
"loss": 1.1046,
"step": 350
},
{
"epoch": 0.31,
"learning_rate": 1.736930437523158e-05,
"loss": 1.0928,
"step": 355
},
{
"epoch": 0.32,
"learning_rate": 1.7264335740162244e-05,
"loss": 1.0886,
"step": 360
},
{
"epoch": 0.32,
"learning_rate": 1.7157644337333975e-05,
"loss": 1.066,
"step": 365
},
{
"epoch": 0.33,
"learning_rate": 1.7049255469061476e-05,
"loss": 1.0555,
"step": 370
},
{
"epoch": 0.33,
"learning_rate": 1.6939194840220497e-05,
"loss": 1.0727,
"step": 375
},
{
"epoch": 0.34,
"learning_rate": 1.6827488552151855e-05,
"loss": 1.0532,
"step": 380
},
{
"epoch": 0.34,
"learning_rate": 1.671416309647136e-05,
"loss": 1.027,
"step": 385
},
{
"epoch": 0.34,
"learning_rate": 1.659924534878723e-05,
"loss": 1.0237,
"step": 390
},
{
"epoch": 0.35,
"learning_rate": 1.6482762562326414e-05,
"loss": 0.9972,
"step": 395
},
{
"epoch": 0.35,
"learning_rate": 1.6364742361471416e-05,
"loss": 1.0156,
"step": 400
},
{
"epoch": 0.36,
"learning_rate": 1.6245212735208994e-05,
"loss": 1.0124,
"step": 405
},
{
"epoch": 0.36,
"learning_rate": 1.61242020304925e-05,
"loss": 1.0392,
"step": 410
},
{
"epoch": 0.37,
"learning_rate": 1.6001738945519278e-05,
"loss": 1.0376,
"step": 415
},
{
"epoch": 0.37,
"learning_rate": 1.5877852522924733e-05,
"loss": 1.0139,
"step": 420
},
{
"epoch": 0.37,
"learning_rate": 1.575257214289479e-05,
"loss": 1.0209,
"step": 425
},
{
"epoch": 0.38,
"learning_rate": 1.5625927516198235e-05,
"loss": 1.0046,
"step": 430
},
{
"epoch": 0.38,
"learning_rate": 1.5497948677140673e-05,
"loss": 0.9956,
"step": 435
},
{
"epoch": 0.39,
"learning_rate": 1.5368665976441802e-05,
"loss": 0.9862,
"step": 440
},
{
"epoch": 0.39,
"learning_rate": 1.523811007403757e-05,
"loss": 1.0144,
"step": 445
},
{
"epoch": 0.4,
"learning_rate": 1.510631193180907e-05,
"loss": 1.0134,
"step": 450
},
{
"epoch": 0.4,
"learning_rate": 1.4973302806239796e-05,
"loss": 0.9988,
"step": 455
},
{
"epoch": 0.41,
"learning_rate": 1.4839114241003017e-05,
"loss": 0.993,
"step": 460
},
{
"epoch": 0.41,
"learning_rate": 1.4703778059481096e-05,
"loss": 0.9854,
"step": 465
},
{
"epoch": 0.41,
"learning_rate": 1.4567326357218408e-05,
"loss": 0.9916,
"step": 470
},
{
"epoch": 0.42,
"learning_rate": 1.4429791494309769e-05,
"loss": 0.9933,
"step": 475
},
{
"epoch": 0.42,
"learning_rate": 1.429120608772609e-05,
"loss": 0.9819,
"step": 480
},
{
"epoch": 0.43,
"learning_rate": 1.415160300357914e-05,
"loss": 1.0046,
"step": 485
},
{
"epoch": 0.43,
"learning_rate": 1.4011015349327188e-05,
"loss": 0.9986,
"step": 490
},
{
"epoch": 0.44,
"learning_rate": 1.3869476465923455e-05,
"loss": 0.9993,
"step": 495
},
{
"epoch": 0.44,
"learning_rate": 1.372701991990914e-05,
"loss": 0.9845,
"step": 500
},
{
"epoch": 0.45,
"learning_rate": 1.3583679495453e-05,
"loss": 0.9756,
"step": 505
},
{
"epoch": 0.45,
"learning_rate": 1.3439489186339283e-05,
"loss": 0.9789,
"step": 510
},
{
"epoch": 0.45,
"learning_rate": 1.3294483187905938e-05,
"loss": 0.9969,
"step": 515
},
{
"epoch": 0.46,
"learning_rate": 1.314869588893508e-05,
"loss": 0.9677,
"step": 520
},
{
"epoch": 0.46,
"learning_rate": 1.3002161863497529e-05,
"loss": 0.9821,
"step": 525
},
{
"epoch": 0.47,
"learning_rate": 1.2854915862753424e-05,
"loss": 0.9769,
"step": 530
},
{
"epoch": 0.47,
"learning_rate": 1.2706992806710839e-05,
"loss": 0.982,
"step": 535
},
{
"epoch": 0.48,
"learning_rate": 1.2558427775944357e-05,
"loss": 0.9792,
"step": 540
},
{
"epoch": 0.48,
"learning_rate": 1.2409256003275576e-05,
"loss": 0.9977,
"step": 545
},
{
"epoch": 0.49,
"learning_rate": 1.2259512865417478e-05,
"loss": 0.9833,
"step": 550
},
{
"epoch": 0.49,
"learning_rate": 1.21092338745847e-05,
"loss": 0.9789,
"step": 555
},
{
"epoch": 0.49,
"learning_rate": 1.1958454670071671e-05,
"loss": 0.9838,
"step": 560
},
{
"epoch": 0.5,
"learning_rate": 1.1807211009800592e-05,
"loss": 0.9623,
"step": 565
},
{
"epoch": 0.5,
"learning_rate": 1.16555387618413e-05,
"loss": 0.9786,
"step": 570
},
{
"epoch": 0.51,
"learning_rate": 1.1503473895905009e-05,
"loss": 0.9811,
"step": 575
},
{
"epoch": 0.51,
"learning_rate": 1.135105247481393e-05,
"loss": 0.9707,
"step": 580
},
{
"epoch": 0.52,
"learning_rate": 1.1198310645948833e-05,
"loss": 0.9825,
"step": 585
},
{
"epoch": 0.52,
"learning_rate": 1.1045284632676535e-05,
"loss": 0.943,
"step": 590
},
{
"epoch": 0.52,
"learning_rate": 1.0892010725759384e-05,
"loss": 0.981,
"step": 595
},
{
"epoch": 0.53,
"learning_rate": 1.073852527474874e-05,
"loss": 0.9732,
"step": 600
},
{
"epoch": 0.53,
"learning_rate": 1.0584864679364546e-05,
"loss": 0.952,
"step": 605
},
{
"epoch": 0.54,
"learning_rate": 1.0431065380862959e-05,
"loss": 0.9683,
"step": 610
},
{
"epoch": 0.54,
"learning_rate": 1.0277163853394166e-05,
"loss": 0.9479,
"step": 615
},
{
"epoch": 0.55,
"learning_rate": 1.0123196595352385e-05,
"loss": 0.9826,
"step": 620
},
{
"epoch": 0.55,
"learning_rate": 9.96920012072012e-06,
"loss": 0.9723,
"step": 625
},
{
"epoch": 0.56,
"learning_rate": 9.815210950408703e-06,
"loss": 0.9632,
"step": 630
},
{
"epoch": 0.56,
"learning_rate": 9.661265603597223e-06,
"loss": 0.9551,
"step": 635
},
{
"epoch": 0.56,
"learning_rate": 9.507400589071833e-06,
"loss": 0.9668,
"step": 640
},
{
"epoch": 0.57,
"learning_rate": 9.353652396567558e-06,
"loss": 0.9498,
"step": 645
},
{
"epoch": 0.57,
"learning_rate": 9.200057488114585e-06,
"loss": 0.9597,
"step": 650
},
{
"epoch": 0.58,
"learning_rate": 9.04665228939114e-06,
"loss": 0.9802,
"step": 655
},
{
"epoch": 0.58,
"learning_rate": 8.893473181084993e-06,
"loss": 0.978,
"step": 660
},
{
"epoch": 0.59,
"learning_rate": 8.740556490265621e-06,
"loss": 0.9744,
"step": 665
},
{
"epoch": 0.59,
"learning_rate": 8.58793848176909e-06,
"loss": 0.9602,
"step": 670
},
{
"epoch": 0.6,
"learning_rate": 8.43565534959769e-06,
"loss": 0.9641,
"step": 675
},
{
"epoch": 0.6,
"learning_rate": 8.283743208336403e-06,
"loss": 0.9695,
"step": 680
},
{
"epoch": 0.6,
"learning_rate": 8.13223808458814e-06,
"loss": 0.96,
"step": 685
},
{
"epoch": 0.61,
"learning_rate": 7.9811759084299e-06,
"loss": 0.9601,
"step": 690
},
{
"epoch": 0.61,
"learning_rate": 7.8305925048918e-06,
"loss": 0.9455,
"step": 695
},
{
"epoch": 0.62,
"learning_rate": 7.680523585461021e-06,
"loss": 0.9624,
"step": 700
},
{
"epoch": 0.62,
"learning_rate": 7.531004739612668e-06,
"loss": 0.9691,
"step": 705
},
{
"epoch": 0.63,
"learning_rate": 7.382071426369597e-06,
"loss": 0.9533,
"step": 710
},
{
"epoch": 0.63,
"learning_rate": 7.23375896589313e-06,
"loss": 0.9578,
"step": 715
},
{
"epoch": 0.63,
"learning_rate": 7.086102531106755e-06,
"loss": 0.9647,
"step": 720
},
{
"epoch": 0.64,
"learning_rate": 6.939137139354704e-06,
"loss": 0.9565,
"step": 725
},
{
"epoch": 0.64,
"learning_rate": 6.7928976440974504e-06,
"loss": 0.9504,
"step": 730
},
{
"epoch": 0.65,
"learning_rate": 6.647418726646065e-06,
"loss": 0.9621,
"step": 735
},
{
"epoch": 0.65,
"learning_rate": 6.502734887937389e-06,
"loss": 0.9562,
"step": 740
},
{
"epoch": 0.66,
"learning_rate": 6.3588804403520044e-06,
"loss": 0.9438,
"step": 745
},
{
"epoch": 0.66,
"learning_rate": 6.215889499576898e-06,
"loss": 0.962,
"step": 750
},
{
"epoch": 0.67,
"learning_rate": 6.073795976514785e-06,
"loss": 0.9606,
"step": 755
},
{
"epoch": 0.67,
"learning_rate": 5.932633569242e-06,
"loss": 0.9284,
"step": 760
},
{
"epoch": 0.67,
"learning_rate": 5.7924357550168534e-06,
"loss": 0.9583,
"step": 765
},
{
"epoch": 0.68,
"learning_rate": 5.653235782340351e-06,
"loss": 0.9583,
"step": 770
},
{
"epoch": 0.68,
"learning_rate": 5.515066663071199e-06,
"loss": 0.9197,
"step": 775
},
{
"epoch": 0.69,
"learning_rate": 5.3779611645968696e-06,
"loss": 0.9554,
"step": 780
},
{
"epoch": 0.69,
"learning_rate": 5.241951802062696e-06,
"loss": 0.9463,
"step": 785
},
{
"epoch": 0.7,
"learning_rate": 5.107070830660765e-06,
"loss": 0.9402,
"step": 790
},
{
"epoch": 0.7,
"learning_rate": 4.973350237980466e-06,
"loss": 0.951,
"step": 795
},
{
"epoch": 0.71,
"learning_rate": 4.8408217364224886e-06,
"loss": 0.9491,
"step": 800
},
{
"epoch": 0.71,
"learning_rate": 4.709516755678113e-06,
"loss": 0.9496,
"step": 805
},
{
"epoch": 0.71,
"learning_rate": 4.579466435275506e-06,
"loss": 0.9454,
"step": 810
},
{
"epoch": 0.72,
"learning_rate": 4.450701617194864e-06,
"loss": 0.9669,
"step": 815
},
{
"epoch": 0.72,
"learning_rate": 4.323252838554099e-06,
"loss": 0.9583,
"step": 820
},
{
"epoch": 0.73,
"learning_rate": 4.197150324366844e-06,
"loss": 0.9417,
"step": 825
},
{
"epoch": 0.73,
"learning_rate": 4.0724239803744524e-06,
"loss": 0.9574,
"step": 830
},
{
"epoch": 0.74,
"learning_rate": 3.949103385953732e-06,
"loss": 0.97,
"step": 835
},
{
"epoch": 0.74,
"learning_rate": 3.827217787102072e-06,
"loss": 0.9531,
"step": 840
},
{
"epoch": 0.75,
"learning_rate": 3.7067960895016277e-06,
"loss": 0.9292,
"step": 845
},
{
"epoch": 0.75,
"learning_rate": 3.587866851664219e-06,
"loss": 0.9357,
"step": 850
},
{
"epoch": 0.75,
"learning_rate": 3.4704582781585596e-06,
"loss": 0.9451,
"step": 855
},
{
"epoch": 0.76,
"learning_rate": 3.3545982129214227e-06,
"loss": 0.9502,
"step": 860
},
{
"epoch": 0.76,
"learning_rate": 3.2403141326543365e-06,
"loss": 0.9528,
"step": 865
},
{
"epoch": 0.77,
"learning_rate": 3.1276331403073733e-06,
"loss": 0.9545,
"step": 870
},
{
"epoch": 0.77,
"learning_rate": 3.016581958651559e-06,
"loss": 0.9563,
"step": 875
},
{
"epoch": 0.78,
"learning_rate": 2.907186923941466e-06,
"loss": 0.9422,
"step": 880
},
{
"epoch": 0.78,
"learning_rate": 2.799473979669456e-06,
"loss": 0.9539,
"step": 885
},
{
"epoch": 0.78,
"learning_rate": 2.6934686704130698e-06,
"loss": 0.9589,
"step": 890
},
{
"epoch": 0.79,
"learning_rate": 2.5891961357770267e-06,
"loss": 0.9447,
"step": 895
},
{
"epoch": 0.79,
"learning_rate": 2.4866811044312667e-06,
"loss": 0.943,
"step": 900
},
{
"epoch": 0.8,
"learning_rate": 2.3859478882464273e-06,
"loss": 0.9417,
"step": 905
},
{
"epoch": 0.8,
"learning_rate": 2.287020376528193e-06,
"loss": 0.9289,
"step": 910
},
{
"epoch": 0.81,
"learning_rate": 2.1899220303518465e-06,
"loss": 0.9407,
"step": 915
},
{
"epoch": 0.81,
"learning_rate": 2.0946758769983666e-06,
"loss": 0.9457,
"step": 920
},
{
"epoch": 0.82,
"learning_rate": 2.0013045044934243e-06,
"loss": 0.9301,
"step": 925
},
{
"epoch": 0.82,
"learning_rate": 1.9098300562505266e-06,
"loss": 0.9405,
"step": 930
},
{
"epoch": 0.82,
"learning_rate": 1.8202742258196183e-06,
"loss": 0.9345,
"step": 935
},
{
"epoch": 0.83,
"learning_rate": 1.7326582517423662e-06,
"loss": 0.9381,
"step": 940
},
{
"epoch": 0.83,
"learning_rate": 1.6470029125153463e-06,
"loss": 0.9382,
"step": 945
},
{
"epoch": 0.84,
"learning_rate": 1.5633285216623384e-06,
"loss": 0.9548,
"step": 950
},
{
"epoch": 0.84,
"learning_rate": 1.4816549229168864e-06,
"loss": 0.9384,
"step": 955
},
{
"epoch": 0.85,
"learning_rate": 1.4020014855162755e-06,
"loss": 0.9395,
"step": 960
},
{
"epoch": 0.85,
"learning_rate": 1.32438709960804e-06,
"loss": 0.9368,
"step": 965
},
{
"epoch": 0.86,
"learning_rate": 1.2488301717700735e-06,
"loss": 0.9434,
"step": 970
},
{
"epoch": 0.86,
"learning_rate": 1.1753486206454433e-06,
"loss": 0.9372,
"step": 975
},
{
"epoch": 0.86,
"learning_rate": 1.1039598726929046e-06,
"loss": 0.9348,
"step": 980
},
{
"epoch": 0.87,
"learning_rate": 1.0346808580541411e-06,
"loss": 0.9426,
"step": 985
},
{
"epoch": 0.87,
"learning_rate": 9.675280065387117e-07,
"loss": 0.9423,
"step": 990
},
{
"epoch": 0.88,
"learning_rate": 9.02517243727653e-07,
"loss": 0.9335,
"step": 995
},
{
"epoch": 0.88,
"learning_rate": 8.39663987196665e-07,
"loss": 0.9263,
"step": 1000
},
{
"epoch": 0.89,
"learning_rate": 7.78983142859755e-07,
"loss": 0.9433,
"step": 1005
},
{
"epoch": 0.89,
"learning_rate": 7.204891014342552e-07,
"loss": 0.9473,
"step": 1010
},
{
"epoch": 0.9,
"learning_rate": 6.641957350279838e-07,
"loss": 0.9459,
"step": 1015
},
{
"epoch": 0.9,
"learning_rate": 6.101163938494359e-07,
"loss": 0.9526,
"step": 1020
},
{
"epoch": 0.9,
"learning_rate": 5.582639030417114e-07,
"loss": 0.9232,
"step": 1025
},
{
"epoch": 0.91,
"learning_rate": 5.086505596409885e-07,
"loss": 0.9597,
"step": 1030
},
{
"epoch": 0.91,
"learning_rate": 4.6128812966021894e-07,
"loss": 0.9503,
"step": 1035
},
{
"epoch": 0.92,
"learning_rate": 4.161878452987778e-07,
"loss": 0.9484,
"step": 1040
},
{
"epoch": 0.92,
"learning_rate": 3.733604022786963e-07,
"loss": 0.9475,
"step": 1045
},
{
"epoch": 0.93,
"learning_rate": 3.328159573081258e-07,
"loss": 0.9354,
"step": 1050
},
{
"epoch": 0.93,
"learning_rate": 2.9456412567263394e-07,
"loss": 0.9559,
"step": 1055
},
{
"epoch": 0.93,
"learning_rate": 2.5861397895489914e-07,
"loss": 0.9425,
"step": 1060
},
{
"epoch": 0.94,
"learning_rate": 2.2497404288334245e-07,
"loss": 0.9456,
"step": 1065
},
{
"epoch": 0.94,
"learning_rate": 1.9365229531022267e-07,
"loss": 0.9462,
"step": 1070
},
{
"epoch": 0.95,
"learning_rate": 1.646561643196465e-07,
"loss": 0.9515,
"step": 1075
},
{
"epoch": 0.95,
"learning_rate": 1.3799252646597428e-07,
"loss": 0.9428,
"step": 1080
},
{
"epoch": 0.96,
"learning_rate": 1.136677051430135e-07,
"loss": 0.9593,
"step": 1085
},
{
"epoch": 0.96,
"learning_rate": 9.168746908439718e-08,
"loss": 0.9402,
"step": 1090
},
{
"epoch": 0.97,
"learning_rate": 7.205703099551042e-08,
"loss": 0.9397,
"step": 1095
},
{
"epoch": 0.97,
"learning_rate": 5.4781046317267103e-08,
"loss": 0.9444,
"step": 1100
},
{
"epoch": 0.97,
"learning_rate": 3.986361212205969e-08,
"loss": 0.9399,
"step": 1105
},
{
"epoch": 0.98,
"learning_rate": 2.7308266142119788e-08,
"loss": 0.933,
"step": 1110
},
{
"epoch": 0.98,
"learning_rate": 1.7117985930525938e-08,
"loss": 0.9344,
"step": 1115
},
{
"epoch": 0.99,
"learning_rate": 9.29518815506847e-09,
"loss": 0.9505,
"step": 1120
},
{
"epoch": 0.99,
"learning_rate": 3.8417280251257235e-09,
"loss": 0.9476,
"step": 1125
},
{
"epoch": 1.0,
"learning_rate": 7.588988516937789e-10,
"loss": 0.9313,
"step": 1130
},
{
"epoch": 1.0,
"eval_loss": 0.9658820033073425,
"eval_runtime": 299.772,
"eval_samples_per_second": 51.586,
"eval_steps_per_second": 0.807,
"step": 1134
},
{
"epoch": 1.0,
"step": 1134,
"total_flos": 1.2682771336104247e+19,
"train_loss": 1.0355008869995306,
"train_runtime": 11406.2159,
"train_samples_per_second": 12.721,
"train_steps_per_second": 0.099
}
],
"logging_steps": 5,
"max_steps": 1134,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"total_flos": 1.2682771336104247e+19,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}