{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 2960,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0033783783783783786,
"grad_norm": 9.802501678466797,
"learning_rate": 6.756756756756758e-07,
"loss": 2.6921,
"step": 1
},
{
"epoch": 0.016891891891891893,
"grad_norm": 10.374316215515137,
"learning_rate": 3.3783783783783788e-06,
"loss": 2.7409,
"step": 5
},
{
"epoch": 0.033783783783783786,
"grad_norm": 9.445246696472168,
"learning_rate": 6.7567567567567575e-06,
"loss": 2.6534,
"step": 10
},
{
"epoch": 0.05067567567567568,
"grad_norm": 3.71943736076355,
"learning_rate": 1.0135135135135136e-05,
"loss": 2.4339,
"step": 15
},
{
"epoch": 0.06756756756756757,
"grad_norm": 1.7139111757278442,
"learning_rate": 1.3513513513513515e-05,
"loss": 2.2659,
"step": 20
},
{
"epoch": 0.08445945945945946,
"grad_norm": 0.7590915560722351,
"learning_rate": 1.6891891891891892e-05,
"loss": 2.1065,
"step": 25
},
{
"epoch": 0.10135135135135136,
"grad_norm": 0.6881681680679321,
"learning_rate": 2.0270270270270273e-05,
"loss": 1.9905,
"step": 30
},
{
"epoch": 0.11824324324324324,
"grad_norm": 0.6322100162506104,
"learning_rate": 2.364864864864865e-05,
"loss": 1.8675,
"step": 35
},
{
"epoch": 0.13513513513513514,
"grad_norm": 0.6217833757400513,
"learning_rate": 2.702702702702703e-05,
"loss": 1.7354,
"step": 40
},
{
"epoch": 0.15202702702702703,
"grad_norm": 0.4574459493160248,
"learning_rate": 3.0405405405405407e-05,
"loss": 1.6276,
"step": 45
},
{
"epoch": 0.16891891891891891,
"grad_norm": 0.3643452525138855,
"learning_rate": 3.3783783783783784e-05,
"loss": 1.5622,
"step": 50
},
{
"epoch": 0.1858108108108108,
"grad_norm": 0.6475837230682373,
"learning_rate": 3.7162162162162165e-05,
"loss": 1.5175,
"step": 55
},
{
"epoch": 0.20270270270270271,
"grad_norm": 0.29574820399284363,
"learning_rate": 4.0540540540540545e-05,
"loss": 1.4953,
"step": 60
},
{
"epoch": 0.2195945945945946,
"grad_norm": 0.31240248680114746,
"learning_rate": 4.391891891891892e-05,
"loss": 1.4493,
"step": 65
},
{
"epoch": 0.23648648648648649,
"grad_norm": 0.2868952751159668,
"learning_rate": 4.72972972972973e-05,
"loss": 1.4327,
"step": 70
},
{
"epoch": 0.2533783783783784,
"grad_norm": 0.3093927800655365,
"learning_rate": 5.067567567567568e-05,
"loss": 1.4149,
"step": 75
},
{
"epoch": 0.2702702702702703,
"grad_norm": 0.37402284145355225,
"learning_rate": 5.405405405405406e-05,
"loss": 1.3976,
"step": 80
},
{
"epoch": 0.28716216216216217,
"grad_norm": 0.34647682309150696,
"learning_rate": 5.7432432432432434e-05,
"loss": 1.3977,
"step": 85
},
{
"epoch": 0.30405405405405406,
"grad_norm": 0.30035659670829773,
"learning_rate": 6.0810810810810814e-05,
"loss": 1.3589,
"step": 90
},
{
"epoch": 0.32094594594594594,
"grad_norm": 0.33794718980789185,
"learning_rate": 6.41891891891892e-05,
"loss": 1.3818,
"step": 95
},
{
"epoch": 0.33783783783783783,
"grad_norm": 0.40184497833251953,
"learning_rate": 6.756756756756757e-05,
"loss": 1.3577,
"step": 100
},
{
"epoch": 0.3547297297297297,
"grad_norm": 0.32776907086372375,
"learning_rate": 7.094594594594594e-05,
"loss": 1.3408,
"step": 105
},
{
"epoch": 0.3716216216216216,
"grad_norm": 0.32861512899398804,
"learning_rate": 7.432432432432433e-05,
"loss": 1.3036,
"step": 110
},
{
"epoch": 0.3885135135135135,
"grad_norm": 0.3542137145996094,
"learning_rate": 7.77027027027027e-05,
"loss": 1.3261,
"step": 115
},
{
"epoch": 0.40540540540540543,
"grad_norm": 0.3485589921474457,
"learning_rate": 8.108108108108109e-05,
"loss": 1.3107,
"step": 120
},
{
"epoch": 0.4222972972972973,
"grad_norm": 0.3495419919490814,
"learning_rate": 8.445945945945946e-05,
"loss": 1.2784,
"step": 125
},
{
"epoch": 0.4391891891891892,
"grad_norm": 0.3283160626888275,
"learning_rate": 8.783783783783784e-05,
"loss": 1.2816,
"step": 130
},
{
"epoch": 0.4560810810810811,
"grad_norm": 0.331221342086792,
"learning_rate": 9.121621621621623e-05,
"loss": 1.2697,
"step": 135
},
{
"epoch": 0.47297297297297297,
"grad_norm": 0.38272470235824585,
"learning_rate": 9.45945945945946e-05,
"loss": 1.2806,
"step": 140
},
{
"epoch": 0.48986486486486486,
"grad_norm": 0.3326016962528229,
"learning_rate": 9.797297297297297e-05,
"loss": 1.2729,
"step": 145
},
{
"epoch": 0.5067567567567568,
"grad_norm": 0.31695079803466797,
"learning_rate": 0.00010135135135135136,
"loss": 1.2657,
"step": 150
},
{
"epoch": 0.5236486486486487,
"grad_norm": 0.40642571449279785,
"learning_rate": 0.00010472972972972975,
"loss": 1.2454,
"step": 155
},
{
"epoch": 0.5405405405405406,
"grad_norm": 0.3561699688434601,
"learning_rate": 0.00010810810810810812,
"loss": 1.2414,
"step": 160
},
{
"epoch": 0.5574324324324325,
"grad_norm": 0.30583736300468445,
"learning_rate": 0.0001114864864864865,
"loss": 1.2473,
"step": 165
},
{
"epoch": 0.5743243243243243,
"grad_norm": 0.3610832393169403,
"learning_rate": 0.00011486486486486487,
"loss": 1.2487,
"step": 170
},
{
"epoch": 0.5912162162162162,
"grad_norm": 0.33005717396736145,
"learning_rate": 0.00011824324324324326,
"loss": 1.2512,
"step": 175
},
{
"epoch": 0.6081081081081081,
"grad_norm": 0.3080041706562042,
"learning_rate": 0.00012162162162162163,
"loss": 1.2544,
"step": 180
},
{
"epoch": 0.625,
"grad_norm": 0.3453957736492157,
"learning_rate": 0.000125,
"loss": 1.2329,
"step": 185
},
{
"epoch": 0.6418918918918919,
"grad_norm": 0.4040939211845398,
"learning_rate": 0.0001283783783783784,
"loss": 1.2356,
"step": 190
},
{
"epoch": 0.6587837837837838,
"grad_norm": 0.39047908782958984,
"learning_rate": 0.00013175675675675675,
"loss": 1.2215,
"step": 195
},
{
"epoch": 0.6756756756756757,
"grad_norm": 0.27441543340682983,
"learning_rate": 0.00013513513513513514,
"loss": 1.2374,
"step": 200
},
{
"epoch": 0.6925675675675675,
"grad_norm": 0.26817697286605835,
"learning_rate": 0.00013851351351351352,
"loss": 1.2446,
"step": 205
},
{
"epoch": 0.7094594594594594,
"grad_norm": 0.4692605435848236,
"learning_rate": 0.00014189189189189188,
"loss": 1.2369,
"step": 210
},
{
"epoch": 0.7263513513513513,
"grad_norm": 0.47006717324256897,
"learning_rate": 0.00014527027027027027,
"loss": 1.2289,
"step": 215
},
{
"epoch": 0.7432432432432432,
"grad_norm": 0.26643019914627075,
"learning_rate": 0.00014864864864864866,
"loss": 1.2272,
"step": 220
},
{
"epoch": 0.7601351351351351,
"grad_norm": 0.27256107330322266,
"learning_rate": 0.00015202702702702702,
"loss": 1.2301,
"step": 225
},
{
"epoch": 0.777027027027027,
"grad_norm": 0.2612285912036896,
"learning_rate": 0.0001554054054054054,
"loss": 1.2303,
"step": 230
},
{
"epoch": 0.793918918918919,
"grad_norm": 0.2759920656681061,
"learning_rate": 0.0001587837837837838,
"loss": 1.2177,
"step": 235
},
{
"epoch": 0.8108108108108109,
"grad_norm": 0.29133257269859314,
"learning_rate": 0.00016216216216216218,
"loss": 1.2174,
"step": 240
},
{
"epoch": 0.8277027027027027,
"grad_norm": 0.3231314420700073,
"learning_rate": 0.00016554054054054057,
"loss": 1.2036,
"step": 245
},
{
"epoch": 0.8445945945945946,
"grad_norm": 0.27160102128982544,
"learning_rate": 0.00016891891891891893,
"loss": 1.2302,
"step": 250
},
{
"epoch": 0.8614864864864865,
"grad_norm": 0.29660171270370483,
"learning_rate": 0.00017229729729729732,
"loss": 1.2033,
"step": 255
},
{
"epoch": 0.8783783783783784,
"grad_norm": 0.2654610276222229,
"learning_rate": 0.00017567567567567568,
"loss": 1.2012,
"step": 260
},
{
"epoch": 0.8952702702702703,
"grad_norm": 0.28142857551574707,
"learning_rate": 0.00017905405405405406,
"loss": 1.2052,
"step": 265
},
{
"epoch": 0.9121621621621622,
"grad_norm": 0.24720372259616852,
"learning_rate": 0.00018243243243243245,
"loss": 1.192,
"step": 270
},
{
"epoch": 0.9290540540540541,
"grad_norm": 0.2735718786716461,
"learning_rate": 0.0001858108108108108,
"loss": 1.213,
"step": 275
},
{
"epoch": 0.9459459459459459,
"grad_norm": 0.30433931946754456,
"learning_rate": 0.0001891891891891892,
"loss": 1.2059,
"step": 280
},
{
"epoch": 0.9628378378378378,
"grad_norm": 0.3330329358577728,
"learning_rate": 0.00019256756756756758,
"loss": 1.206,
"step": 285
},
{
"epoch": 0.9797297297297297,
"grad_norm": 0.27602413296699524,
"learning_rate": 0.00019594594594594594,
"loss": 1.2043,
"step": 290
},
{
"epoch": 0.9966216216216216,
"grad_norm": 0.23838359117507935,
"learning_rate": 0.00019932432432432433,
"loss": 1.2062,
"step": 295
},
{
"epoch": 1.0,
"eval_loss": 1.6780701875686646,
"eval_runtime": 0.3945,
"eval_samples_per_second": 5.07,
"eval_steps_per_second": 2.535,
"step": 296
},
{
"epoch": 1.0135135135135136,
"grad_norm": 0.3066512644290924,
"learning_rate": 0.00019999888744757143,
"loss": 1.1826,
"step": 300
},
{
"epoch": 1.0304054054054055,
"grad_norm": 0.42127561569213867,
"learning_rate": 0.0001999943677457578,
"loss": 1.1683,
"step": 305
},
{
"epoch": 1.0472972972972974,
"grad_norm": 0.28215768933296204,
"learning_rate": 0.000199986371517049,
"loss": 1.1752,
"step": 310
},
{
"epoch": 1.0641891891891893,
"grad_norm": 0.35595354437828064,
"learning_rate": 0.0001999748990394517,
"loss": 1.1515,
"step": 315
},
{
"epoch": 1.0810810810810811,
"grad_norm": 0.23858019709587097,
"learning_rate": 0.0001999599507118322,
"loss": 1.1604,
"step": 320
},
{
"epoch": 1.097972972972973,
"grad_norm": 0.2836330831050873,
"learning_rate": 0.0001999415270539023,
"loss": 1.1714,
"step": 325
},
{
"epoch": 1.114864864864865,
"grad_norm": 0.28962427377700806,
"learning_rate": 0.00019991962870620153,
"loss": 1.1693,
"step": 330
},
{
"epoch": 1.1317567567567568,
"grad_norm": 0.2537465989589691,
"learning_rate": 0.00019989425643007476,
"loss": 1.1537,
"step": 335
},
{
"epoch": 1.1486486486486487,
"grad_norm": 0.23751677572727203,
"learning_rate": 0.00019986541110764565,
"loss": 1.1664,
"step": 340
},
{
"epoch": 1.1655405405405406,
"grad_norm": 0.3039610981941223,
"learning_rate": 0.0001998330937417861,
"loss": 1.1607,
"step": 345
},
{
"epoch": 1.1824324324324325,
"grad_norm": 0.22566653788089752,
"learning_rate": 0.00019979730545608126,
"loss": 1.1532,
"step": 350
},
{
"epoch": 1.1993243243243243,
"grad_norm": 0.27842891216278076,
"learning_rate": 0.00019975804749479062,
"loss": 1.1589,
"step": 355
},
{
"epoch": 1.2162162162162162,
"grad_norm": 0.2455698400735855,
"learning_rate": 0.00019971532122280464,
"loss": 1.1608,
"step": 360
},
{
"epoch": 1.2331081081081081,
"grad_norm": 0.23679549992084503,
"learning_rate": 0.00019966912812559732,
"loss": 1.1691,
"step": 365
},
{
"epoch": 1.25,
"grad_norm": 0.22320061922073364,
"learning_rate": 0.00019961946980917456,
"loss": 1.1551,
"step": 370
},
{
"epoch": 1.2668918918918919,
"grad_norm": 0.2794288992881775,
"learning_rate": 0.00019956634800001832,
"loss": 1.1667,
"step": 375
},
{
"epoch": 1.2837837837837838,
"grad_norm": 0.2269154042005539,
"learning_rate": 0.0001995097645450266,
"loss": 1.1589,
"step": 380
},
{
"epoch": 1.3006756756756757,
"grad_norm": 0.22751463949680328,
"learning_rate": 0.00019944972141144928,
"loss": 1.1522,
"step": 385
},
{
"epoch": 1.3175675675675675,
"grad_norm": 0.2368728667497635,
"learning_rate": 0.00019938622068681953,
"loss": 1.1487,
"step": 390
},
{
"epoch": 1.3344594594594594,
"grad_norm": 0.2409171611070633,
"learning_rate": 0.00019931926457888156,
"loss": 1.1575,
"step": 395
},
{
"epoch": 1.3513513513513513,
"grad_norm": 0.24245265126228333,
"learning_rate": 0.0001992488554155135,
"loss": 1.1443,
"step": 400
},
{
"epoch": 1.3682432432432432,
"grad_norm": 0.21953873336315155,
"learning_rate": 0.0001991749956446468,
"loss": 1.1578,
"step": 405
},
{
"epoch": 1.385135135135135,
"grad_norm": 0.21402998268604279,
"learning_rate": 0.00019909768783418086,
"loss": 1.1655,
"step": 410
},
{
"epoch": 1.402027027027027,
"grad_norm": 0.22115997970104218,
"learning_rate": 0.00019901693467189386,
"loss": 1.1515,
"step": 415
},
{
"epoch": 1.4189189189189189,
"grad_norm": 0.2362441122531891,
"learning_rate": 0.00019893273896534936,
"loss": 1.1579,
"step": 420
},
{
"epoch": 1.4358108108108107,
"grad_norm": 0.2779642641544342,
"learning_rate": 0.0001988451036417986,
"loss": 1.1518,
"step": 425
},
{
"epoch": 1.4527027027027026,
"grad_norm": 0.22553110122680664,
"learning_rate": 0.00019875403174807882,
"loss": 1.1722,
"step": 430
},
{
"epoch": 1.4695945945945945,
"grad_norm": 0.22423289716243744,
"learning_rate": 0.0001986595264505072,
"loss": 1.1628,
"step": 435
},
{
"epoch": 1.4864864864864864,
"grad_norm": 0.23659999668598175,
"learning_rate": 0.00019856159103477086,
"loss": 1.1442,
"step": 440
},
{
"epoch": 1.5033783783783785,
"grad_norm": 0.23966625332832336,
"learning_rate": 0.00019846022890581267,
"loss": 1.1486,
"step": 445
},
{
"epoch": 1.5202702702702702,
"grad_norm": 0.2399033010005951,
"learning_rate": 0.0001983554435877128,
"loss": 1.144,
"step": 450
},
{
"epoch": 1.5371621621621623,
"grad_norm": 0.2575773000717163,
"learning_rate": 0.0001982472387235662,
"loss": 1.1693,
"step": 455
},
{
"epoch": 1.554054054054054,
"grad_norm": 0.23619942367076874,
"learning_rate": 0.00019813561807535598,
"loss": 1.1494,
"step": 460
},
{
"epoch": 1.570945945945946,
"grad_norm": 0.24643085896968842,
"learning_rate": 0.0001980205855238225,
"loss": 1.1543,
"step": 465
},
{
"epoch": 1.5878378378378377,
"grad_norm": 0.2060076743364334,
"learning_rate": 0.00019790214506832868,
"loss": 1.1597,
"step": 470
},
{
"epoch": 1.6047297297297298,
"grad_norm": 0.20906926691532135,
"learning_rate": 0.00019778030082672068,
"loss": 1.1393,
"step": 475
},
{
"epoch": 1.6216216216216215,
"grad_norm": 0.21041174232959747,
"learning_rate": 0.00019765505703518496,
"loss": 1.1519,
"step": 480
},
{
"epoch": 1.6385135135135136,
"grad_norm": 0.21494755148887634,
"learning_rate": 0.00019752641804810084,
"loss": 1.1497,
"step": 485
},
{
"epoch": 1.6554054054054053,
"grad_norm": 0.21202711760997772,
"learning_rate": 0.0001973943883378892,
"loss": 1.1579,
"step": 490
},
{
"epoch": 1.6722972972972974,
"grad_norm": 0.20677632093429565,
"learning_rate": 0.00019725897249485704,
"loss": 1.1473,
"step": 495
},
{
"epoch": 1.689189189189189,
"grad_norm": 0.2177901715040207,
"learning_rate": 0.00019712017522703764,
"loss": 1.154,
"step": 500
},
{
"epoch": 1.7060810810810811,
"grad_norm": 0.212003692984581,
"learning_rate": 0.0001969780013600272,
"loss": 1.1608,
"step": 505
},
{
"epoch": 1.722972972972973,
"grad_norm": 0.21401935815811157,
"learning_rate": 0.00019683245583681675,
"loss": 1.1619,
"step": 510
},
{
"epoch": 1.739864864864865,
"grad_norm": 0.22224700450897217,
"learning_rate": 0.00019668354371762066,
"loss": 1.1565,
"step": 515
},
{
"epoch": 1.7567567567567568,
"grad_norm": 0.2198743373155594,
"learning_rate": 0.00019653127017970034,
"loss": 1.148,
"step": 520
},
{
"epoch": 1.7736486486486487,
"grad_norm": 0.21117670834064484,
"learning_rate": 0.0001963756405171845,
"loss": 1.1567,
"step": 525
},
{
"epoch": 1.7905405405405406,
"grad_norm": 0.23106643557548523,
"learning_rate": 0.00019621666014088494,
"loss": 1.1417,
"step": 530
},
{
"epoch": 1.8074324324324325,
"grad_norm": 0.20598255097866058,
"learning_rate": 0.00019605433457810855,
"loss": 1.1491,
"step": 535
},
{
"epoch": 1.8243243243243243,
"grad_norm": 0.2185199111700058,
"learning_rate": 0.00019588866947246498,
"loss": 1.1474,
"step": 540
},
{
"epoch": 1.8412162162162162,
"grad_norm": 0.21996720135211945,
"learning_rate": 0.00019571967058367064,
"loss": 1.1574,
"step": 545
},
{
"epoch": 1.8581081081081081,
"grad_norm": 0.205213725566864,
"learning_rate": 0.00019554734378734824,
"loss": 1.1596,
"step": 550
},
{
"epoch": 1.875,
"grad_norm": 0.19933567941188812,
"learning_rate": 0.0001953716950748227,
"loss": 1.1466,
"step": 555
},
{
"epoch": 1.8918918918918919,
"grad_norm": 0.19704587757587433,
"learning_rate": 0.00019519273055291266,
"loss": 1.1399,
"step": 560
},
{
"epoch": 1.9087837837837838,
"grad_norm": 0.20990757644176483,
"learning_rate": 0.00019501045644371832,
"loss": 1.1363,
"step": 565
},
{
"epoch": 1.9256756756756757,
"grad_norm": 0.2083408534526825,
"learning_rate": 0.000194824879084405,
"loss": 1.1446,
"step": 570
},
{
"epoch": 1.9425675675675675,
"grad_norm": 0.2556820213794708,
"learning_rate": 0.00019463600492698296,
"loss": 1.1372,
"step": 575
},
{
"epoch": 1.9594594594594594,
"grad_norm": 0.20939995348453522,
"learning_rate": 0.00019444384053808288,
"loss": 1.1421,
"step": 580
},
{
"epoch": 1.9763513513513513,
"grad_norm": 0.2339630275964737,
"learning_rate": 0.00019424839259872778,
"loss": 1.1421,
"step": 585
},
{
"epoch": 1.9932432432432432,
"grad_norm": 0.3135931193828583,
"learning_rate": 0.00019404966790410047,
"loss": 1.1339,
"step": 590
},
{
"epoch": 2.0,
"eval_loss": 1.6897428035736084,
"eval_runtime": 0.3945,
"eval_samples_per_second": 5.07,
"eval_steps_per_second": 2.535,
"step": 592
},
{
"epoch": 2.010135135135135,
"grad_norm": 0.2158200591802597,
"learning_rate": 0.0001938476733633076,
"loss": 1.0977,
"step": 595
},
{
"epoch": 2.027027027027027,
"grad_norm": 0.22781264781951904,
"learning_rate": 0.00019364241599913924,
"loss": 1.0711,
"step": 600
},
{
"epoch": 2.043918918918919,
"grad_norm": 0.24521173536777496,
"learning_rate": 0.0001934339029478248,
"loss": 1.0767,
"step": 605
},
{
"epoch": 2.060810810810811,
"grad_norm": 0.21851304173469543,
"learning_rate": 0.00019322214145878487,
"loss": 1.0549,
"step": 610
},
{
"epoch": 2.0777027027027026,
"grad_norm": 0.21393460035324097,
"learning_rate": 0.00019300713889437926,
"loss": 1.068,
"step": 615
},
{
"epoch": 2.0945945945945947,
"grad_norm": 0.23508517444133759,
"learning_rate": 0.00019278890272965096,
"loss": 1.0776,
"step": 620
},
{
"epoch": 2.1114864864864864,
"grad_norm": 0.2709183990955353,
"learning_rate": 0.00019256744055206622,
"loss": 1.0867,
"step": 625
},
{
"epoch": 2.1283783783783785,
"grad_norm": 0.22891944646835327,
"learning_rate": 0.000192342760061251,
"loss": 1.0719,
"step": 630
},
{
"epoch": 2.14527027027027,
"grad_norm": 0.24709245562553406,
"learning_rate": 0.0001921148690687228,
"loss": 1.0687,
"step": 635
},
{
"epoch": 2.1621621621621623,
"grad_norm": 0.2254343330860138,
"learning_rate": 0.00019188377549761963,
"loss": 1.0687,
"step": 640
},
{
"epoch": 2.179054054054054,
"grad_norm": 0.22168201208114624,
"learning_rate": 0.00019164948738242409,
"loss": 1.0765,
"step": 645
},
{
"epoch": 2.195945945945946,
"grad_norm": 0.23680733144283295,
"learning_rate": 0.00019141201286868435,
"loss": 1.0741,
"step": 650
},
{
"epoch": 2.2128378378378377,
"grad_norm": 0.23159544169902802,
"learning_rate": 0.00019117136021273075,
"loss": 1.0795,
"step": 655
},
{
"epoch": 2.22972972972973,
"grad_norm": 0.23217150568962097,
"learning_rate": 0.00019092753778138886,
"loss": 1.0947,
"step": 660
},
{
"epoch": 2.2466216216216215,
"grad_norm": 0.22594888508319855,
"learning_rate": 0.0001906805540516885,
"loss": 1.059,
"step": 665
},
{
"epoch": 2.2635135135135136,
"grad_norm": 0.23356075584888458,
"learning_rate": 0.00019043041761056907,
"loss": 1.084,
"step": 670
},
{
"epoch": 2.2804054054054053,
"grad_norm": 0.21952542662620544,
"learning_rate": 0.0001901771371545811,
"loss": 1.0807,
"step": 675
},
{
"epoch": 2.2972972972972974,
"grad_norm": 0.21846647560596466,
"learning_rate": 0.00018992072148958368,
"loss": 1.0878,
"step": 680
},
{
"epoch": 2.314189189189189,
"grad_norm": 0.23093639314174652,
"learning_rate": 0.00018966117953043852,
"loss": 1.074,
"step": 685
},
{
"epoch": 2.331081081081081,
"grad_norm": 0.224954292178154,
"learning_rate": 0.00018939852030069981,
"loss": 1.0784,
"step": 690
},
{
"epoch": 2.347972972972973,
"grad_norm": 0.2606515884399414,
"learning_rate": 0.00018913275293230069,
"loss": 1.0757,
"step": 695
},
{
"epoch": 2.364864864864865,
"grad_norm": 0.2542010247707367,
"learning_rate": 0.0001888638866652356,
"loss": 1.0705,
"step": 700
},
{
"epoch": 2.3817567567567566,
"grad_norm": 0.2348444014787674,
"learning_rate": 0.00018859193084723913,
"loss": 1.0857,
"step": 705
},
{
"epoch": 2.3986486486486487,
"grad_norm": 0.2732667922973633,
"learning_rate": 0.00018831689493346095,
"loss": 1.073,
"step": 710
},
{
"epoch": 2.4155405405405403,
"grad_norm": 0.24476487934589386,
"learning_rate": 0.00018803878848613716,
"loss": 1.0862,
"step": 715
},
{
"epoch": 2.4324324324324325,
"grad_norm": 0.25073671340942383,
"learning_rate": 0.00018775762117425777,
"loss": 1.0699,
"step": 720
},
{
"epoch": 2.4493243243243246,
"grad_norm": 0.23084624111652374,
"learning_rate": 0.0001874734027732306,
"loss": 1.0827,
"step": 725
},
{
"epoch": 2.4662162162162162,
"grad_norm": 0.2258080244064331,
"learning_rate": 0.00018718614316454133,
"loss": 1.088,
"step": 730
},
{
"epoch": 2.483108108108108,
"grad_norm": 0.23056402802467346,
"learning_rate": 0.00018689585233541003,
"loss": 1.0698,
"step": 735
},
{
"epoch": 2.5,
"grad_norm": 0.22269397974014282,
"learning_rate": 0.00018660254037844388,
"loss": 1.0666,
"step": 740
},
{
"epoch": 2.516891891891892,
"grad_norm": 0.21295320987701416,
"learning_rate": 0.0001863062174912863,
"loss": 1.0781,
"step": 745
},
{
"epoch": 2.5337837837837838,
"grad_norm": 0.21225321292877197,
"learning_rate": 0.00018600689397626246,
"loss": 1.0724,
"step": 750
},
{
"epoch": 2.5506756756756754,
"grad_norm": 0.22661367058753967,
"learning_rate": 0.00018570458024002093,
"loss": 1.0792,
"step": 755
},
{
"epoch": 2.5675675675675675,
"grad_norm": 0.22279423475265503,
"learning_rate": 0.0001853992867931721,
"loss": 1.082,
"step": 760
},
{
"epoch": 2.5844594594594597,
"grad_norm": 0.22243249416351318,
"learning_rate": 0.0001850910242499225,
"loss": 1.0662,
"step": 765
},
{
"epoch": 2.6013513513513513,
"grad_norm": 0.22147369384765625,
"learning_rate": 0.00018477980332770607,
"loss": 1.0718,
"step": 770
},
{
"epoch": 2.618243243243243,
"grad_norm": 0.2354060411453247,
"learning_rate": 0.00018446563484681127,
"loss": 1.09,
"step": 775
},
{
"epoch": 2.635135135135135,
"grad_norm": 0.24088838696479797,
"learning_rate": 0.00018414852973000503,
"loss": 1.0897,
"step": 780
},
{
"epoch": 2.652027027027027,
"grad_norm": 0.2794990539550781,
"learning_rate": 0.00018382849900215294,
"loss": 1.0804,
"step": 785
},
{
"epoch": 2.668918918918919,
"grad_norm": 0.25418001413345337,
"learning_rate": 0.00018350555378983608,
"loss": 1.0729,
"step": 790
},
{
"epoch": 2.685810810810811,
"grad_norm": 0.2769224941730499,
"learning_rate": 0.0001831797053209639,
"loss": 1.0812,
"step": 795
},
{
"epoch": 2.7027027027027026,
"grad_norm": 0.2639266550540924,
"learning_rate": 0.00018285096492438424,
"loss": 1.0841,
"step": 800
},
{
"epoch": 2.7195945945945947,
"grad_norm": 0.21467705070972443,
"learning_rate": 0.000182519344029489,
"loss": 1.0852,
"step": 805
},
{
"epoch": 2.7364864864864864,
"grad_norm": 0.22124196588993073,
"learning_rate": 0.00018218485416581726,
"loss": 1.0849,
"step": 810
},
{
"epoch": 2.7533783783783785,
"grad_norm": 0.21145068109035492,
"learning_rate": 0.00018184750696265408,
"loss": 1.0706,
"step": 815
},
{
"epoch": 2.77027027027027,
"grad_norm": 0.22575508058071136,
"learning_rate": 0.00018150731414862622,
"loss": 1.0737,
"step": 820
},
{
"epoch": 2.7871621621621623,
"grad_norm": 0.22897441685199738,
"learning_rate": 0.00018116428755129459,
"loss": 1.076,
"step": 825
},
{
"epoch": 2.804054054054054,
"grad_norm": 0.224187970161438,
"learning_rate": 0.00018081843909674276,
"loss": 1.075,
"step": 830
},
{
"epoch": 2.820945945945946,
"grad_norm": 0.22817817330360413,
"learning_rate": 0.00018046978080916252,
"loss": 1.0802,
"step": 835
},
{
"epoch": 2.8378378378378377,
"grad_norm": 0.23358392715454102,
"learning_rate": 0.00018011832481043576,
"loss": 1.073,
"step": 840
},
{
"epoch": 2.85472972972973,
"grad_norm": 0.2256878912448883,
"learning_rate": 0.00017976408331971298,
"loss": 1.0712,
"step": 845
},
{
"epoch": 2.8716216216216215,
"grad_norm": 0.2276696115732193,
"learning_rate": 0.0001794070686529886,
"loss": 1.0888,
"step": 850
},
{
"epoch": 2.8885135135135136,
"grad_norm": 0.2123207151889801,
"learning_rate": 0.00017904729322267256,
"loss": 1.0856,
"step": 855
},
{
"epoch": 2.9054054054054053,
"grad_norm": 0.2253648340702057,
"learning_rate": 0.000178684769537159,
"loss": 1.0769,
"step": 860
},
{
"epoch": 2.9222972972972974,
"grad_norm": 0.23328694701194763,
"learning_rate": 0.00017831951020039126,
"loss": 1.0805,
"step": 865
},
{
"epoch": 2.939189189189189,
"grad_norm": 0.2189178615808487,
"learning_rate": 0.0001779515279114236,
"loss": 1.083,
"step": 870
},
{
"epoch": 2.956081081081081,
"grad_norm": 0.21634751558303833,
"learning_rate": 0.0001775808354639799,
"loss": 1.0777,
"step": 875
},
{
"epoch": 2.972972972972973,
"grad_norm": 0.22920973598957062,
"learning_rate": 0.00017720744574600863,
"loss": 1.0622,
"step": 880
},
{
"epoch": 2.989864864864865,
"grad_norm": 0.23738548159599304,
"learning_rate": 0.00017683137173923495,
"loss": 1.0779,
"step": 885
},
{
"epoch": 3.0,
"eval_loss": 1.7535914182662964,
"eval_runtime": 0.3941,
"eval_samples_per_second": 5.075,
"eval_steps_per_second": 2.537,
"step": 888
},
{
"epoch": 3.0067567567567566,
"grad_norm": 0.22737225890159607,
"learning_rate": 0.00017645262651870926,
"loss": 1.0427,
"step": 890
},
{
"epoch": 3.0236486486486487,
"grad_norm": 0.2775098383426666,
"learning_rate": 0.00017607122325235267,
"loss": 0.9853,
"step": 895
},
{
"epoch": 3.0405405405405403,
"grad_norm": 0.2837352752685547,
"learning_rate": 0.0001756871752004992,
"loss": 0.9753,
"step": 900
},
{
"epoch": 3.0574324324324325,
"grad_norm": 0.25329145789146423,
"learning_rate": 0.00017530049571543464,
"loss": 0.9845,
"step": 905
},
{
"epoch": 3.074324324324324,
"grad_norm": 0.2581470310688019,
"learning_rate": 0.0001749111982409325,
"loss": 0.974,
"step": 910
},
{
"epoch": 3.0912162162162162,
"grad_norm": 0.2744286358356476,
"learning_rate": 0.00017451929631178648,
"loss": 0.9777,
"step": 915
},
{
"epoch": 3.108108108108108,
"grad_norm": 0.2783578038215637,
"learning_rate": 0.00017412480355334005,
"loss": 0.9883,
"step": 920
},
{
"epoch": 3.125,
"grad_norm": 0.27584517002105713,
"learning_rate": 0.0001737277336810124,
"loss": 0.98,
"step": 925
},
{
"epoch": 3.141891891891892,
"grad_norm": 0.26467305421829224,
"learning_rate": 0.00017332810049982208,
"loss": 0.9956,
"step": 930
},
{
"epoch": 3.1587837837837838,
"grad_norm": 0.25240039825439453,
"learning_rate": 0.00017292591790390665,
"loss": 0.9933,
"step": 935
},
{
"epoch": 3.175675675675676,
"grad_norm": 0.24769380688667297,
"learning_rate": 0.00017252119987603973,
"loss": 0.9742,
"step": 940
},
{
"epoch": 3.1925675675675675,
"grad_norm": 0.27298596501350403,
"learning_rate": 0.00017211396048714498,
"loss": 0.9866,
"step": 945
},
{
"epoch": 3.2094594594594597,
"grad_norm": 0.2657850682735443,
"learning_rate": 0.00017170421389580667,
"loss": 0.99,
"step": 950
},
{
"epoch": 3.2263513513513513,
"grad_norm": 0.23783531785011292,
"learning_rate": 0.00017129197434777763,
"loss": 0.9891,
"step": 955
},
{
"epoch": 3.2432432432432434,
"grad_norm": 0.24934813380241394,
"learning_rate": 0.00017087725617548385,
"loss": 0.9986,
"step": 960
},
{
"epoch": 3.260135135135135,
"grad_norm": 0.265461802482605,
"learning_rate": 0.0001704600737975262,
"loss": 0.977,
"step": 965
},
{
"epoch": 3.277027027027027,
"grad_norm": 0.26984909176826477,
"learning_rate": 0.00017004044171817925,
"loss": 1.0041,
"step": 970
},
{
"epoch": 3.293918918918919,
"grad_norm": 0.26064538955688477,
"learning_rate": 0.00016961837452688676,
"loss": 1.0007,
"step": 975
},
{
"epoch": 3.310810810810811,
"grad_norm": 0.253579705953598,
"learning_rate": 0.00016919388689775464,
"loss": 1.0069,
"step": 980
},
{
"epoch": 3.3277027027027026,
"grad_norm": 0.26410114765167236,
"learning_rate": 0.00016876699358904068,
"loss": 1.004,
"step": 985
},
{
"epoch": 3.3445945945945947,
"grad_norm": 0.2758503556251526,
"learning_rate": 0.00016833770944264153,
"loss": 1.0048,
"step": 990
},
{
"epoch": 3.3614864864864864,
"grad_norm": 0.2595711648464203,
"learning_rate": 0.00016790604938357663,
"loss": 0.9929,
"step": 995
},
{
"epoch": 3.3783783783783785,
"grad_norm": 0.26039746403694153,
"learning_rate": 0.00016747202841946928,
"loss": 1.0006,
"step": 1000
},
{
"epoch": 3.39527027027027,
"grad_norm": 0.25514382123947144,
"learning_rate": 0.0001670356616400249,
"loss": 1.012,
"step": 1005
},
{
"epoch": 3.4121621621621623,
"grad_norm": 0.26591041684150696,
"learning_rate": 0.00016659696421650645,
"loss": 1.0039,
"step": 1010
},
{
"epoch": 3.429054054054054,
"grad_norm": 0.26443612575531006,
"learning_rate": 0.00016615595140120686,
"loss": 0.9982,
"step": 1015
},
{
"epoch": 3.445945945945946,
"grad_norm": 0.2647687792778015,
"learning_rate": 0.00016571263852691888,
"loss": 1.0028,
"step": 1020
},
{
"epoch": 3.4628378378378377,
"grad_norm": 0.2620026767253876,
"learning_rate": 0.0001652670410064019,
"loss": 0.9951,
"step": 1025
},
{
"epoch": 3.47972972972973,
"grad_norm": 0.2619130313396454,
"learning_rate": 0.00016481917433184607,
"loss": 0.9882,
"step": 1030
},
{
"epoch": 3.4966216216216215,
"grad_norm": 0.24988499283790588,
"learning_rate": 0.0001643690540743339,
"loss": 0.9958,
"step": 1035
},
{
"epoch": 3.5135135135135136,
"grad_norm": 0.2864786982536316,
"learning_rate": 0.0001639166958832985,
"loss": 1.0017,
"step": 1040
},
{
"epoch": 3.5304054054054053,
"grad_norm": 0.2665320038795471,
"learning_rate": 0.00016346211548597995,
"loss": 0.9994,
"step": 1045
},
{
"epoch": 3.5472972972972974,
"grad_norm": 0.2629227936267853,
"learning_rate": 0.00016300532868687806,
"loss": 1.007,
"step": 1050
},
{
"epoch": 3.564189189189189,
"grad_norm": 0.25602978467941284,
"learning_rate": 0.00016254635136720328,
"loss": 1.0057,
"step": 1055
},
{
"epoch": 3.581081081081081,
"grad_norm": 0.2551196813583374,
"learning_rate": 0.0001620851994843244,
"loss": 0.9972,
"step": 1060
},
{
"epoch": 3.597972972972973,
"grad_norm": 0.27250906825065613,
"learning_rate": 0.00016162188907121354,
"loss": 1.0075,
"step": 1065
},
{
"epoch": 3.614864864864865,
"grad_norm": 0.2675882577896118,
"learning_rate": 0.00016115643623588915,
"loss": 1.0103,
"step": 1070
},
{
"epoch": 3.631756756756757,
"grad_norm": 0.2731866240501404,
"learning_rate": 0.00016068885716085567,
"loss": 1.0016,
"step": 1075
},
{
"epoch": 3.6486486486486487,
"grad_norm": 0.249202698469162,
"learning_rate": 0.00016021916810254097,
"loss": 1.0086,
"step": 1080
},
{
"epoch": 3.6655405405405403,
"grad_norm": 0.2600172460079193,
"learning_rate": 0.00015974738539073125,
"loss": 1.0032,
"step": 1085
},
{
"epoch": 3.6824324324324325,
"grad_norm": 0.2564319372177124,
"learning_rate": 0.00015927352542800317,
"loss": 1.0087,
"step": 1090
},
{
"epoch": 3.6993243243243246,
"grad_norm": 0.25873422622680664,
"learning_rate": 0.00015879760468915372,
"loss": 1.0006,
"step": 1095
},
{
"epoch": 3.7162162162162162,
"grad_norm": 0.2660174071788788,
"learning_rate": 0.00015831963972062733,
"loss": 0.988,
"step": 1100
},
{
"epoch": 3.733108108108108,
"grad_norm": 0.26095345616340637,
"learning_rate": 0.0001578396471399406,
"loss": 1.0109,
"step": 1105
},
{
"epoch": 3.75,
"grad_norm": 0.2525663673877716,
"learning_rate": 0.0001573576436351046,
"loss": 1.001,
"step": 1110
},
{
"epoch": 3.766891891891892,
"grad_norm": 0.2541150152683258,
"learning_rate": 0.0001568736459640447,
"loss": 0.9995,
"step": 1115
},
{
"epoch": 3.7837837837837838,
"grad_norm": 0.2548198997974396,
"learning_rate": 0.0001563876709540178,
"loss": 1.007,
"step": 1120
},
{
"epoch": 3.8006756756756754,
"grad_norm": 0.26351451873779297,
"learning_rate": 0.00015589973550102747,
"loss": 1.0056,
"step": 1125
},
{
"epoch": 3.8175675675675675,
"grad_norm": 0.2661518454551697,
"learning_rate": 0.00015540985656923645,
"loss": 1.0159,
"step": 1130
},
{
"epoch": 3.8344594594594597,
"grad_norm": 0.2599773406982422,
"learning_rate": 0.00015491805119037684,
"loss": 1.0102,
"step": 1135
},
{
"epoch": 3.8513513513513513,
"grad_norm": 0.2605207562446594,
"learning_rate": 0.0001544243364631579,
"loss": 1.009,
"step": 1140
},
{
"epoch": 3.868243243243243,
"grad_norm": 0.2640506625175476,
"learning_rate": 0.00015392872955267175,
"loss": 1.0125,
"step": 1145
},
{
"epoch": 3.885135135135135,
"grad_norm": 0.29407069087028503,
"learning_rate": 0.00015343124768979637,
"loss": 1.0107,
"step": 1150
},
{
"epoch": 3.902027027027027,
"grad_norm": 0.2638514041900635,
"learning_rate": 0.00015293190817059667,
"loss": 1.0046,
"step": 1155
},
{
"epoch": 3.918918918918919,
"grad_norm": 0.26569753885269165,
"learning_rate": 0.00015243072835572318,
"loss": 0.9985,
"step": 1160
},
{
"epoch": 3.935810810810811,
"grad_norm": 0.24786274135112762,
"learning_rate": 0.0001519277256698083,
"loss": 1.0086,
"step": 1165
},
{
"epoch": 3.9527027027027026,
"grad_norm": 0.27254632115364075,
"learning_rate": 0.0001514229176008607,
"loss": 1.0048,
"step": 1170
},
{
"epoch": 3.9695945945945947,
"grad_norm": 0.26518264412879944,
"learning_rate": 0.0001509163216996572,
"loss": 1.0014,
"step": 1175
},
{
"epoch": 3.9864864864864864,
"grad_norm": 0.24938583374023438,
"learning_rate": 0.00015040795557913245,
"loss": 1.0043,
"step": 1180
},
{
"epoch": 4.0,
"eval_loss": 1.8225109577178955,
"eval_runtime": 0.3942,
"eval_samples_per_second": 5.073,
"eval_steps_per_second": 2.537,
"step": 1184
},
{
"epoch": 4.003378378378378,
"grad_norm": 0.41594985127449036,
"learning_rate": 0.00014989783691376696,
"loss": 0.9886,
"step": 1185
},
{
"epoch": 4.02027027027027,
"grad_norm": 0.332119345664978,
"learning_rate": 0.00014938598343897214,
"loss": 0.8971,
"step": 1190
},
{
"epoch": 4.037162162162162,
"grad_norm": 0.2723919749259949,
"learning_rate": 0.000148872412950474,
"loss": 0.9054,
"step": 1195
},
{
"epoch": 4.054054054054054,
"grad_norm": 0.3006138801574707,
"learning_rate": 0.00014835714330369446,
"loss": 0.8955,
"step": 1200
},
{
"epoch": 4.070945945945946,
"grad_norm": 0.3039803206920624,
"learning_rate": 0.00014784019241313026,
"loss": 0.8937,
"step": 1205
},
{
"epoch": 4.087837837837838,
"grad_norm": 0.2896163761615753,
"learning_rate": 0.00014732157825173044,
"loss": 0.8998,
"step": 1210
},
{
"epoch": 4.10472972972973,
"grad_norm": 0.2962886095046997,
"learning_rate": 0.00014680131885027141,
"loss": 0.9087,
"step": 1215
},
{
"epoch": 4.121621621621622,
"grad_norm": 0.2953561246395111,
"learning_rate": 0.0001462794322967299,
"loss": 0.9078,
"step": 1220
},
{
"epoch": 4.138513513513513,
"grad_norm": 0.2991558015346527,
"learning_rate": 0.00014575593673565426,
"loss": 0.9004,
"step": 1225
},
{
"epoch": 4.155405405405405,
"grad_norm": 0.32434654235839844,
"learning_rate": 0.00014523085036753354,
"loss": 0.8972,
"step": 1230
},
{
"epoch": 4.172297297297297,
"grad_norm": 0.29733654856681824,
"learning_rate": 0.00014470419144816483,
"loss": 0.905,
"step": 1235
},
{
"epoch": 4.1891891891891895,
"grad_norm": 0.2878667116165161,
"learning_rate": 0.00014417597828801832,
"loss": 0.9037,
"step": 1240
},
{
"epoch": 4.206081081081081,
"grad_norm": 0.3089180886745453,
"learning_rate": 0.00014364622925160098,
"loss": 0.9004,
"step": 1245
},
{
"epoch": 4.222972972972973,
"grad_norm": 0.29691433906555176,
"learning_rate": 0.00014311496275681783,
"loss": 0.9105,
"step": 1250
},
{
"epoch": 4.239864864864865,
"grad_norm": 0.31907522678375244,
"learning_rate": 0.0001425821972743318,
"loss": 0.9051,
"step": 1255
},
{
"epoch": 4.256756756756757,
"grad_norm": 0.3177861273288727,
"learning_rate": 0.00014204795132692144,
"loss": 0.9059,
"step": 1260
},
{
"epoch": 4.273648648648648,
"grad_norm": 0.3413095474243164,
"learning_rate": 0.00014151224348883692,
"loss": 0.9068,
"step": 1265
},
{
"epoch": 4.29054054054054,
"grad_norm": 0.31278854608535767,
"learning_rate": 0.00014097509238515432,
"loss": 0.9178,
"step": 1270
},
{
"epoch": 4.3074324324324325,
"grad_norm": 0.3215930461883545,
"learning_rate": 0.00014043651669112808,
"loss": 0.9048,
"step": 1275
},
{
"epoch": 4.324324324324325,
"grad_norm": 0.32147011160850525,
"learning_rate": 0.00013989653513154165,
"loss": 0.9182,
"step": 1280
},
{
"epoch": 4.341216216216216,
"grad_norm": 0.30455154180526733,
"learning_rate": 0.0001393551664800566,
"loss": 0.9159,
"step": 1285
},
{
"epoch": 4.358108108108108,
"grad_norm": 0.310214638710022,
"learning_rate": 0.00013881242955855974,
"loss": 0.9157,
"step": 1290
},
{
"epoch": 4.375,
"grad_norm": 0.3040444254875183,
"learning_rate": 0.000138268343236509,
"loss": 0.9136,
"step": 1295
},
{
"epoch": 4.391891891891892,
"grad_norm": 0.32138949632644653,
"learning_rate": 0.000137722926430277,
"loss": 0.9198,
"step": 1300
},
{
"epoch": 4.408783783783784,
"grad_norm": 0.3029273748397827,
"learning_rate": 0.00013717619810249378,
"loss": 0.9207,
"step": 1305
},
{
"epoch": 4.425675675675675,
"grad_norm": 0.3084327280521393,
"learning_rate": 0.00013662817726138728,
"loss": 0.9128,
"step": 1310
},
{
"epoch": 4.4425675675675675,
"grad_norm": 0.2980863153934479,
"learning_rate": 0.00013607888296012259,
"loss": 0.919,
"step": 1315
},
{
"epoch": 4.45945945945946,
"grad_norm": 0.3012111186981201,
"learning_rate": 0.00013552833429613938,
"loss": 0.913,
"step": 1320
},
{
"epoch": 4.476351351351352,
"grad_norm": 0.3067188262939453,
"learning_rate": 0.0001349765504104881,
"loss": 0.9098,
"step": 1325
},
{
"epoch": 4.493243243243243,
"grad_norm": 0.30859634280204773,
"learning_rate": 0.0001344235504871645,
"loss": 0.9103,
"step": 1330
},
{
"epoch": 4.510135135135135,
"grad_norm": 0.309527724981308,
"learning_rate": 0.00013386935375244246,
"loss": 0.9118,
"step": 1335
},
{
"epoch": 4.527027027027027,
"grad_norm": 0.29956597089767456,
"learning_rate": 0.00013331397947420576,
"loss": 0.9248,
"step": 1340
},
{
"epoch": 4.543918918918919,
"grad_norm": 0.30333107709884644,
"learning_rate": 0.00013275744696127805,
"loss": 0.9235,
"step": 1345
},
{
"epoch": 4.5608108108108105,
"grad_norm": 0.3010920584201813,
"learning_rate": 0.00013219977556275163,
"loss": 0.9204,
"step": 1350
},
{
"epoch": 4.577702702702703,
"grad_norm": 0.30947473645210266,
"learning_rate": 0.00013164098466731468,
"loss": 0.9244,
"step": 1355
},
{
"epoch": 4.594594594594595,
"grad_norm": 0.30661630630493164,
"learning_rate": 0.00013108109370257712,
"loss": 0.9177,
"step": 1360
},
{
"epoch": 4.611486486486487,
"grad_norm": 0.2866823971271515,
"learning_rate": 0.00013052012213439536,
"loss": 0.9107,
"step": 1365
},
{
"epoch": 4.628378378378378,
"grad_norm": 0.3211285471916199,
"learning_rate": 0.0001299580894661953,
"loss": 0.9242,
"step": 1370
},
{
"epoch": 4.64527027027027,
"grad_norm": 0.3097619414329529,
"learning_rate": 0.00012939501523829444,
"loss": 0.91,
"step": 1375
},
{
"epoch": 4.662162162162162,
"grad_norm": 0.30498236417770386,
"learning_rate": 0.0001288309190272222,
"loss": 0.9176,
"step": 1380
},
{
"epoch": 4.679054054054054,
"grad_norm": 0.31782612204551697,
"learning_rate": 0.00012826582044503978,
"loss": 0.91,
"step": 1385
},
{
"epoch": 4.695945945945946,
"grad_norm": 0.32527872920036316,
"learning_rate": 0.00012769973913865794,
"loss": 0.9119,
"step": 1390
},
{
"epoch": 4.712837837837838,
"grad_norm": 0.2965739369392395,
"learning_rate": 0.000127132694789154,
"loss": 0.9333,
"step": 1395
},
{
"epoch": 4.72972972972973,
"grad_norm": 0.31443119049072266,
"learning_rate": 0.00012656470711108764,
"loss": 0.9184,
"step": 1400
},
{
"epoch": 4.746621621621622,
"grad_norm": 0.30386343598365784,
"learning_rate": 0.00012599579585181552,
"loss": 0.912,
"step": 1405
},
{
"epoch": 4.763513513513513,
"grad_norm": 0.2971736788749695,
"learning_rate": 0.00012542598079080456,
"loss": 0.9115,
"step": 1410
},
{
"epoch": 4.780405405405405,
"grad_norm": 0.29560431838035583,
"learning_rate": 0.00012485528173894448,
"loss": 0.9176,
"step": 1415
},
{
"epoch": 4.797297297297297,
"grad_norm": 0.30718737840652466,
"learning_rate": 0.0001242837185378587,
"loss": 0.9184,
"step": 1420
},
{
"epoch": 4.8141891891891895,
"grad_norm": 0.29568740725517273,
"learning_rate": 0.00012371131105921504,
"loss": 0.9214,
"step": 1425
},
{
"epoch": 4.831081081081081,
"grad_norm": 0.32252946496009827,
"learning_rate": 0.00012313807920403419,
"loss": 0.9252,
"step": 1430
},
{
"epoch": 4.847972972972973,
"grad_norm": 0.31315141916275024,
"learning_rate": 0.00012256404290199825,
"loss": 0.9308,
"step": 1435
},
{
"epoch": 4.864864864864865,
"grad_norm": 0.3065871000289917,
"learning_rate": 0.00012198922211075778,
"loss": 0.9186,
"step": 1440
},
{
"epoch": 4.881756756756757,
"grad_norm": 0.31804540753364563,
"learning_rate": 0.00012141363681523776,
"loss": 0.9275,
"step": 1445
},
{
"epoch": 4.898648648648649,
"grad_norm": 0.313486784696579,
"learning_rate": 0.00012083730702694291,
"loss": 0.9315,
"step": 1450
},
{
"epoch": 4.91554054054054,
"grad_norm": 0.31312400102615356,
"learning_rate": 0.00012026025278326187,
"loss": 0.934,
"step": 1455
},
{
"epoch": 4.9324324324324325,
"grad_norm": 0.321845680475235,
"learning_rate": 0.00011968249414677055,
"loss": 0.9266,
"step": 1460
},
{
"epoch": 4.949324324324325,
"grad_norm": 0.29238423705101013,
"learning_rate": 0.00011910405120453476,
"loss": 0.9203,
"step": 1465
},
{
"epoch": 4.966216216216216,
"grad_norm": 0.30449482798576355,
"learning_rate": 0.00011852494406741165,
"loss": 0.9254,
"step": 1470
},
{
"epoch": 4.983108108108108,
"grad_norm": 0.3126208186149597,
"learning_rate": 0.00011794519286935055,
"loss": 0.9181,
"step": 1475
},
{
"epoch": 5.0,
"grad_norm": 0.29170361161231995,
"learning_rate": 0.00011736481776669306,
"loss": 0.9288,
"step": 1480
},
{
"epoch": 5.0,
"eval_loss": 2.0044448375701904,
"eval_runtime": 0.3932,
"eval_samples_per_second": 5.087,
"eval_steps_per_second": 2.543,
"step": 1480
},
{
"epoch": 5.016891891891892,
"grad_norm": 0.46076056361198425,
"learning_rate": 0.0001167838389374722,
"loss": 0.8221,
"step": 1485
},
{
"epoch": 5.033783783783784,
"grad_norm": 0.32739222049713135,
"learning_rate": 0.00011620227658071087,
"loss": 0.8178,
"step": 1490
},
{
"epoch": 5.050675675675675,
"grad_norm": 0.38803204894065857,
"learning_rate": 0.00011562015091571963,
"loss": 0.8143,
"step": 1495
},
{
"epoch": 5.0675675675675675,
"grad_norm": 0.32274121046066284,
"learning_rate": 0.00011503748218139369,
"loss": 0.821,
"step": 1500
},
{
"epoch": 5.08445945945946,
"grad_norm": 0.3647359013557434,
"learning_rate": 0.00011445429063550926,
"loss": 0.8265,
"step": 1505
},
{
"epoch": 5.101351351351352,
"grad_norm": 0.36681613326072693,
"learning_rate": 0.00011387059655401932,
"loss": 0.8248,
"step": 1510
},
{
"epoch": 5.118243243243243,
"grad_norm": 0.35085347294807434,
"learning_rate": 0.00011328642023034857,
"loss": 0.823,
"step": 1515
},
{
"epoch": 5.135135135135135,
"grad_norm": 0.3212147653102875,
"learning_rate": 0.00011270178197468789,
"loss": 0.8265,
"step": 1520
},
{
"epoch": 5.152027027027027,
"grad_norm": 0.35389629006385803,
"learning_rate": 0.00011211670211328833,
"loss": 0.8252,
"step": 1525
},
{
"epoch": 5.168918918918919,
"grad_norm": 0.350277841091156,
"learning_rate": 0.00011153120098775434,
"loss": 0.8193,
"step": 1530
},
{
"epoch": 5.1858108108108105,
"grad_norm": 0.35216981172561646,
"learning_rate": 0.00011094529895433652,
"loss": 0.8291,
"step": 1535
},
{
"epoch": 5.202702702702703,
"grad_norm": 0.33077818155288696,
"learning_rate": 0.00011035901638322392,
"loss": 0.8145,
"step": 1540
},
{
"epoch": 5.219594594594595,
"grad_norm": 0.34553956985473633,
"learning_rate": 0.0001097723736578359,
"loss": 0.8297,
"step": 1545
},
{
"epoch": 5.236486486486487,
"grad_norm": 0.349026083946228,
"learning_rate": 0.00010918539117411333,
"loss": 0.8363,
"step": 1550
},
{
"epoch": 5.253378378378378,
"grad_norm": 0.34249648451805115,
"learning_rate": 0.00010859808933980948,
"loss": 0.8228,
"step": 1555
},
{
"epoch": 5.27027027027027,
"grad_norm": 0.3591874837875366,
"learning_rate": 0.00010801048857378071,
"loss": 0.8272,
"step": 1560
},
{
"epoch": 5.287162162162162,
"grad_norm": 0.3266925513744354,
"learning_rate": 0.00010742260930527625,
"loss": 0.8264,
"step": 1565
},
{
"epoch": 5.304054054054054,
"grad_norm": 0.3557049632072449,
"learning_rate": 0.00010683447197322817,
"loss": 0.8327,
"step": 1570
},
{
"epoch": 5.320945945945946,
"grad_norm": 0.34309855103492737,
"learning_rate": 0.00010624609702554069,
"loss": 0.8362,
"step": 1575
},
{
"epoch": 5.337837837837838,
"grad_norm": 0.33597272634506226,
"learning_rate": 0.00010565750491837925,
"loss": 0.8274,
"step": 1580
},
{
"epoch": 5.35472972972973,
"grad_norm": 0.33070334792137146,
"learning_rate": 0.0001050687161154593,
"loss": 0.8309,
"step": 1585
},
{
"epoch": 5.371621621621622,
"grad_norm": 0.34598931670188904,
"learning_rate": 0.00010447975108733492,
"loss": 0.846,
"step": 1590
},
{
"epoch": 5.388513513513513,
"grad_norm": 0.3528457283973694,
"learning_rate": 0.00010389063031068698,
"loss": 0.8199,
"step": 1595
},
{
"epoch": 5.405405405405405,
"grad_norm": 0.3506796956062317,
"learning_rate": 0.00010330137426761135,
"loss": 0.8377,
"step": 1600
},
{
"epoch": 5.422297297297297,
"grad_norm": 0.35415780544281006,
"learning_rate": 0.00010271200344490674,
"loss": 0.8357,
"step": 1605
},
{
"epoch": 5.4391891891891895,
"grad_norm": 0.33977410197257996,
"learning_rate": 0.00010212253833336237,
"loss": 0.8273,
"step": 1610
},
{
"epoch": 5.456081081081081,
"grad_norm": 0.3760969638824463,
"learning_rate": 0.00010153299942704566,
"loss": 0.8404,
"step": 1615
},
{
"epoch": 5.472972972972973,
"grad_norm": 0.3504043519496918,
"learning_rate": 0.00010094340722258969,
"loss": 0.8368,
"step": 1620
},
{
"epoch": 5.489864864864865,
"grad_norm": 0.3397385776042938,
"learning_rate": 0.00010035378221848053,
"loss": 0.8327,
"step": 1625
},
{
"epoch": 5.506756756756757,
"grad_norm": 0.33861246705055237,
"learning_rate": 9.976414491434463e-05,
"loss": 0.8419,
"step": 1630
},
{
"epoch": 5.523648648648649,
"grad_norm": 0.3566323220729828,
"learning_rate": 9.917451581023607e-05,
"loss": 0.8366,
"step": 1635
},
{
"epoch": 5.54054054054054,
"grad_norm": 0.3398774266242981,
"learning_rate": 9.858491540592382e-05,
"loss": 0.8306,
"step": 1640
},
{
"epoch": 5.5574324324324325,
"grad_norm": 0.3483969271183014,
"learning_rate": 9.799536420017906e-05,
"loss": 0.8333,
"step": 1645
},
{
"epoch": 5.574324324324325,
"grad_norm": 0.34190595149993896,
"learning_rate": 9.740588269006246e-05,
"loss": 0.838,
"step": 1650
},
{
"epoch": 5.591216216216216,
"grad_norm": 0.35382217168807983,
"learning_rate": 9.681649137021158e-05,
"loss": 0.8489,
"step": 1655
},
{
"epoch": 5.608108108108108,
"grad_norm": 0.3321906328201294,
"learning_rate": 9.622721073212832e-05,
"loss": 0.8364,
"step": 1660
},
{
"epoch": 5.625,
"grad_norm": 0.34170404076576233,
"learning_rate": 9.563806126346642e-05,
"loss": 0.841,
"step": 1665
},
{
"epoch": 5.641891891891892,
"grad_norm": 0.34292900562286377,
"learning_rate": 9.504906344731932e-05,
"loss": 0.8366,
"step": 1670
},
{
"epoch": 5.658783783783784,
"grad_norm": 0.35314562916755676,
"learning_rate": 9.446023776150787e-05,
"loss": 0.838,
"step": 1675
},
{
"epoch": 5.675675675675675,
"grad_norm": 0.3411477506160736,
"learning_rate": 9.38716046778684e-05,
"loss": 0.8441,
"step": 1680
},
{
"epoch": 5.6925675675675675,
"grad_norm": 0.3432328701019287,
"learning_rate": 9.328318466154102e-05,
"loss": 0.8459,
"step": 1685
},
{
"epoch": 5.70945945945946,
"grad_norm": 0.33872732520103455,
"learning_rate": 9.269499817025814e-05,
"loss": 0.8388,
"step": 1690
},
{
"epoch": 5.726351351351351,
"grad_norm": 0.34312689304351807,
"learning_rate": 9.210706565363305e-05,
"loss": 0.8332,
"step": 1695
},
{
"epoch": 5.743243243243243,
"grad_norm": 0.3369201123714447,
"learning_rate": 9.151940755244912e-05,
"loss": 0.831,
"step": 1700
},
{
"epoch": 5.760135135135135,
"grad_norm": 0.34367725253105164,
"learning_rate": 9.093204429794898e-05,
"loss": 0.8303,
"step": 1705
},
{
"epoch": 5.777027027027027,
"grad_norm": 0.3678775727748871,
"learning_rate": 9.034499631112437e-05,
"loss": 0.8413,
"step": 1710
},
{
"epoch": 5.793918918918919,
"grad_norm": 0.34643349051475525,
"learning_rate": 8.975828400200592e-05,
"loss": 0.845,
"step": 1715
},
{
"epoch": 5.8108108108108105,
"grad_norm": 0.35629916191101074,
"learning_rate": 8.917192776895382e-05,
"loss": 0.836,
"step": 1720
},
{
"epoch": 5.827702702702703,
"grad_norm": 0.3395968973636627,
"learning_rate": 8.858594799794835e-05,
"loss": 0.8384,
"step": 1725
},
{
"epoch": 5.844594594594595,
"grad_norm": 0.3399130403995514,
"learning_rate": 8.800036506188129e-05,
"loss": 0.841,
"step": 1730
},
{
"epoch": 5.861486486486487,
"grad_norm": 0.3563048541545868,
"learning_rate": 8.741519931984766e-05,
"loss": 0.8388,
"step": 1735
},
{
"epoch": 5.878378378378378,
"grad_norm": 0.34680601954460144,
"learning_rate": 8.683047111643763e-05,
"loss": 0.8368,
"step": 1740
},
{
"epoch": 5.89527027027027,
"grad_norm": 0.3650359511375427,
"learning_rate": 8.624620078102951e-05,
"loss": 0.8447,
"step": 1745
},
{
"epoch": 5.912162162162162,
"grad_norm": 0.34037554264068604,
"learning_rate": 8.566240862708274e-05,
"loss": 0.8355,
"step": 1750
},
{
"epoch": 5.929054054054054,
"grad_norm": 0.35734692215919495,
"learning_rate": 8.507911495143173e-05,
"loss": 0.8425,
"step": 1755
},
{
"epoch": 5.945945945945946,
"grad_norm": 0.3381343483924866,
"learning_rate": 8.449634003358022e-05,
"loss": 0.8418,
"step": 1760
},
{
"epoch": 5.962837837837838,
"grad_norm": 0.3489098846912384,
"learning_rate": 8.39141041349961e-05,
"loss": 0.847,
"step": 1765
},
{
"epoch": 5.97972972972973,
"grad_norm": 0.361604243516922,
"learning_rate": 8.33324274984071e-05,
"loss": 0.8428,
"step": 1770
},
{
"epoch": 5.996621621621622,
"grad_norm": 0.34529900550842285,
"learning_rate": 8.275133034709699e-05,
"loss": 0.8437,
"step": 1775
},
{
"epoch": 6.0,
"eval_loss": 2.170966863632202,
"eval_runtime": 0.3935,
"eval_samples_per_second": 5.083,
"eval_steps_per_second": 2.541,
"step": 1776
},
{
"epoch": 6.013513513513513,
"grad_norm": 0.3619636595249176,
"learning_rate": 8.217083288420241e-05,
"loss": 0.7823,
"step": 1780
},
{
"epoch": 6.030405405405405,
"grad_norm": 0.33571234345436096,
"learning_rate": 8.159095529201049e-05,
"loss": 0.7663,
"step": 1785
},
{
"epoch": 6.047297297297297,
"grad_norm": 0.3377952575683594,
"learning_rate": 8.101171773125716e-05,
"loss": 0.764,
"step": 1790
},
{
"epoch": 6.0641891891891895,
"grad_norm": 0.3851635754108429,
"learning_rate": 8.043314034042631e-05,
"loss": 0.7543,
"step": 1795
},
{
"epoch": 6.081081081081081,
"grad_norm": 0.3411933481693268,
"learning_rate": 7.985524323504948e-05,
"loss": 0.7569,
"step": 1800
},
{
"epoch": 6.097972972972973,
"grad_norm": 0.3682069480419159,
"learning_rate": 7.927804650700659e-05,
"loss": 0.7546,
"step": 1805
},
{
"epoch": 6.114864864864865,
"grad_norm": 0.35545244812965393,
"learning_rate": 7.870157022382735e-05,
"loss": 0.7615,
"step": 1810
},
{
"epoch": 6.131756756756757,
"grad_norm": 0.39011305570602417,
"learning_rate": 7.812583442799368e-05,
"loss": 0.7611,
"step": 1815
},
{
"epoch": 6.148648648648648,
"grad_norm": 0.33269399404525757,
"learning_rate": 7.755085913624274e-05,
"loss": 0.7599,
"step": 1820
},
{
"epoch": 6.16554054054054,
"grad_norm": 0.3615286946296692,
"learning_rate": 7.697666433887108e-05,
"loss": 0.7501,
"step": 1825
},
{
"epoch": 6.1824324324324325,
"grad_norm": 0.3396786153316498,
"learning_rate": 7.640326999903967e-05,
"loss": 0.757,
"step": 1830
},
{
"epoch": 6.199324324324325,
"grad_norm": 0.38157907128334045,
"learning_rate": 7.583069605207975e-05,
"loss": 0.7506,
"step": 1835
},
{
"epoch": 6.216216216216216,
"grad_norm": 0.3560575842857361,
"learning_rate": 7.525896240479976e-05,
"loss": 0.754,
"step": 1840
},
{
"epoch": 6.233108108108108,
"grad_norm": 0.3762560784816742,
"learning_rate": 7.468808893479327e-05,
"loss": 0.7614,
"step": 1845
},
{
"epoch": 6.25,
"grad_norm": 0.36987847089767456,
"learning_rate": 7.411809548974792e-05,
"loss": 0.7637,
"step": 1850
},
{
"epoch": 6.266891891891892,
"grad_norm": 0.406857967376709,
"learning_rate": 7.354900188675525e-05,
"loss": 0.761,
"step": 1855
},
{
"epoch": 6.283783783783784,
"grad_norm": 0.3850703835487366,
"learning_rate": 7.29808279116218e-05,
"loss": 0.7656,
"step": 1860
},
{
"epoch": 6.300675675675675,
"grad_norm": 0.34307488799095154,
"learning_rate": 7.24135933181812e-05,
"loss": 0.7501,
"step": 1865
},
{
"epoch": 6.3175675675675675,
"grad_norm": 0.3922889232635498,
"learning_rate": 7.184731782760746e-05,
"loss": 0.7584,
"step": 1870
},
{
"epoch": 6.33445945945946,
"grad_norm": 0.36379769444465637,
"learning_rate": 7.128202112772912e-05,
"loss": 0.7626,
"step": 1875
},
{
"epoch": 6.351351351351352,
"grad_norm": 0.3796177804470062,
"learning_rate": 7.071772287234497e-05,
"loss": 0.7739,
"step": 1880
},
{
"epoch": 6.368243243243243,
"grad_norm": 0.3752601146697998,
"learning_rate": 7.015444268054059e-05,
"loss": 0.7658,
"step": 1885
},
{
"epoch": 6.385135135135135,
"grad_norm": 0.3463265597820282,
"learning_rate": 6.959220013600641e-05,
"loss": 0.7584,
"step": 1890
},
{
"epoch": 6.402027027027027,
"grad_norm": 0.3532774746417999,
"learning_rate": 6.903101478635662e-05,
"loss": 0.7715,
"step": 1895
},
{
"epoch": 6.418918918918919,
"grad_norm": 0.3608658015727997,
"learning_rate": 6.847090614244977e-05,
"loss": 0.7682,
"step": 1900
},
{
"epoch": 6.4358108108108105,
"grad_norm": 0.39848268032073975,
"learning_rate": 6.791189367771025e-05,
"loss": 0.7658,
"step": 1905
},
{
"epoch": 6.452702702702703,
"grad_norm": 0.3448575437068939,
"learning_rate": 6.735399682745145e-05,
"loss": 0.7736,
"step": 1910
},
{
"epoch": 6.469594594594595,
"grad_norm": 0.3646429181098938,
"learning_rate": 6.679723498819986e-05,
"loss": 0.7657,
"step": 1915
},
{
"epoch": 6.486486486486487,
"grad_norm": 0.3576849699020386,
"learning_rate": 6.624162751702076e-05,
"loss": 0.7741,
"step": 1920
},
{
"epoch": 6.503378378378378,
"grad_norm": 0.3550150990486145,
"learning_rate": 6.568719373084538e-05,
"loss": 0.7636,
"step": 1925
},
{
"epoch": 6.52027027027027,
"grad_norm": 0.3779493570327759,
"learning_rate": 6.513395290579901e-05,
"loss": 0.7641,
"step": 1930
},
{
"epoch": 6.537162162162162,
"grad_norm": 0.36017805337905884,
"learning_rate": 6.458192427653112e-05,
"loss": 0.7676,
"step": 1935
},
{
"epoch": 6.554054054054054,
"grad_norm": 0.38434022665023804,
"learning_rate": 6.403112703554643e-05,
"loss": 0.7701,
"step": 1940
},
{
"epoch": 6.570945945945946,
"grad_norm": 0.358761191368103,
"learning_rate": 6.348158033253773e-05,
"loss": 0.7539,
"step": 1945
},
{
"epoch": 6.587837837837838,
"grad_norm": 0.37006473541259766,
"learning_rate": 6.293330327372005e-05,
"loss": 0.7767,
"step": 1950
},
{
"epoch": 6.60472972972973,
"grad_norm": 0.3721785247325897,
"learning_rate": 6.238631492116644e-05,
"loss": 0.7715,
"step": 1955
},
{
"epoch": 6.621621621621622,
"grad_norm": 0.3626702129840851,
"learning_rate": 6.184063429214515e-05,
"loss": 0.766,
"step": 1960
},
{
"epoch": 6.638513513513513,
"grad_norm": 0.37497058510780334,
"learning_rate": 6.129628035845861e-05,
"loss": 0.7658,
"step": 1965
},
{
"epoch": 6.655405405405405,
"grad_norm": 0.36465275287628174,
"learning_rate": 6.0753272045783625e-05,
"loss": 0.7666,
"step": 1970
},
{
"epoch": 6.672297297297297,
"grad_norm": 0.3648873567581177,
"learning_rate": 6.021162823301358e-05,
"loss": 0.7661,
"step": 1975
},
{
"epoch": 6.6891891891891895,
"grad_norm": 0.3486686646938324,
"learning_rate": 5.967136775160187e-05,
"loss": 0.7638,
"step": 1980
},
{
"epoch": 6.706081081081081,
"grad_norm": 0.36590924859046936,
"learning_rate": 5.913250938490744e-05,
"loss": 0.7753,
"step": 1985
},
{
"epoch": 6.722972972972973,
"grad_norm": 0.36060139536857605,
"learning_rate": 5.859507186754146e-05,
"loss": 0.778,
"step": 1990
},
{
"epoch": 6.739864864864865,
"grad_norm": 0.4011731743812561,
"learning_rate": 5.80590738847162e-05,
"loss": 0.7653,
"step": 1995
},
{
"epoch": 6.756756756756757,
"grad_norm": 0.38411641120910645,
"learning_rate": 5.752453407159522e-05,
"loss": 0.76,
"step": 2000
},
{
"epoch": 6.773648648648649,
"grad_norm": 0.37505170702934265,
"learning_rate": 5.699147101264566e-05,
"loss": 0.7709,
"step": 2005
},
{
"epoch": 6.79054054054054,
"grad_norm": 0.3904276192188263,
"learning_rate": 5.645990324099197e-05,
"loss": 0.7659,
"step": 2010
},
{
"epoch": 6.8074324324324325,
"grad_norm": 0.3751082420349121,
"learning_rate": 5.5929849237771556e-05,
"loss": 0.7564,
"step": 2015
},
{
"epoch": 6.824324324324325,
"grad_norm": 0.3594505488872528,
"learning_rate": 5.540132743149242e-05,
"loss": 0.7723,
"step": 2020
},
{
"epoch": 6.841216216216216,
"grad_norm": 0.3686336874961853,
"learning_rate": 5.487435619739214e-05,
"loss": 0.7645,
"step": 2025
},
{
"epoch": 6.858108108108108,
"grad_norm": 0.37959080934524536,
"learning_rate": 5.434895385679937e-05,
"loss": 0.761,
"step": 2030
},
{
"epoch": 6.875,
"grad_norm": 0.38148415088653564,
"learning_rate": 5.382513867649663e-05,
"loss": 0.766,
"step": 2035
},
{
"epoch": 6.891891891891892,
"grad_norm": 0.37155023217201233,
"learning_rate": 5.33029288680852e-05,
"loss": 0.7753,
"step": 2040
},
{
"epoch": 6.908783783783784,
"grad_norm": 0.3691665828227997,
"learning_rate": 5.2782342587352154e-05,
"loss": 0.7641,
"step": 2045
},
{
"epoch": 6.925675675675675,
"grad_norm": 0.4007939398288727,
"learning_rate": 5.226339793363898e-05,
"loss": 0.7717,
"step": 2050
},
{
"epoch": 6.9425675675675675,
"grad_norm": 0.36151981353759766,
"learning_rate": 5.174611294921224e-05,
"loss": 0.7832,
"step": 2055
},
{
"epoch": 6.95945945945946,
"grad_norm": 0.38270819187164307,
"learning_rate": 5.123050561863657e-05,
"loss": 0.7619,
"step": 2060
},
{
"epoch": 6.976351351351351,
"grad_norm": 0.35164088010787964,
"learning_rate": 5.071659386814907e-05,
"loss": 0.7725,
"step": 2065
},
{
"epoch": 6.993243243243243,
"grad_norm": 0.3853191137313843,
"learning_rate": 5.020439556503629e-05,
"loss": 0.7654,
"step": 2070
},
{
"epoch": 7.0,
"eval_loss": 2.40800142288208,
"eval_runtime": 0.394,
"eval_samples_per_second": 5.076,
"eval_steps_per_second": 2.538,
"step": 2072
},
{
"epoch": 7.010135135135135,
"grad_norm": 0.3015079200267792,
"learning_rate": 4.969392851701305e-05,
"loss": 0.7406,
"step": 2075
},
{
"epoch": 7.027027027027027,
"grad_norm": 0.47633570432662964,
"learning_rate": 4.918521047160308e-05,
"loss": 0.7101,
"step": 2080
},
{
"epoch": 7.043918918918919,
"grad_norm": 0.31147924065589905,
"learning_rate": 4.8678259115522215e-05,
"loss": 0.7144,
"step": 2085
},
{
"epoch": 7.0608108108108105,
"grad_norm": 0.3377055823802948,
"learning_rate": 4.817309207406346e-05,
"loss": 0.7091,
"step": 2090
},
{
"epoch": 7.077702702702703,
"grad_norm": 0.3804275393486023,
"learning_rate": 4.7669726910484e-05,
"loss": 0.7083,
"step": 2095
},
{
"epoch": 7.094594594594595,
"grad_norm": 0.3246239721775055,
"learning_rate": 4.716818112539485e-05,
"loss": 0.7076,
"step": 2100
},
{
"epoch": 7.111486486486487,
"grad_norm": 0.3758985996246338,
"learning_rate": 4.666847215615226e-05,
"loss": 0.7112,
"step": 2105
},
{
"epoch": 7.128378378378378,
"grad_norm": 0.3744657337665558,
"learning_rate": 4.617061737625139e-05,
"loss": 0.714,
"step": 2110
},
{
"epoch": 7.14527027027027,
"grad_norm": 0.35453036427497864,
"learning_rate": 4.567463409472255e-05,
"loss": 0.7144,
"step": 2115
},
{
"epoch": 7.162162162162162,
"grad_norm": 0.36035045981407166,
"learning_rate": 4.518053955552903e-05,
"loss": 0.7153,
"step": 2120
},
{
"epoch": 7.179054054054054,
"grad_norm": 0.362409383058548,
"learning_rate": 4.468835093696796e-05,
"loss": 0.7179,
"step": 2125
},
{
"epoch": 7.195945945945946,
"grad_norm": 0.4178987145423889,
"learning_rate": 4.419808535107287e-05,
"loss": 0.7109,
"step": 2130
},
{
"epoch": 7.212837837837838,
"grad_norm": 0.36226364970207214,
"learning_rate": 4.370975984301866e-05,
"loss": 0.7112,
"step": 2135
},
{
"epoch": 7.22972972972973,
"grad_norm": 0.34748539328575134,
"learning_rate": 4.322339139052921e-05,
"loss": 0.7115,
"step": 2140
},
{
"epoch": 7.246621621621622,
"grad_norm": 0.3634675443172455,
"learning_rate": 4.273899690328702e-05,
"loss": 0.7043,
"step": 2145
},
{
"epoch": 7.263513513513513,
"grad_norm": 0.3675166070461273,
"learning_rate": 4.2256593222345185e-05,
"loss": 0.7124,
"step": 2150
},
{
"epoch": 7.280405405405405,
"grad_norm": 0.33852246403694153,
"learning_rate": 4.177619711954211e-05,
"loss": 0.7122,
"step": 2155
},
{
"epoch": 7.297297297297297,
"grad_norm": 0.34997648000717163,
"learning_rate": 4.129782529691815e-05,
"loss": 0.7161,
"step": 2160
},
{
"epoch": 7.3141891891891895,
"grad_norm": 0.3947296738624573,
"learning_rate": 4.082149438613514e-05,
"loss": 0.715,
"step": 2165
},
{
"epoch": 7.331081081081081,
"grad_norm": 0.3766041696071625,
"learning_rate": 4.034722094789809e-05,
"loss": 0.7104,
"step": 2170
},
{
"epoch": 7.347972972972973,
"grad_norm": 0.39250659942626953,
"learning_rate": 3.987502147137928e-05,
"loss": 0.7157,
"step": 2175
},
{
"epoch": 7.364864864864865,
"grad_norm": 0.356827050447464,
"learning_rate": 3.9404912373645185e-05,
"loss": 0.7104,
"step": 2180
},
{
"epoch": 7.381756756756757,
"grad_norm": 0.3731355369091034,
"learning_rate": 3.893690999908562e-05,
"loss": 0.7167,
"step": 2185
},
{
"epoch": 7.398648648648648,
"grad_norm": 0.3654830753803253,
"learning_rate": 3.8471030618845375e-05,
"loss": 0.7151,
"step": 2190
},
{
"epoch": 7.41554054054054,
"grad_norm": 0.3466781675815582,
"learning_rate": 3.800729043025871e-05,
"loss": 0.7208,
"step": 2195
},
{
"epoch": 7.4324324324324325,
"grad_norm": 0.37476223707199097,
"learning_rate": 3.7545705556286126e-05,
"loss": 0.7083,
"step": 2200
},
{
"epoch": 7.449324324324325,
"grad_norm": 0.361871600151062,
"learning_rate": 3.708629204495371e-05,
"loss": 0.7195,
"step": 2205
},
{
"epoch": 7.466216216216216,
"grad_norm": 0.3652123510837555,
"learning_rate": 3.662906586879542e-05,
"loss": 0.7132,
"step": 2210
},
{
"epoch": 7.483108108108108,
"grad_norm": 0.36584657430648804,
"learning_rate": 3.61740429242975e-05,
"loss": 0.71,
"step": 2215
},
{
"epoch": 7.5,
"grad_norm": 0.34037116169929504,
"learning_rate": 3.5721239031346066e-05,
"loss": 0.7175,
"step": 2220
},
{
"epoch": 7.516891891891892,
"grad_norm": 0.34989210963249207,
"learning_rate": 3.5270669932676926e-05,
"loss": 0.7236,
"step": 2225
},
{
"epoch": 7.533783783783784,
"grad_norm": 0.35882651805877686,
"learning_rate": 3.48223512933282e-05,
"loss": 0.7159,
"step": 2230
},
{
"epoch": 7.550675675675675,
"grad_norm": 0.32638296484947205,
"learning_rate": 3.437629870009591e-05,
"loss": 0.7221,
"step": 2235
},
{
"epoch": 7.5675675675675675,
"grad_norm": 0.37272724509239197,
"learning_rate": 3.393252766099187e-05,
"loss": 0.7132,
"step": 2240
},
{
"epoch": 7.58445945945946,
"grad_norm": 0.3713020086288452,
"learning_rate": 3.349105360470456e-05,
"loss": 0.7246,
"step": 2245
},
{
"epoch": 7.601351351351351,
"grad_norm": 0.35202324390411377,
"learning_rate": 3.305189188006281e-05,
"loss": 0.7289,
"step": 2250
},
{
"epoch": 7.618243243243243,
"grad_norm": 0.3543793559074402,
"learning_rate": 3.2615057755502e-05,
"loss": 0.7129,
"step": 2255
},
{
"epoch": 7.635135135135135,
"grad_norm": 0.3830936849117279,
"learning_rate": 3.218056641853337e-05,
"loss": 0.7287,
"step": 2260
},
{
"epoch": 7.652027027027027,
"grad_norm": 0.36788904666900635,
"learning_rate": 3.174843297521596e-05,
"loss": 0.7107,
"step": 2265
},
{
"epoch": 7.668918918918919,
"grad_norm": 0.34784045815467834,
"learning_rate": 3.1318672449631284e-05,
"loss": 0.7129,
"step": 2270
},
{
"epoch": 7.6858108108108105,
"grad_norm": 0.3825985789299011,
"learning_rate": 3.089129978336118e-05,
"loss": 0.7048,
"step": 2275
},
{
"epoch": 7.702702702702703,
"grad_norm": 0.4050070643424988,
"learning_rate": 3.0466329834968233e-05,
"loss": 0.7165,
"step": 2280
},
{
"epoch": 7.719594594594595,
"grad_norm": 0.3602808117866516,
"learning_rate": 3.0043777379479098e-05,
"loss": 0.7163,
"step": 2285
},
{
"epoch": 7.736486486486487,
"grad_norm": 0.35466307401657104,
"learning_rate": 2.9623657107870996e-05,
"loss": 0.7149,
"step": 2290
},
{
"epoch": 7.753378378378378,
"grad_norm": 0.3452269732952118,
"learning_rate": 2.9205983626560874e-05,
"loss": 0.7196,
"step": 2295
},
{
"epoch": 7.77027027027027,
"grad_norm": 0.3634475767612457,
"learning_rate": 2.879077145689746e-05,
"loss": 0.7153,
"step": 2300
},
{
"epoch": 7.787162162162162,
"grad_norm": 0.3627691864967346,
"learning_rate": 2.8378035034656625e-05,
"loss": 0.7112,
"step": 2305
},
{
"epoch": 7.804054054054054,
"grad_norm": 0.3404904901981354,
"learning_rate": 2.7967788709539233e-05,
"loss": 0.7159,
"step": 2310
},
{
"epoch": 7.820945945945946,
"grad_norm": 0.38526642322540283,
"learning_rate": 2.7560046744672495e-05,
"loss": 0.7218,
"step": 2315
},
{
"epoch": 7.837837837837838,
"grad_norm": 0.354755699634552,
"learning_rate": 2.7154823316113932e-05,
"loss": 0.7123,
"step": 2320
},
{
"epoch": 7.85472972972973,
"grad_norm": 0.3782195746898651,
"learning_rate": 2.6752132512358475e-05,
"loss": 0.7091,
"step": 2325
},
{
"epoch": 7.871621621621622,
"grad_norm": 0.39233171939849854,
"learning_rate": 2.6351988333848788e-05,
"loss": 0.7208,
"step": 2330
},
{
"epoch": 7.888513513513513,
"grad_norm": 0.4432124197483063,
"learning_rate": 2.5954404692488433e-05,
"loss": 0.7032,
"step": 2335
},
{
"epoch": 7.905405405405405,
"grad_norm": 0.3653867542743683,
"learning_rate": 2.5559395411158115e-05,
"loss": 0.7246,
"step": 2340
},
{
"epoch": 7.922297297297297,
"grad_norm": 0.37708407640457153,
"learning_rate": 2.5166974223235296e-05,
"loss": 0.7135,
"step": 2345
},
{
"epoch": 7.9391891891891895,
"grad_norm": 0.3550487160682678,
"learning_rate": 2.4777154772116496e-05,
"loss": 0.7105,
"step": 2350
},
{
"epoch": 7.956081081081081,
"grad_norm": 0.35054445266723633,
"learning_rate": 2.438995061074314e-05,
"loss": 0.7179,
"step": 2355
},
{
"epoch": 7.972972972972973,
"grad_norm": 0.35555845499038696,
"learning_rate": 2.4005375201130274e-05,
"loss": 0.7076,
"step": 2360
},
{
"epoch": 7.989864864864865,
"grad_norm": 0.38198524713516235,
"learning_rate": 2.362344191389846e-05,
"loss": 0.7117,
"step": 2365
},
{
"epoch": 8.0,
"eval_loss": 2.655390977859497,
"eval_runtime": 0.3941,
"eval_samples_per_second": 5.074,
"eval_steps_per_second": 2.537,
"step": 2368
},
{
"epoch": 8.006756756756756,
"grad_norm": 0.2672366499900818,
"learning_rate": 2.324416402780907e-05,
"loss": 0.7016,
"step": 2370
},
{
"epoch": 8.02364864864865,
"grad_norm": 0.3170325756072998,
"learning_rate": 2.2867554729302542e-05,
"loss": 0.6812,
"step": 2375
},
{
"epoch": 8.04054054054054,
"grad_norm": 0.3713083863258362,
"learning_rate": 2.249362711203985e-05,
"loss": 0.6825,
"step": 2380
},
{
"epoch": 8.057432432432432,
"grad_norm": 0.3441585898399353,
"learning_rate": 2.2122394176447416e-05,
"loss": 0.6786,
"step": 2385
},
{
"epoch": 8.074324324324325,
"grad_norm": 0.29649627208709717,
"learning_rate": 2.1753868829265046e-05,
"loss": 0.671,
"step": 2390
},
{
"epoch": 8.091216216216216,
"grad_norm": 0.31710395216941833,
"learning_rate": 2.1388063883097152e-05,
"loss": 0.6788,
"step": 2395
},
{
"epoch": 8.108108108108109,
"grad_norm": 0.3464438319206238,
"learning_rate": 2.102499205596743e-05,
"loss": 0.6843,
"step": 2400
},
{
"epoch": 8.125,
"grad_norm": 0.3463502824306488,
"learning_rate": 2.0664665970876496e-05,
"loss": 0.6896,
"step": 2405
},
{
"epoch": 8.141891891891891,
"grad_norm": 0.32347431778907776,
"learning_rate": 2.0307098155363236e-05,
"loss": 0.6949,
"step": 2410
},
{
"epoch": 8.158783783783784,
"grad_norm": 0.30408981442451477,
"learning_rate": 1.9952301041069122e-05,
"loss": 0.6808,
"step": 2415
},
{
"epoch": 8.175675675675675,
"grad_norm": 0.3631693124771118,
"learning_rate": 1.9600286963305957e-05,
"loss": 0.6882,
"step": 2420
},
{
"epoch": 8.192567567567568,
"grad_norm": 0.31960511207580566,
"learning_rate": 1.9251068160627173e-05,
"loss": 0.6849,
"step": 2425
},
{
"epoch": 8.20945945945946,
"grad_norm": 0.3153926134109497,
"learning_rate": 1.8904656774402208e-05,
"loss": 0.6768,
"step": 2430
},
{
"epoch": 8.22635135135135,
"grad_norm": 0.3084424138069153,
"learning_rate": 1.8561064848394382e-05,
"loss": 0.6744,
"step": 2435
},
{
"epoch": 8.243243243243244,
"grad_norm": 0.3217174708843231,
"learning_rate": 1.8220304328342252e-05,
"loss": 0.6882,
"step": 2440
},
{
"epoch": 8.260135135135135,
"grad_norm": 0.3653244972229004,
"learning_rate": 1.7882387061544182e-05,
"loss": 0.6812,
"step": 2445
},
{
"epoch": 8.277027027027026,
"grad_norm": 0.32076555490493774,
"learning_rate": 1.754732479644655e-05,
"loss": 0.6835,
"step": 2450
},
{
"epoch": 8.29391891891892,
"grad_norm": 0.35145509243011475,
"learning_rate": 1.721512918223527e-05,
"loss": 0.6885,
"step": 2455
},
{
"epoch": 8.31081081081081,
"grad_norm": 0.3196760416030884,
"learning_rate": 1.688581176843066e-05,
"loss": 0.6814,
"step": 2460
},
{
"epoch": 8.327702702702704,
"grad_norm": 0.34739652276039124,
"learning_rate": 1.6559384004486055e-05,
"loss": 0.6856,
"step": 2465
},
{
"epoch": 8.344594594594595,
"grad_norm": 0.3565291166305542,
"learning_rate": 1.6235857239389696e-05,
"loss": 0.6849,
"step": 2470
},
{
"epoch": 8.361486486486486,
"grad_norm": 0.3656858205795288,
"learning_rate": 1.5915242721270074e-05,
"loss": 0.681,
"step": 2475
},
{
"epoch": 8.378378378378379,
"grad_norm": 0.32651442289352417,
"learning_rate": 1.5597551597004966e-05,
"loss": 0.683,
"step": 2480
},
{
"epoch": 8.39527027027027,
"grad_norm": 0.3386393189430237,
"learning_rate": 1.5282794911833887e-05,
"loss": 0.6823,
"step": 2485
},
{
"epoch": 8.412162162162161,
"grad_norm": 0.31998586654663086,
"learning_rate": 1.4970983608973942e-05,
"loss": 0.6788,
"step": 2490
},
{
"epoch": 8.429054054054054,
"grad_norm": 0.34341830015182495,
"learning_rate": 1.4662128529239572e-05,
"loss": 0.6944,
"step": 2495
},
{
"epoch": 8.445945945945946,
"grad_norm": 0.32450416684150696,
"learning_rate": 1.4356240410665433e-05,
"loss": 0.6946,
"step": 2500
},
{
"epoch": 8.462837837837839,
"grad_norm": 0.3322451710700989,
"learning_rate": 1.4053329888133238e-05,
"loss": 0.683,
"step": 2505
},
{
"epoch": 8.47972972972973,
"grad_norm": 0.3628733456134796,
"learning_rate": 1.3753407493001968e-05,
"loss": 0.6824,
"step": 2510
},
{
"epoch": 8.496621621621621,
"grad_norm": 0.3203790783882141,
"learning_rate": 1.3456483652741591e-05,
"loss": 0.6843,
"step": 2515
},
{
"epoch": 8.513513513513514,
"grad_norm": 0.3382638096809387,
"learning_rate": 1.3162568690570743e-05,
"loss": 0.6882,
"step": 2520
},
{
"epoch": 8.530405405405405,
"grad_norm": 0.34006133675575256,
"learning_rate": 1.287167282509767e-05,
"loss": 0.6781,
"step": 2525
},
{
"epoch": 8.547297297297296,
"grad_norm": 0.33302438259124756,
"learning_rate": 1.2583806169964961e-05,
"loss": 0.6818,
"step": 2530
},
{
"epoch": 8.56418918918919,
"grad_norm": 0.35714635252952576,
"learning_rate": 1.2298978733498035e-05,
"loss": 0.6903,
"step": 2535
},
{
"epoch": 8.58108108108108,
"grad_norm": 0.34445202350616455,
"learning_rate": 1.2017200418357078e-05,
"loss": 0.6884,
"step": 2540
},
{
"epoch": 8.597972972972974,
"grad_norm": 0.35791710019111633,
"learning_rate": 1.1738481021192704e-05,
"loss": 0.6805,
"step": 2545
},
{
"epoch": 8.614864864864865,
"grad_norm": 0.4606862962245941,
"learning_rate": 1.14628302323056e-05,
"loss": 0.6833,
"step": 2550
},
{
"epoch": 8.631756756756756,
"grad_norm": 0.3396778702735901,
"learning_rate": 1.1190257635309275e-05,
"loss": 0.6788,
"step": 2555
},
{
"epoch": 8.64864864864865,
"grad_norm": 0.3137703537940979,
"learning_rate": 1.0920772706797167e-05,
"loss": 0.6778,
"step": 2560
},
{
"epoch": 8.66554054054054,
"grad_norm": 0.3266281187534332,
"learning_rate": 1.0654384816012953e-05,
"loss": 0.6928,
"step": 2565
},
{
"epoch": 8.682432432432432,
"grad_norm": 0.33806994557380676,
"learning_rate": 1.0391103224524956e-05,
"loss": 0.694,
"step": 2570
},
{
"epoch": 8.699324324324325,
"grad_norm": 0.3242711126804352,
"learning_rate": 1.013093708590408e-05,
"loss": 0.6769,
"step": 2575
},
{
"epoch": 8.716216216216216,
"grad_norm": 0.3551606833934784,
"learning_rate": 9.873895445405523e-06,
"loss": 0.6824,
"step": 2580
},
{
"epoch": 8.733108108108109,
"grad_norm": 0.34394511580467224,
"learning_rate": 9.619987239654405e-06,
"loss": 0.681,
"step": 2585
},
{
"epoch": 8.75,
"grad_norm": 0.35514023900032043,
"learning_rate": 9.369221296335006e-06,
"loss": 0.6908,
"step": 2590
},
{
"epoch": 8.766891891891891,
"grad_norm": 0.31281572580337524,
"learning_rate": 9.121606333883792e-06,
"loss": 0.6881,
"step": 2595
},
{
"epoch": 8.783783783783784,
"grad_norm": 0.3141974210739136,
"learning_rate": 8.87715096118642e-06,
"loss": 0.6797,
"step": 2600
},
{
"epoch": 8.800675675675675,
"grad_norm": 0.3446739912033081,
"learning_rate": 8.635863677278378e-06,
"loss": 0.6862,
"step": 2605
},
{
"epoch": 8.817567567567568,
"grad_norm": 0.3194230794906616,
"learning_rate": 8.397752871049436e-06,
"loss": 0.6764,
"step": 2610
},
{
"epoch": 8.83445945945946,
"grad_norm": 0.3229275047779083,
"learning_rate": 8.162826820952097e-06,
"loss": 0.6868,
"step": 2615
},
{
"epoch": 8.85135135135135,
"grad_norm": 0.3260205388069153,
"learning_rate": 7.931093694713687e-06,
"loss": 0.6917,
"step": 2620
},
{
"epoch": 8.868243243243244,
"grad_norm": 0.3324912190437317,
"learning_rate": 7.702561549052445e-06,
"loss": 0.6748,
"step": 2625
},
{
"epoch": 8.885135135135135,
"grad_norm": 0.3662506937980652,
"learning_rate": 7.477238329397418e-06,
"loss": 0.6918,
"step": 2630
},
{
"epoch": 8.902027027027026,
"grad_norm": 0.3210934102535248,
"learning_rate": 7.255131869612108e-06,
"loss": 0.694,
"step": 2635
},
{
"epoch": 8.91891891891892,
"grad_norm": 0.35362377762794495,
"learning_rate": 7.03624989172228e-06,
"loss": 0.678,
"step": 2640
},
{
"epoch": 8.93581081081081,
"grad_norm": 0.3684268295764923,
"learning_rate": 6.820600005647382e-06,
"loss": 0.6913,
"step": 2645
},
{
"epoch": 8.952702702702704,
"grad_norm": 0.3233438730239868,
"learning_rate": 6.608189708935964e-06,
"loss": 0.6818,
"step": 2650
},
{
"epoch": 8.969594594594595,
"grad_norm": 0.3141653537750244,
"learning_rate": 6.3990263865050695e-06,
"loss": 0.6843,
"step": 2655
},
{
"epoch": 8.986486486486486,
"grad_norm": 0.32477766275405884,
"learning_rate": 6.1931173103834115e-06,
"loss": 0.6916,
"step": 2660
},
{
"epoch": 9.0,
"eval_loss": 2.91719913482666,
"eval_runtime": 0.3931,
"eval_samples_per_second": 5.088,
"eval_steps_per_second": 2.544,
"step": 2664
},
{
"epoch": 9.003378378378379,
"grad_norm": 0.26982101798057556,
"learning_rate": 5.9904696394586405e-06,
"loss": 0.6797,
"step": 2665
},
{
"epoch": 9.02027027027027,
"grad_norm": 0.25923973321914673,
"learning_rate": 5.791090419228351e-06,
"loss": 0.6622,
"step": 2670
},
{
"epoch": 9.037162162162161,
"grad_norm": 0.2826623022556305,
"learning_rate": 5.594986581555173e-06,
"loss": 0.6712,
"step": 2675
},
{
"epoch": 9.054054054054054,
"grad_norm": 0.3254013657569885,
"learning_rate": 5.402164944425758e-06,
"loss": 0.6644,
"step": 2680
},
{
"epoch": 9.070945945945946,
"grad_norm": 0.3010416626930237,
"learning_rate": 5.212632211713797e-06,
"loss": 0.6741,
"step": 2685
},
{
"epoch": 9.087837837837839,
"grad_norm": 0.36080434918403625,
"learning_rate": 5.026394972946813e-06,
"loss": 0.6675,
"step": 2690
},
{
"epoch": 9.10472972972973,
"grad_norm": 0.2993578314781189,
"learning_rate": 4.843459703077202e-06,
"loss": 0.6798,
"step": 2695
},
{
"epoch": 9.121621621621621,
"grad_norm": 0.3179502785205841,
"learning_rate": 4.66383276225707e-06,
"loss": 0.6769,
"step": 2700
},
{
"epoch": 9.138513513513514,
"grad_norm": 0.29940682649612427,
"learning_rate": 4.487520395617029e-06,
"loss": 0.6624,
"step": 2705
},
{
"epoch": 9.155405405405405,
"grad_norm": 0.340571790933609,
"learning_rate": 4.314528733049206e-06,
"loss": 0.6626,
"step": 2710
},
{
"epoch": 9.172297297297296,
"grad_norm": 0.3252014219760895,
"learning_rate": 4.144863788993991e-06,
"loss": 0.6798,
"step": 2715
},
{
"epoch": 9.18918918918919,
"grad_norm": 0.3229629397392273,
"learning_rate": 3.9785314622310495e-06,
"loss": 0.675,
"step": 2720
},
{
"epoch": 9.20608108108108,
"grad_norm": 0.2807313799858093,
"learning_rate": 3.815537535674174e-06,
"loss": 0.6765,
"step": 2725
},
{
"epoch": 9.222972972972974,
"grad_norm": 0.2970227599143982,
"learning_rate": 3.655887676170222e-06,
"loss": 0.6678,
"step": 2730
},
{
"epoch": 9.239864864864865,
"grad_norm": 0.3028378486633301,
"learning_rate": 3.4995874343021094e-06,
"loss": 0.6728,
"step": 2735
},
{
"epoch": 9.256756756756756,
"grad_norm": 0.31875497102737427,
"learning_rate": 3.3466422441958634e-06,
"loss": 0.6761,
"step": 2740
},
{
"epoch": 9.27364864864865,
"grad_norm": 0.319791316986084,
"learning_rate": 3.1970574233316397e-06,
"loss": 0.6623,
"step": 2745
},
{
"epoch": 9.29054054054054,
"grad_norm": 0.2787954807281494,
"learning_rate": 3.050838172358883e-06,
"loss": 0.6679,
"step": 2750
},
{
"epoch": 9.307432432432432,
"grad_norm": 0.3424752950668335,
"learning_rate": 2.9079895749154927e-06,
"loss": 0.6626,
"step": 2755
},
{
"epoch": 9.324324324324325,
"grad_norm": 0.302796334028244,
"learning_rate": 2.7685165974510986e-06,
"loss": 0.6721,
"step": 2760
},
{
"epoch": 9.341216216216216,
"grad_norm": 0.29907652735710144,
"learning_rate": 2.6324240890544193e-06,
"loss": 0.6696,
"step": 2765
},
{
"epoch": 9.358108108108109,
"grad_norm": 0.31798794865608215,
"learning_rate": 2.499716781284556e-06,
"loss": 0.6705,
"step": 2770
},
{
"epoch": 9.375,
"grad_norm": 0.3437163829803467,
"learning_rate": 2.3703992880066638e-06,
"loss": 0.6726,
"step": 2775
},
{
"epoch": 9.391891891891891,
"grad_norm": 0.2871228754520416,
"learning_rate": 2.2444761052313856e-06,
"loss": 0.6715,
"step": 2780
},
{
"epoch": 9.408783783783784,
"grad_norm": 0.2895369827747345,
"learning_rate": 2.1219516109586056e-06,
"loss": 0.6812,
"step": 2785
},
{
"epoch": 9.425675675675675,
"grad_norm": 0.29224950075149536,
"learning_rate": 2.002830065025263e-06,
"loss": 0.6788,
"step": 2790
},
{
"epoch": 9.442567567567568,
"grad_norm": 0.32183554768562317,
"learning_rate": 1.8871156089572018e-06,
"loss": 0.6731,
"step": 2795
},
{
"epoch": 9.45945945945946,
"grad_norm": 0.30122798681259155,
"learning_rate": 1.7748122658251876e-06,
"loss": 0.6724,
"step": 2800
},
{
"epoch": 9.47635135135135,
"grad_norm": 0.3141665458679199,
"learning_rate": 1.665923940105074e-06,
"loss": 0.6725,
"step": 2805
},
{
"epoch": 9.493243243243244,
"grad_norm": 0.3088280260562897,
"learning_rate": 1.56045441754199e-06,
"loss": 0.6748,
"step": 2810
},
{
"epoch": 9.510135135135135,
"grad_norm": 0.2941664457321167,
"learning_rate": 1.4584073650187878e-06,
"loss": 0.6656,
"step": 2815
},
{
"epoch": 9.527027027027026,
"grad_norm": 0.3141193687915802,
"learning_rate": 1.3597863304285475e-06,
"loss": 0.6732,
"step": 2820
},
{
"epoch": 9.54391891891892,
"grad_norm": 0.29874399304389954,
"learning_rate": 1.2645947425511395e-06,
"loss": 0.6749,
"step": 2825
},
{
"epoch": 9.56081081081081,
"grad_norm": 0.2963665723800659,
"learning_rate": 1.1728359109341446e-06,
"loss": 0.6737,
"step": 2830
},
{
"epoch": 9.577702702702704,
"grad_norm": 0.2843508720397949,
"learning_rate": 1.0845130257777114e-06,
"loss": 0.6758,
"step": 2835
},
{
"epoch": 9.594594594594595,
"grad_norm": 0.3049289882183075,
"learning_rate": 9.996291578236228e-07,
"loss": 0.6771,
"step": 2840
},
{
"epoch": 9.611486486486486,
"grad_norm": 0.2939779460430145,
"learning_rate": 9.18187258248604e-07,
"loss": 0.6655,
"step": 2845
},
{
"epoch": 9.628378378378379,
"grad_norm": 0.2909776270389557,
"learning_rate": 8.401901585616823e-07,
"loss": 0.6745,
"step": 2850
},
{
"epoch": 9.64527027027027,
"grad_norm": 0.3179049491882324,
"learning_rate": 7.656405705057435e-07,
"loss": 0.6621,
"step": 2855
},
{
"epoch": 9.662162162162161,
"grad_norm": 0.31932151317596436,
"learning_rate": 6.945410859632295e-07,
"loss": 0.6615,
"step": 2860
},
{
"epoch": 9.679054054054054,
"grad_norm": 0.33364373445510864,
"learning_rate": 6.268941768660886e-07,
"loss": 0.6752,
"step": 2865
},
{
"epoch": 9.695945945945946,
"grad_norm": 0.3177048861980438,
"learning_rate": 5.627021951097545e-07,
"loss": 0.6793,
"step": 2870
},
{
"epoch": 9.712837837837839,
"grad_norm": 0.3155570328235626,
"learning_rate": 5.019673724714458e-07,
"loss": 0.6557,
"step": 2875
},
{
"epoch": 9.72972972972973,
"grad_norm": 0.2894674241542816,
"learning_rate": 4.44691820532539e-07,
"loss": 0.6679,
"step": 2880
},
{
"epoch": 9.746621621621621,
"grad_norm": 0.2958497405052185,
"learning_rate": 3.908775306051604e-07,
"loss": 0.667,
"step": 2885
},
{
"epoch": 9.763513513513514,
"grad_norm": 0.30099859833717346,
"learning_rate": 3.405263736629416e-07,
"loss": 0.6828,
"step": 2890
},
{
"epoch": 9.780405405405405,
"grad_norm": 0.28636157512664795,
"learning_rate": 2.9364010027599364e-07,
"loss": 0.6692,
"step": 2895
},
{
"epoch": 9.797297297297296,
"grad_norm": 0.3025320768356323,
"learning_rate": 2.5022034055003364e-07,
"loss": 0.6735,
"step": 2900
},
{
"epoch": 9.81418918918919,
"grad_norm": 0.3249851167201996,
"learning_rate": 2.1026860406970772e-07,
"loss": 0.6678,
"step": 2905
},
{
"epoch": 9.83108108108108,
"grad_norm": 0.29301130771636963,
"learning_rate": 1.7378627984612207e-07,
"loss": 0.6773,
"step": 2910
},
{
"epoch": 9.847972972972974,
"grad_norm": 0.3139761686325073,
"learning_rate": 1.4077463626852582e-07,
"loss": 0.6585,
"step": 2915
},
{
"epoch": 9.864864864864865,
"grad_norm": 0.2988782227039337,
"learning_rate": 1.1123482106021322e-07,
"loss": 0.6832,
"step": 2920
},
{
"epoch": 9.881756756756756,
"grad_norm": 0.34064996242523193,
"learning_rate": 8.516786123867748e-08,
"loss": 0.6655,
"step": 2925
},
{
"epoch": 9.89864864864865,
"grad_norm": 0.31197425723075867,
"learning_rate": 6.25746630798063e-08,
"loss": 0.666,
"step": 2930
},
{
"epoch": 9.91554054054054,
"grad_norm": 0.3064212203025818,
"learning_rate": 4.3456012086462436e-08,
"loss": 0.6671,
"step": 2935
},
{
"epoch": 9.932432432432432,
"grad_norm": 0.3351791501045227,
"learning_rate": 2.7812572961127824e-08,
"loss": 0.6746,
"step": 2940
},
{
"epoch": 9.949324324324325,
"grad_norm": 0.2936666011810303,
"learning_rate": 1.564488958279986e-08,
"loss": 0.6647,
"step": 2945
},
{
"epoch": 9.966216216216216,
"grad_norm": 0.29400986433029175,
"learning_rate": 6.953384988095391e-09,
"loss": 0.6718,
"step": 2950
},
{
"epoch": 9.983108108108109,
"grad_norm": 0.32395681738853455,
"learning_rate": 1.7383613565291612e-09,
"loss": 0.6809,
"step": 2955
},
{
"epoch": 10.0,
"grad_norm": 0.2938132882118225,
"learning_rate": 0.0,
"loss": 0.6652,
"step": 2960
},
{
"epoch": 10.0,
"eval_loss": 3.0329792499542236,
"eval_runtime": 0.4325,
"eval_samples_per_second": 4.624,
"eval_steps_per_second": 2.312,
"step": 2960
},
{
"epoch": 10.0,
"step": 2960,
"total_flos": 4.416382035459834e+18,
"train_loss": 0.922980490487975,
"train_runtime": 12382.7598,
"train_samples_per_second": 7.645,
"train_steps_per_second": 0.239
}
],
"logging_steps": 5,
"max_steps": 2960,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.416382035459834e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}