{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 2960,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0033783783783783786,
      "grad_norm": 9.802501678466797,
      "learning_rate": 6.756756756756758e-07,
      "loss": 2.6921,
      "step": 1
    },
    {
      "epoch": 0.016891891891891893,
      "grad_norm": 10.374316215515137,
      "learning_rate": 3.3783783783783788e-06,
      "loss": 2.7409,
      "step": 5
    },
    {
      "epoch": 0.033783783783783786,
      "grad_norm": 9.445246696472168,
      "learning_rate": 6.7567567567567575e-06,
      "loss": 2.6534,
      "step": 10
    },
    {
      "epoch": 0.05067567567567568,
      "grad_norm": 3.71943736076355,
      "learning_rate": 1.0135135135135136e-05,
      "loss": 2.4339,
      "step": 15
    },
    {
      "epoch": 0.06756756756756757,
      "grad_norm": 1.7139111757278442,
      "learning_rate": 1.3513513513513515e-05,
      "loss": 2.2659,
      "step": 20
    },
    {
      "epoch": 0.08445945945945946,
      "grad_norm": 0.7590915560722351,
      "learning_rate": 1.6891891891891892e-05,
      "loss": 2.1065,
      "step": 25
    },
    {
      "epoch": 0.10135135135135136,
      "grad_norm": 0.6881681680679321,
      "learning_rate": 2.0270270270270273e-05,
      "loss": 1.9905,
      "step": 30
    },
    {
      "epoch": 0.11824324324324324,
      "grad_norm": 0.6322100162506104,
      "learning_rate": 2.364864864864865e-05,
      "loss": 1.8675,
      "step": 35
    },
    {
      "epoch": 0.13513513513513514,
      "grad_norm": 0.6217833757400513,
      "learning_rate": 2.702702702702703e-05,
      "loss": 1.7354,
      "step": 40
    },
    {
      "epoch": 0.15202702702702703,
      "grad_norm": 0.4574459493160248,
      "learning_rate": 3.0405405405405407e-05,
      "loss": 1.6276,
      "step": 45
    },
    {
      "epoch": 0.16891891891891891,
      "grad_norm": 0.3643452525138855,
      "learning_rate": 3.3783783783783784e-05,
      "loss": 1.5622,
      "step": 50
    },
    {
      "epoch": 0.1858108108108108,
      "grad_norm": 0.6475837230682373,
      "learning_rate": 3.7162162162162165e-05,
      "loss": 1.5175,
      "step": 55
    },
    {
      "epoch": 0.20270270270270271,
      "grad_norm": 0.29574820399284363,
      "learning_rate": 4.0540540540540545e-05,
      "loss": 1.4953,
      "step": 60
    },
    {
      "epoch": 0.2195945945945946,
      "grad_norm": 0.31240248680114746,
      "learning_rate": 4.391891891891892e-05,
      "loss": 1.4493,
      "step": 65
    },
    {
      "epoch": 0.23648648648648649,
      "grad_norm": 0.2868952751159668,
      "learning_rate": 4.72972972972973e-05,
      "loss": 1.4327,
      "step": 70
    },
    {
      "epoch": 0.2533783783783784,
      "grad_norm": 0.3093927800655365,
      "learning_rate": 5.067567567567568e-05,
      "loss": 1.4149,
      "step": 75
    },
    {
      "epoch": 0.2702702702702703,
      "grad_norm": 0.37402284145355225,
      "learning_rate": 5.405405405405406e-05,
      "loss": 1.3976,
      "step": 80
    },
    {
      "epoch": 0.28716216216216217,
      "grad_norm": 0.34647682309150696,
      "learning_rate": 5.7432432432432434e-05,
      "loss": 1.3977,
      "step": 85
    },
    {
      "epoch": 0.30405405405405406,
      "grad_norm": 0.30035659670829773,
      "learning_rate": 6.0810810810810814e-05,
      "loss": 1.3589,
      "step": 90
    },
    {
      "epoch": 0.32094594594594594,
      "grad_norm": 0.33794718980789185,
      "learning_rate": 6.41891891891892e-05,
      "loss": 1.3818,
      "step": 95
    },
    {
      "epoch": 0.33783783783783783,
      "grad_norm": 0.40184497833251953,
      "learning_rate": 6.756756756756757e-05,
      "loss": 1.3577,
      "step": 100
    },
    {
      "epoch": 0.3547297297297297,
      "grad_norm": 0.32776907086372375,
      "learning_rate": 7.094594594594594e-05,
      "loss": 1.3408,
      "step": 105
    },
    {
      "epoch": 0.3716216216216216,
      "grad_norm": 0.32861512899398804,
      "learning_rate": 7.432432432432433e-05,
      "loss": 1.3036,
      "step": 110
    },
    {
      "epoch": 0.3885135135135135,
      "grad_norm": 0.3542137145996094,
      "learning_rate": 7.77027027027027e-05,
      "loss": 1.3261,
      "step": 115
    },
    {
      "epoch": 0.40540540540540543,
      "grad_norm": 0.3485589921474457,
      "learning_rate": 8.108108108108109e-05,
      "loss": 1.3107,
      "step": 120
    },
    {
      "epoch": 0.4222972972972973,
      "grad_norm": 0.3495419919490814,
      "learning_rate": 8.445945945945946e-05,
      "loss": 1.2784,
      "step": 125
    },
    {
      "epoch": 0.4391891891891892,
      "grad_norm": 0.3283160626888275,
      "learning_rate": 8.783783783783784e-05,
      "loss": 1.2816,
      "step": 130
    },
    {
      "epoch": 0.4560810810810811,
      "grad_norm": 0.331221342086792,
      "learning_rate": 9.121621621621623e-05,
      "loss": 1.2697,
      "step": 135
    },
    {
      "epoch": 0.47297297297297297,
      "grad_norm": 0.38272470235824585,
      "learning_rate": 9.45945945945946e-05,
      "loss": 1.2806,
      "step": 140
    },
    {
      "epoch": 0.48986486486486486,
      "grad_norm": 0.3326016962528229,
      "learning_rate": 9.797297297297297e-05,
      "loss": 1.2729,
      "step": 145
    },
    {
      "epoch": 0.5067567567567568,
      "grad_norm": 0.31695079803466797,
      "learning_rate": 0.00010135135135135136,
      "loss": 1.2657,
      "step": 150
    },
    {
      "epoch": 0.5236486486486487,
      "grad_norm": 0.40642571449279785,
      "learning_rate": 0.00010472972972972975,
      "loss": 1.2454,
      "step": 155
    },
    {
      "epoch": 0.5405405405405406,
      "grad_norm": 0.3561699688434601,
      "learning_rate": 0.00010810810810810812,
      "loss": 1.2414,
      "step": 160
    },
    {
      "epoch": 0.5574324324324325,
      "grad_norm": 0.30583736300468445,
      "learning_rate": 0.0001114864864864865,
      "loss": 1.2473,
      "step": 165
    },
    {
      "epoch": 0.5743243243243243,
      "grad_norm": 0.3610832393169403,
      "learning_rate": 0.00011486486486486487,
      "loss": 1.2487,
      "step": 170
    },
    {
      "epoch": 0.5912162162162162,
      "grad_norm": 0.33005717396736145,
      "learning_rate": 0.00011824324324324326,
      "loss": 1.2512,
      "step": 175
    },
    {
      "epoch": 0.6081081081081081,
      "grad_norm": 0.3080041706562042,
      "learning_rate": 0.00012162162162162163,
      "loss": 1.2544,
      "step": 180
    },
    {
      "epoch": 0.625,
      "grad_norm": 0.3453957736492157,
      "learning_rate": 0.000125,
      "loss": 1.2329,
      "step": 185
    },
    {
      "epoch": 0.6418918918918919,
      "grad_norm": 0.4040939211845398,
      "learning_rate": 0.0001283783783783784,
      "loss": 1.2356,
      "step": 190
    },
    {
      "epoch": 0.6587837837837838,
      "grad_norm": 0.39047908782958984,
      "learning_rate": 0.00013175675675675675,
      "loss": 1.2215,
      "step": 195
    },
    {
      "epoch": 0.6756756756756757,
      "grad_norm": 0.27441543340682983,
      "learning_rate": 0.00013513513513513514,
      "loss": 1.2374,
      "step": 200
    },
    {
      "epoch": 0.6925675675675675,
      "grad_norm": 0.26817697286605835,
      "learning_rate": 0.00013851351351351352,
      "loss": 1.2446,
      "step": 205
    },
    {
      "epoch": 0.7094594594594594,
      "grad_norm": 0.4692605435848236,
      "learning_rate": 0.00014189189189189188,
      "loss": 1.2369,
      "step": 210
    },
    {
      "epoch": 0.7263513513513513,
      "grad_norm": 0.47006717324256897,
      "learning_rate": 0.00014527027027027027,
      "loss": 1.2289,
      "step": 215
    },
    {
      "epoch": 0.7432432432432432,
      "grad_norm": 0.26643019914627075,
      "learning_rate": 0.00014864864864864866,
      "loss": 1.2272,
      "step": 220
    },
    {
      "epoch": 0.7601351351351351,
      "grad_norm": 0.27256107330322266,
      "learning_rate": 0.00015202702702702702,
      "loss": 1.2301,
      "step": 225
    },
    {
      "epoch": 0.777027027027027,
      "grad_norm": 0.2612285912036896,
      "learning_rate": 0.0001554054054054054,
      "loss": 1.2303,
      "step": 230
    },
    {
      "epoch": 0.793918918918919,
      "grad_norm": 0.2759920656681061,
      "learning_rate": 0.0001587837837837838,
      "loss": 1.2177,
      "step": 235
    },
    {
      "epoch": 0.8108108108108109,
      "grad_norm": 0.29133257269859314,
      "learning_rate": 0.00016216216216216218,
      "loss": 1.2174,
      "step": 240
    },
    {
      "epoch": 0.8277027027027027,
      "grad_norm": 0.3231314420700073,
      "learning_rate": 0.00016554054054054057,
      "loss": 1.2036,
      "step": 245
    },
    {
      "epoch": 0.8445945945945946,
      "grad_norm": 0.27160102128982544,
      "learning_rate": 0.00016891891891891893,
      "loss": 1.2302,
      "step": 250
    },
    {
      "epoch": 0.8614864864864865,
      "grad_norm": 0.29660171270370483,
      "learning_rate": 0.00017229729729729732,
      "loss": 1.2033,
      "step": 255
    },
    {
      "epoch": 0.8783783783783784,
      "grad_norm": 0.2654610276222229,
      "learning_rate": 0.00017567567567567568,
      "loss": 1.2012,
      "step": 260
    },
    {
      "epoch": 0.8952702702702703,
      "grad_norm": 0.28142857551574707,
      "learning_rate": 0.00017905405405405406,
      "loss": 1.2052,
      "step": 265
    },
    {
      "epoch": 0.9121621621621622,
      "grad_norm": 0.24720372259616852,
      "learning_rate": 0.00018243243243243245,
      "loss": 1.192,
      "step": 270
    },
    {
      "epoch": 0.9290540540540541,
      "grad_norm": 0.2735718786716461,
      "learning_rate": 0.0001858108108108108,
      "loss": 1.213,
      "step": 275
    },
    {
      "epoch": 0.9459459459459459,
      "grad_norm": 0.30433931946754456,
      "learning_rate": 0.0001891891891891892,
      "loss": 1.2059,
      "step": 280
    },
    {
      "epoch": 0.9628378378378378,
      "grad_norm": 0.3330329358577728,
      "learning_rate": 0.00019256756756756758,
      "loss": 1.206,
      "step": 285
    },
    {
      "epoch": 0.9797297297297297,
      "grad_norm": 0.27602413296699524,
      "learning_rate": 0.00019594594594594594,
      "loss": 1.2043,
      "step": 290
    },
    {
      "epoch": 0.9966216216216216,
      "grad_norm": 0.23838359117507935,
      "learning_rate": 0.00019932432432432433,
      "loss": 1.2062,
      "step": 295
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.6780701875686646,
      "eval_runtime": 0.3945,
      "eval_samples_per_second": 5.07,
      "eval_steps_per_second": 2.535,
      "step": 296
    },
    {
      "epoch": 1.0135135135135136,
      "grad_norm": 0.3066512644290924,
      "learning_rate": 0.00019999888744757143,
      "loss": 1.1826,
      "step": 300
    },
    {
      "epoch": 1.0304054054054055,
      "grad_norm": 0.42127561569213867,
      "learning_rate": 0.0001999943677457578,
      "loss": 1.1683,
      "step": 305
    },
    {
      "epoch": 1.0472972972972974,
      "grad_norm": 0.28215768933296204,
      "learning_rate": 0.000199986371517049,
      "loss": 1.1752,
      "step": 310
    },
    {
      "epoch": 1.0641891891891893,
      "grad_norm": 0.35595354437828064,
      "learning_rate": 0.0001999748990394517,
      "loss": 1.1515,
      "step": 315
    },
    {
      "epoch": 1.0810810810810811,
      "grad_norm": 0.23858019709587097,
      "learning_rate": 0.0001999599507118322,
      "loss": 1.1604,
      "step": 320
    },
    {
      "epoch": 1.097972972972973,
      "grad_norm": 0.2836330831050873,
      "learning_rate": 0.0001999415270539023,
      "loss": 1.1714,
      "step": 325
    },
    {
      "epoch": 1.114864864864865,
      "grad_norm": 0.28962427377700806,
      "learning_rate": 0.00019991962870620153,
      "loss": 1.1693,
      "step": 330
    },
    {
      "epoch": 1.1317567567567568,
      "grad_norm": 0.2537465989589691,
      "learning_rate": 0.00019989425643007476,
      "loss": 1.1537,
      "step": 335
    },
    {
      "epoch": 1.1486486486486487,
      "grad_norm": 0.23751677572727203,
      "learning_rate": 0.00019986541110764565,
      "loss": 1.1664,
      "step": 340
    },
    {
      "epoch": 1.1655405405405406,
      "grad_norm": 0.3039610981941223,
      "learning_rate": 0.0001998330937417861,
      "loss": 1.1607,
      "step": 345
    },
    {
      "epoch": 1.1824324324324325,
      "grad_norm": 0.22566653788089752,
      "learning_rate": 0.00019979730545608126,
      "loss": 1.1532,
      "step": 350
    },
    {
      "epoch": 1.1993243243243243,
      "grad_norm": 0.27842891216278076,
      "learning_rate": 0.00019975804749479062,
      "loss": 1.1589,
      "step": 355
    },
    {
      "epoch": 1.2162162162162162,
      "grad_norm": 0.2455698400735855,
      "learning_rate": 0.00019971532122280464,
      "loss": 1.1608,
      "step": 360
    },
    {
      "epoch": 1.2331081081081081,
      "grad_norm": 0.23679549992084503,
      "learning_rate": 0.00019966912812559732,
      "loss": 1.1691,
      "step": 365
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.22320061922073364,
      "learning_rate": 0.00019961946980917456,
      "loss": 1.1551,
      "step": 370
    },
    {
      "epoch": 1.2668918918918919,
      "grad_norm": 0.2794288992881775,
      "learning_rate": 0.00019956634800001832,
      "loss": 1.1667,
      "step": 375
    },
    {
      "epoch": 1.2837837837837838,
      "grad_norm": 0.2269154042005539,
      "learning_rate": 0.0001995097645450266,
      "loss": 1.1589,
      "step": 380
    },
    {
      "epoch": 1.3006756756756757,
      "grad_norm": 0.22751463949680328,
      "learning_rate": 0.00019944972141144928,
      "loss": 1.1522,
      "step": 385
    },
    {
      "epoch": 1.3175675675675675,
      "grad_norm": 0.2368728667497635,
      "learning_rate": 0.00019938622068681953,
      "loss": 1.1487,
      "step": 390
    },
    {
      "epoch": 1.3344594594594594,
      "grad_norm": 0.2409171611070633,
      "learning_rate": 0.00019931926457888156,
      "loss": 1.1575,
      "step": 395
    },
    {
      "epoch": 1.3513513513513513,
      "grad_norm": 0.24245265126228333,
      "learning_rate": 0.0001992488554155135,
      "loss": 1.1443,
      "step": 400
    },
    {
      "epoch": 1.3682432432432432,
      "grad_norm": 0.21953873336315155,
      "learning_rate": 0.0001991749956446468,
      "loss": 1.1578,
      "step": 405
    },
    {
      "epoch": 1.385135135135135,
      "grad_norm": 0.21402998268604279,
      "learning_rate": 0.00019909768783418086,
      "loss": 1.1655,
      "step": 410
    },
    {
      "epoch": 1.402027027027027,
      "grad_norm": 0.22115997970104218,
      "learning_rate": 0.00019901693467189386,
      "loss": 1.1515,
      "step": 415
    },
    {
      "epoch": 1.4189189189189189,
      "grad_norm": 0.2362441122531891,
      "learning_rate": 0.00019893273896534936,
      "loss": 1.1579,
      "step": 420
    },
    {
      "epoch": 1.4358108108108107,
      "grad_norm": 0.2779642641544342,
      "learning_rate": 0.0001988451036417986,
      "loss": 1.1518,
      "step": 425
    },
    {
      "epoch": 1.4527027027027026,
      "grad_norm": 0.22553110122680664,
      "learning_rate": 0.00019875403174807882,
      "loss": 1.1722,
      "step": 430
    },
    {
      "epoch": 1.4695945945945945,
      "grad_norm": 0.22423289716243744,
      "learning_rate": 0.0001986595264505072,
      "loss": 1.1628,
      "step": 435
    },
    {
      "epoch": 1.4864864864864864,
      "grad_norm": 0.23659999668598175,
      "learning_rate": 0.00019856159103477086,
      "loss": 1.1442,
      "step": 440
    },
    {
      "epoch": 1.5033783783783785,
      "grad_norm": 0.23966625332832336,
      "learning_rate": 0.00019846022890581267,
      "loss": 1.1486,
      "step": 445
    },
    {
      "epoch": 1.5202702702702702,
      "grad_norm": 0.2399033010005951,
      "learning_rate": 0.0001983554435877128,
      "loss": 1.144,
      "step": 450
    },
    {
      "epoch": 1.5371621621621623,
      "grad_norm": 0.2575773000717163,
      "learning_rate": 0.0001982472387235662,
      "loss": 1.1693,
      "step": 455
    },
    {
      "epoch": 1.554054054054054,
      "grad_norm": 0.23619942367076874,
      "learning_rate": 0.00019813561807535598,
      "loss": 1.1494,
      "step": 460
    },
    {
      "epoch": 1.570945945945946,
      "grad_norm": 0.24643085896968842,
      "learning_rate": 0.0001980205855238225,
      "loss": 1.1543,
      "step": 465
    },
    {
      "epoch": 1.5878378378378377,
      "grad_norm": 0.2060076743364334,
      "learning_rate": 0.00019790214506832868,
      "loss": 1.1597,
      "step": 470
    },
    {
      "epoch": 1.6047297297297298,
      "grad_norm": 0.20906926691532135,
      "learning_rate": 0.00019778030082672068,
      "loss": 1.1393,
      "step": 475
    },
    {
      "epoch": 1.6216216216216215,
      "grad_norm": 0.21041174232959747,
      "learning_rate": 0.00019765505703518496,
      "loss": 1.1519,
      "step": 480
    },
    {
      "epoch": 1.6385135135135136,
      "grad_norm": 0.21494755148887634,
      "learning_rate": 0.00019752641804810084,
      "loss": 1.1497,
      "step": 485
    },
    {
      "epoch": 1.6554054054054053,
      "grad_norm": 0.21202711760997772,
      "learning_rate": 0.0001973943883378892,
      "loss": 1.1579,
      "step": 490
    },
    {
      "epoch": 1.6722972972972974,
      "grad_norm": 0.20677632093429565,
      "learning_rate": 0.00019725897249485704,
      "loss": 1.1473,
      "step": 495
    },
    {
      "epoch": 1.689189189189189,
      "grad_norm": 0.2177901715040207,
      "learning_rate": 0.00019712017522703764,
      "loss": 1.154,
      "step": 500
    },
    {
      "epoch": 1.7060810810810811,
      "grad_norm": 0.212003692984581,
      "learning_rate": 0.0001969780013600272,
      "loss": 1.1608,
      "step": 505
    },
    {
      "epoch": 1.722972972972973,
      "grad_norm": 0.21401935815811157,
      "learning_rate": 0.00019683245583681675,
      "loss": 1.1619,
      "step": 510
    },
    {
      "epoch": 1.739864864864865,
      "grad_norm": 0.22224700450897217,
      "learning_rate": 0.00019668354371762066,
      "loss": 1.1565,
      "step": 515
    },
    {
      "epoch": 1.7567567567567568,
      "grad_norm": 0.2198743373155594,
      "learning_rate": 0.00019653127017970034,
      "loss": 1.148,
      "step": 520
    },
    {
      "epoch": 1.7736486486486487,
      "grad_norm": 0.21117670834064484,
      "learning_rate": 0.0001963756405171845,
      "loss": 1.1567,
      "step": 525
    },
    {
      "epoch": 1.7905405405405406,
      "grad_norm": 0.23106643557548523,
      "learning_rate": 0.00019621666014088494,
      "loss": 1.1417,
      "step": 530
    },
    {
      "epoch": 1.8074324324324325,
      "grad_norm": 0.20598255097866058,
      "learning_rate": 0.00019605433457810855,
      "loss": 1.1491,
      "step": 535
    },
    {
      "epoch": 1.8243243243243243,
      "grad_norm": 0.2185199111700058,
      "learning_rate": 0.00019588866947246498,
      "loss": 1.1474,
      "step": 540
    },
    {
      "epoch": 1.8412162162162162,
      "grad_norm": 0.21996720135211945,
      "learning_rate": 0.00019571967058367064,
      "loss": 1.1574,
      "step": 545
    },
    {
      "epoch": 1.8581081081081081,
      "grad_norm": 0.205213725566864,
      "learning_rate": 0.00019554734378734824,
      "loss": 1.1596,
      "step": 550
    },
    {
      "epoch": 1.875,
      "grad_norm": 0.19933567941188812,
      "learning_rate": 0.0001953716950748227,
      "loss": 1.1466,
      "step": 555
    },
    {
      "epoch": 1.8918918918918919,
      "grad_norm": 0.19704587757587433,
      "learning_rate": 0.00019519273055291266,
      "loss": 1.1399,
      "step": 560
    },
    {
      "epoch": 1.9087837837837838,
      "grad_norm": 0.20990757644176483,
      "learning_rate": 0.00019501045644371832,
      "loss": 1.1363,
      "step": 565
    },
    {
      "epoch": 1.9256756756756757,
      "grad_norm": 0.2083408534526825,
      "learning_rate": 0.000194824879084405,
      "loss": 1.1446,
      "step": 570
    },
    {
      "epoch": 1.9425675675675675,
      "grad_norm": 0.2556820213794708,
      "learning_rate": 0.00019463600492698296,
      "loss": 1.1372,
      "step": 575
    },
    {
      "epoch": 1.9594594594594594,
      "grad_norm": 0.20939995348453522,
      "learning_rate": 0.00019444384053808288,
      "loss": 1.1421,
      "step": 580
    },
    {
      "epoch": 1.9763513513513513,
      "grad_norm": 0.2339630275964737,
      "learning_rate": 0.00019424839259872778,
      "loss": 1.1421,
      "step": 585
    },
    {
      "epoch": 1.9932432432432432,
      "grad_norm": 0.3135931193828583,
      "learning_rate": 0.00019404966790410047,
      "loss": 1.1339,
      "step": 590
    },
    {
      "epoch": 2.0,
      "eval_loss": 1.6897428035736084,
      "eval_runtime": 0.3945,
      "eval_samples_per_second": 5.07,
      "eval_steps_per_second": 2.535,
      "step": 592
    },
    {
      "epoch": 2.010135135135135,
      "grad_norm": 0.2158200591802597,
      "learning_rate": 0.0001938476733633076,
      "loss": 1.0977,
      "step": 595
    },
    {
      "epoch": 2.027027027027027,
      "grad_norm": 0.22781264781951904,
      "learning_rate": 0.00019364241599913924,
      "loss": 1.0711,
      "step": 600
    },
    {
      "epoch": 2.043918918918919,
      "grad_norm": 0.24521173536777496,
      "learning_rate": 0.0001934339029478248,
      "loss": 1.0767,
      "step": 605
    },
    {
      "epoch": 2.060810810810811,
      "grad_norm": 0.21851304173469543,
      "learning_rate": 0.00019322214145878487,
      "loss": 1.0549,
      "step": 610
    },
    {
      "epoch": 2.0777027027027026,
      "grad_norm": 0.21393460035324097,
      "learning_rate": 0.00019300713889437926,
      "loss": 1.068,
      "step": 615
    },
    {
      "epoch": 2.0945945945945947,
      "grad_norm": 0.23508517444133759,
      "learning_rate": 0.00019278890272965096,
      "loss": 1.0776,
      "step": 620
    },
    {
      "epoch": 2.1114864864864864,
      "grad_norm": 0.2709183990955353,
      "learning_rate": 0.00019256744055206622,
      "loss": 1.0867,
      "step": 625
    },
    {
      "epoch": 2.1283783783783785,
      "grad_norm": 0.22891944646835327,
      "learning_rate": 0.000192342760061251,
      "loss": 1.0719,
      "step": 630
    },
    {
      "epoch": 2.14527027027027,
      "grad_norm": 0.24709245562553406,
      "learning_rate": 0.0001921148690687228,
      "loss": 1.0687,
      "step": 635
    },
    {
      "epoch": 2.1621621621621623,
      "grad_norm": 0.2254343330860138,
      "learning_rate": 0.00019188377549761963,
      "loss": 1.0687,
      "step": 640
    },
    {
      "epoch": 2.179054054054054,
      "grad_norm": 0.22168201208114624,
      "learning_rate": 0.00019164948738242409,
      "loss": 1.0765,
      "step": 645
    },
    {
      "epoch": 2.195945945945946,
      "grad_norm": 0.23680733144283295,
      "learning_rate": 0.00019141201286868435,
      "loss": 1.0741,
      "step": 650
    },
    {
      "epoch": 2.2128378378378377,
      "grad_norm": 0.23159544169902802,
      "learning_rate": 0.00019117136021273075,
      "loss": 1.0795,
      "step": 655
    },
    {
      "epoch": 2.22972972972973,
      "grad_norm": 0.23217150568962097,
      "learning_rate": 0.00019092753778138886,
      "loss": 1.0947,
      "step": 660
    },
    {
      "epoch": 2.2466216216216215,
      "grad_norm": 0.22594888508319855,
      "learning_rate": 0.0001906805540516885,
      "loss": 1.059,
      "step": 665
    },
    {
      "epoch": 2.2635135135135136,
      "grad_norm": 0.23356075584888458,
      "learning_rate": 0.00019043041761056907,
      "loss": 1.084,
      "step": 670
    },
    {
      "epoch": 2.2804054054054053,
      "grad_norm": 0.21952542662620544,
      "learning_rate": 0.0001901771371545811,
      "loss": 1.0807,
      "step": 675
    },
    {
      "epoch": 2.2972972972972974,
      "grad_norm": 0.21846647560596466,
      "learning_rate": 0.00018992072148958368,
      "loss": 1.0878,
      "step": 680
    },
    {
      "epoch": 2.314189189189189,
      "grad_norm": 0.23093639314174652,
      "learning_rate": 0.00018966117953043852,
      "loss": 1.074,
      "step": 685
    },
    {
      "epoch": 2.331081081081081,
      "grad_norm": 0.224954292178154,
      "learning_rate": 0.00018939852030069981,
      "loss": 1.0784,
      "step": 690
    },
    {
      "epoch": 2.347972972972973,
      "grad_norm": 0.2606515884399414,
      "learning_rate": 0.00018913275293230069,
      "loss": 1.0757,
      "step": 695
    },
    {
      "epoch": 2.364864864864865,
      "grad_norm": 0.2542010247707367,
      "learning_rate": 0.0001888638866652356,
      "loss": 1.0705,
      "step": 700
    },
    {
      "epoch": 2.3817567567567566,
      "grad_norm": 0.2348444014787674,
      "learning_rate": 0.00018859193084723913,
      "loss": 1.0857,
      "step": 705
    },
    {
      "epoch": 2.3986486486486487,
      "grad_norm": 0.2732667922973633,
      "learning_rate": 0.00018831689493346095,
      "loss": 1.073,
      "step": 710
    },
    {
      "epoch": 2.4155405405405403,
      "grad_norm": 0.24476487934589386,
      "learning_rate": 0.00018803878848613716,
      "loss": 1.0862,
      "step": 715
    },
    {
      "epoch": 2.4324324324324325,
      "grad_norm": 0.25073671340942383,
      "learning_rate": 0.00018775762117425777,
      "loss": 1.0699,
      "step": 720
    },
    {
      "epoch": 2.4493243243243246,
      "grad_norm": 0.23084624111652374,
      "learning_rate": 0.0001874734027732306,
      "loss": 1.0827,
      "step": 725
    },
    {
      "epoch": 2.4662162162162162,
      "grad_norm": 0.2258080244064331,
      "learning_rate": 0.00018718614316454133,
      "loss": 1.088,
      "step": 730
    },
    {
      "epoch": 2.483108108108108,
      "grad_norm": 0.23056402802467346,
      "learning_rate": 0.00018689585233541003,
      "loss": 1.0698,
      "step": 735
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.22269397974014282,
      "learning_rate": 0.00018660254037844388,
      "loss": 1.0666,
      "step": 740
    },
    {
      "epoch": 2.516891891891892,
      "grad_norm": 0.21295320987701416,
      "learning_rate": 0.0001863062174912863,
      "loss": 1.0781,
      "step": 745
    },
    {
      "epoch": 2.5337837837837838,
      "grad_norm": 0.21225321292877197,
      "learning_rate": 0.00018600689397626246,
      "loss": 1.0724,
      "step": 750
    },
    {
      "epoch": 2.5506756756756754,
      "grad_norm": 0.22661367058753967,
      "learning_rate": 0.00018570458024002093,
      "loss": 1.0792,
      "step": 755
    },
    {
      "epoch": 2.5675675675675675,
      "grad_norm": 0.22279423475265503,
      "learning_rate": 0.0001853992867931721,
      "loss": 1.082,
      "step": 760
    },
    {
      "epoch": 2.5844594594594597,
      "grad_norm": 0.22243249416351318,
      "learning_rate": 0.0001850910242499225,
      "loss": 1.0662,
      "step": 765
    },
    {
      "epoch": 2.6013513513513513,
      "grad_norm": 0.22147369384765625,
      "learning_rate": 0.00018477980332770607,
      "loss": 1.0718,
      "step": 770
    },
    {
      "epoch": 2.618243243243243,
      "grad_norm": 0.2354060411453247,
      "learning_rate": 0.00018446563484681127,
      "loss": 1.09,
      "step": 775
    },
    {
      "epoch": 2.635135135135135,
      "grad_norm": 0.24088838696479797,
      "learning_rate": 0.00018414852973000503,
      "loss": 1.0897,
      "step": 780
    },
    {
      "epoch": 2.652027027027027,
      "grad_norm": 0.2794990539550781,
      "learning_rate": 0.00018382849900215294,
      "loss": 1.0804,
      "step": 785
    },
    {
      "epoch": 2.668918918918919,
      "grad_norm": 0.25418001413345337,
      "learning_rate": 0.00018350555378983608,
      "loss": 1.0729,
      "step": 790
    },
    {
      "epoch": 2.685810810810811,
      "grad_norm": 0.2769224941730499,
      "learning_rate": 0.0001831797053209639,
      "loss": 1.0812,
      "step": 795
    },
    {
      "epoch": 2.7027027027027026,
      "grad_norm": 0.2639266550540924,
      "learning_rate": 0.00018285096492438424,
      "loss": 1.0841,
      "step": 800
    },
    {
      "epoch": 2.7195945945945947,
      "grad_norm": 0.21467705070972443,
      "learning_rate": 0.000182519344029489,
      "loss": 1.0852,
      "step": 805
    },
    {
      "epoch": 2.7364864864864864,
      "grad_norm": 0.22124196588993073,
      "learning_rate": 0.00018218485416581726,
      "loss": 1.0849,
      "step": 810
    },
    {
      "epoch": 2.7533783783783785,
      "grad_norm": 0.21145068109035492,
      "learning_rate": 0.00018184750696265408,
      "loss": 1.0706,
      "step": 815
    },
    {
      "epoch": 2.77027027027027,
      "grad_norm": 0.22575508058071136,
      "learning_rate": 0.00018150731414862622,
      "loss": 1.0737,
      "step": 820
    },
    {
      "epoch": 2.7871621621621623,
      "grad_norm": 0.22897441685199738,
      "learning_rate": 0.00018116428755129459,
      "loss": 1.076,
      "step": 825
    },
    {
      "epoch": 2.804054054054054,
      "grad_norm": 0.224187970161438,
      "learning_rate": 0.00018081843909674276,
      "loss": 1.075,
      "step": 830
    },
    {
      "epoch": 2.820945945945946,
      "grad_norm": 0.22817817330360413,
      "learning_rate": 0.00018046978080916252,
      "loss": 1.0802,
      "step": 835
    },
    {
      "epoch": 2.8378378378378377,
      "grad_norm": 0.23358392715454102,
      "learning_rate": 0.00018011832481043576,
      "loss": 1.073,
      "step": 840
    },
    {
      "epoch": 2.85472972972973,
      "grad_norm": 0.2256878912448883,
      "learning_rate": 0.00017976408331971298,
      "loss": 1.0712,
      "step": 845
    },
    {
      "epoch": 2.8716216216216215,
      "grad_norm": 0.2276696115732193,
      "learning_rate": 0.0001794070686529886,
      "loss": 1.0888,
      "step": 850
    },
    {
      "epoch": 2.8885135135135136,
      "grad_norm": 0.2123207151889801,
      "learning_rate": 0.00017904729322267256,
      "loss": 1.0856,
      "step": 855
    },
    {
      "epoch": 2.9054054054054053,
      "grad_norm": 0.2253648340702057,
      "learning_rate": 0.000178684769537159,
      "loss": 1.0769,
      "step": 860
    },
    {
      "epoch": 2.9222972972972974,
      "grad_norm": 0.23328694701194763,
      "learning_rate": 0.00017831951020039126,
      "loss": 1.0805,
      "step": 865
    },
    {
      "epoch": 2.939189189189189,
      "grad_norm": 0.2189178615808487,
      "learning_rate": 0.0001779515279114236,
      "loss": 1.083,
      "step": 870
    },
    {
      "epoch": 2.956081081081081,
      "grad_norm": 0.21634751558303833,
      "learning_rate": 0.0001775808354639799,
      "loss": 1.0777,
      "step": 875
    },
    {
      "epoch": 2.972972972972973,
      "grad_norm": 0.22920973598957062,
      "learning_rate": 0.00017720744574600863,
      "loss": 1.0622,
      "step": 880
    },
    {
      "epoch": 2.989864864864865,
      "grad_norm": 0.23738548159599304,
      "learning_rate": 0.00017683137173923495,
      "loss": 1.0779,
      "step": 885
    },
    {
      "epoch": 3.0,
      "eval_loss": 1.7535914182662964,
      "eval_runtime": 0.3941,
      "eval_samples_per_second": 5.075,
      "eval_steps_per_second": 2.537,
      "step": 888
    },
    {
      "epoch": 3.0067567567567566,
      "grad_norm": 0.22737225890159607,
      "learning_rate": 0.00017645262651870926,
      "loss": 1.0427,
      "step": 890
    },
    {
      "epoch": 3.0236486486486487,
      "grad_norm": 0.2775098383426666,
      "learning_rate": 0.00017607122325235267,
      "loss": 0.9853,
      "step": 895
    },
    {
      "epoch": 3.0405405405405403,
      "grad_norm": 0.2837352752685547,
      "learning_rate": 0.0001756871752004992,
      "loss": 0.9753,
      "step": 900
    },
    {
      "epoch": 3.0574324324324325,
      "grad_norm": 0.25329145789146423,
      "learning_rate": 0.00017530049571543464,
      "loss": 0.9845,
      "step": 905
    },
    {
      "epoch": 3.074324324324324,
      "grad_norm": 0.2581470310688019,
      "learning_rate": 0.0001749111982409325,
      "loss": 0.974,
      "step": 910
    },
    {
      "epoch": 3.0912162162162162,
      "grad_norm": 0.2744286358356476,
      "learning_rate": 0.00017451929631178648,
      "loss": 0.9777,
      "step": 915
    },
    {
      "epoch": 3.108108108108108,
      "grad_norm": 0.2783578038215637,
      "learning_rate": 0.00017412480355334005,
      "loss": 0.9883,
      "step": 920
    },
    {
      "epoch": 3.125,
      "grad_norm": 0.27584517002105713,
      "learning_rate": 0.0001737277336810124,
      "loss": 0.98,
      "step": 925
    },
    {
      "epoch": 3.141891891891892,
      "grad_norm": 0.26467305421829224,
      "learning_rate": 0.00017332810049982208,
      "loss": 0.9956,
      "step": 930
    },
    {
      "epoch": 3.1587837837837838,
      "grad_norm": 0.25240039825439453,
      "learning_rate": 0.00017292591790390665,
      "loss": 0.9933,
      "step": 935
    },
    {
      "epoch": 3.175675675675676,
      "grad_norm": 0.24769380688667297,
      "learning_rate": 0.00017252119987603973,
      "loss": 0.9742,
      "step": 940
    },
    {
      "epoch": 3.1925675675675675,
      "grad_norm": 0.27298596501350403,
      "learning_rate": 0.00017211396048714498,
      "loss": 0.9866,
      "step": 945
    },
    {
      "epoch": 3.2094594594594597,
      "grad_norm": 0.2657850682735443,
      "learning_rate": 0.00017170421389580667,
      "loss": 0.99,
      "step": 950
    },
    {
      "epoch": 3.2263513513513513,
      "grad_norm": 0.23783531785011292,
      "learning_rate": 0.00017129197434777763,
      "loss": 0.9891,
      "step": 955
    },
    {
      "epoch": 3.2432432432432434,
      "grad_norm": 0.24934813380241394,
      "learning_rate": 0.00017087725617548385,
      "loss": 0.9986,
      "step": 960
    },
    {
      "epoch": 3.260135135135135,
      "grad_norm": 0.265461802482605,
      "learning_rate": 0.0001704600737975262,
      "loss": 0.977,
      "step": 965
    },
    {
      "epoch": 3.277027027027027,
      "grad_norm": 0.26984909176826477,
      "learning_rate": 0.00017004044171817925,
      "loss": 1.0041,
      "step": 970
    },
    {
      "epoch": 3.293918918918919,
      "grad_norm": 0.26064538955688477,
      "learning_rate": 0.00016961837452688676,
      "loss": 1.0007,
      "step": 975
    },
    {
      "epoch": 3.310810810810811,
      "grad_norm": 0.253579705953598,
      "learning_rate": 0.00016919388689775464,
      "loss": 1.0069,
      "step": 980
    },
    {
      "epoch": 3.3277027027027026,
      "grad_norm": 0.26410114765167236,
      "learning_rate": 0.00016876699358904068,
      "loss": 1.004,
      "step": 985
    },
    {
      "epoch": 3.3445945945945947,
      "grad_norm": 0.2758503556251526,
      "learning_rate": 0.00016833770944264153,
      "loss": 1.0048,
      "step": 990
    },
    {
      "epoch": 3.3614864864864864,
      "grad_norm": 0.2595711648464203,
      "learning_rate": 0.00016790604938357663,
      "loss": 0.9929,
      "step": 995
    },
    {
      "epoch": 3.3783783783783785,
      "grad_norm": 0.26039746403694153,
      "learning_rate": 0.00016747202841946928,
      "loss": 1.0006,
      "step": 1000
    },
    {
      "epoch": 3.39527027027027,
      "grad_norm": 0.25514382123947144,
      "learning_rate": 0.0001670356616400249,
      "loss": 1.012,
      "step": 1005
    },
    {
      "epoch": 3.4121621621621623,
      "grad_norm": 0.26591041684150696,
      "learning_rate": 0.00016659696421650645,
      "loss": 1.0039,
      "step": 1010
    },
    {
      "epoch": 3.429054054054054,
      "grad_norm": 0.26443612575531006,
      "learning_rate": 0.00016615595140120686,
      "loss": 0.9982,
      "step": 1015
    },
    {
      "epoch": 3.445945945945946,
      "grad_norm": 0.2647687792778015,
      "learning_rate": 0.00016571263852691888,
      "loss": 1.0028,
      "step": 1020
    },
    {
      "epoch": 3.4628378378378377,
      "grad_norm": 0.2620026767253876,
      "learning_rate": 0.0001652670410064019,
      "loss": 0.9951,
      "step": 1025
    },
    {
      "epoch": 3.47972972972973,
      "grad_norm": 0.2619130313396454,
      "learning_rate": 0.00016481917433184607,
      "loss": 0.9882,
      "step": 1030
    },
    {
      "epoch": 3.4966216216216215,
      "grad_norm": 0.24988499283790588,
      "learning_rate": 0.0001643690540743339,
      "loss": 0.9958,
      "step": 1035
    },
    {
      "epoch": 3.5135135135135136,
      "grad_norm": 0.2864786982536316,
      "learning_rate": 0.0001639166958832985,
      "loss": 1.0017,
      "step": 1040
    },
    {
      "epoch": 3.5304054054054053,
      "grad_norm": 0.2665320038795471,
      "learning_rate": 0.00016346211548597995,
      "loss": 0.9994,
      "step": 1045
    },
    {
      "epoch": 3.5472972972972974,
      "grad_norm": 0.2629227936267853,
      "learning_rate": 0.00016300532868687806,
      "loss": 1.007,
      "step": 1050
    },
    {
      "epoch": 3.564189189189189,
      "grad_norm": 0.25602978467941284,
      "learning_rate": 0.00016254635136720328,
      "loss": 1.0057,
      "step": 1055
    },
    {
      "epoch": 3.581081081081081,
      "grad_norm": 0.2551196813583374,
      "learning_rate": 0.0001620851994843244,
      "loss": 0.9972,
      "step": 1060
    },
    {
      "epoch": 3.597972972972973,
      "grad_norm": 0.27250906825065613,
      "learning_rate": 0.00016162188907121354,
      "loss": 1.0075,
      "step": 1065
    },
    {
      "epoch": 3.614864864864865,
      "grad_norm": 0.2675882577896118,
      "learning_rate": 0.00016115643623588915,
      "loss": 1.0103,
      "step": 1070
    },
    {
      "epoch": 3.631756756756757,
      "grad_norm": 0.2731866240501404,
      "learning_rate": 0.00016068885716085567,
      "loss": 1.0016,
      "step": 1075
    },
    {
      "epoch": 3.6486486486486487,
      "grad_norm": 0.249202698469162,
      "learning_rate": 0.00016021916810254097,
      "loss": 1.0086,
      "step": 1080
    },
    {
      "epoch": 3.6655405405405403,
      "grad_norm": 0.2600172460079193,
      "learning_rate": 0.00015974738539073125,
      "loss": 1.0032,
      "step": 1085
    },
    {
      "epoch": 3.6824324324324325,
      "grad_norm": 0.2564319372177124,
      "learning_rate": 0.00015927352542800317,
      "loss": 1.0087,
      "step": 1090
    },
    {
      "epoch": 3.6993243243243246,
      "grad_norm": 0.25873422622680664,
      "learning_rate": 0.00015879760468915372,
      "loss": 1.0006,
      "step": 1095
    },
    {
      "epoch": 3.7162162162162162,
      "grad_norm": 0.2660174071788788,
      "learning_rate": 0.00015831963972062733,
      "loss": 0.988,
      "step": 1100
    },
    {
      "epoch": 3.733108108108108,
      "grad_norm": 0.26095345616340637,
      "learning_rate": 0.0001578396471399406,
      "loss": 1.0109,
      "step": 1105
    },
    {
      "epoch": 3.75,
      "grad_norm": 0.2525663673877716,
      "learning_rate": 0.0001573576436351046,
      "loss": 1.001,
      "step": 1110
    },
    {
      "epoch": 3.766891891891892,
      "grad_norm": 0.2541150152683258,
      "learning_rate": 0.0001568736459640447,
      "loss": 0.9995,
      "step": 1115
    },
    {
      "epoch": 3.7837837837837838,
      "grad_norm": 0.2548198997974396,
      "learning_rate": 0.0001563876709540178,
      "loss": 1.007,
      "step": 1120
    },
    {
      "epoch": 3.8006756756756754,
      "grad_norm": 0.26351451873779297,
      "learning_rate": 0.00015589973550102747,
      "loss": 1.0056,
      "step": 1125
    },
    {
      "epoch": 3.8175675675675675,
      "grad_norm": 0.2661518454551697,
      "learning_rate": 0.00015540985656923645,
      "loss": 1.0159,
      "step": 1130
    },
    {
      "epoch": 3.8344594594594597,
      "grad_norm": 0.2599773406982422,
      "learning_rate": 0.00015491805119037684,
      "loss": 1.0102,
      "step": 1135
    },
    {
      "epoch": 3.8513513513513513,
      "grad_norm": 0.2605207562446594,
      "learning_rate": 0.0001544243364631579,
      "loss": 1.009,
      "step": 1140
    },
    {
      "epoch": 3.868243243243243,
      "grad_norm": 0.2640506625175476,
      "learning_rate": 0.00015392872955267175,
      "loss": 1.0125,
      "step": 1145
    },
    {
      "epoch": 3.885135135135135,
      "grad_norm": 0.29407069087028503,
      "learning_rate": 0.00015343124768979637,
      "loss": 1.0107,
      "step": 1150
    },
    {
      "epoch": 3.902027027027027,
      "grad_norm": 0.2638514041900635,
      "learning_rate": 0.00015293190817059667,
      "loss": 1.0046,
      "step": 1155
    },
    {
      "epoch": 3.918918918918919,
      "grad_norm": 0.26569753885269165,
      "learning_rate": 0.00015243072835572318,
      "loss": 0.9985,
      "step": 1160
    },
    {
      "epoch": 3.935810810810811,
      "grad_norm": 0.24786274135112762,
      "learning_rate": 0.0001519277256698083,
      "loss": 1.0086,
      "step": 1165
    },
    {
      "epoch": 3.9527027027027026,
      "grad_norm": 0.27254632115364075,
      "learning_rate": 0.0001514229176008607,
      "loss": 1.0048,
      "step": 1170
    },
    {
      "epoch": 3.9695945945945947,
      "grad_norm": 0.26518264412879944,
      "learning_rate": 0.0001509163216996572,
      "loss": 1.0014,
      "step": 1175
    },
    {
      "epoch": 3.9864864864864864,
      "grad_norm": 0.24938583374023438,
      "learning_rate": 0.00015040795557913245,
      "loss": 1.0043,
      "step": 1180
    },
    {
      "epoch": 4.0,
      "eval_loss": 1.8225109577178955,
      "eval_runtime": 0.3942,
      "eval_samples_per_second": 5.073,
      "eval_steps_per_second": 2.537,
      "step": 1184
    },
    {
      "epoch": 4.003378378378378,
      "grad_norm": 0.41594985127449036,
      "learning_rate": 0.00014989783691376696,
      "loss": 0.9886,
      "step": 1185
    },
    {
      "epoch": 4.02027027027027,
      "grad_norm": 0.332119345664978,
      "learning_rate": 0.00014938598343897214,
      "loss": 0.8971,
      "step": 1190
    },
    {
      "epoch": 4.037162162162162,
      "grad_norm": 0.2723919749259949,
      "learning_rate": 0.000148872412950474,
      "loss": 0.9054,
      "step": 1195
    },
    {
      "epoch": 4.054054054054054,
      "grad_norm": 0.3006138801574707,
      "learning_rate": 0.00014835714330369446,
      "loss": 0.8955,
      "step": 1200
    },
    {
      "epoch": 4.070945945945946,
      "grad_norm": 0.3039803206920624,
      "learning_rate": 0.00014784019241313026,
      "loss": 0.8937,
      "step": 1205
    },
    {
      "epoch": 4.087837837837838,
      "grad_norm": 0.2896163761615753,
      "learning_rate": 0.00014732157825173044,
      "loss": 0.8998,
      "step": 1210
    },
    {
      "epoch": 4.10472972972973,
      "grad_norm": 0.2962886095046997,
      "learning_rate": 0.00014680131885027141,
      "loss": 0.9087,
      "step": 1215
    },
    {
      "epoch": 4.121621621621622,
      "grad_norm": 0.2953561246395111,
      "learning_rate": 0.0001462794322967299,
      "loss": 0.9078,
      "step": 1220
    },
    {
      "epoch": 4.138513513513513,
      "grad_norm": 0.2991558015346527,
      "learning_rate": 0.00014575593673565426,
      "loss": 0.9004,
      "step": 1225
    },
    {
      "epoch": 4.155405405405405,
      "grad_norm": 0.32434654235839844,
      "learning_rate": 0.00014523085036753354,
      "loss": 0.8972,
      "step": 1230
    },
    {
      "epoch": 4.172297297297297,
      "grad_norm": 0.29733654856681824,
      "learning_rate": 0.00014470419144816483,
      "loss": 0.905,
      "step": 1235
    },
    {
      "epoch": 4.1891891891891895,
      "grad_norm": 0.2878667116165161,
      "learning_rate": 0.00014417597828801832,
      "loss": 0.9037,
      "step": 1240
    },
    {
      "epoch": 4.206081081081081,
      "grad_norm": 0.3089180886745453,
      "learning_rate": 0.00014364622925160098,
      "loss": 0.9004,
      "step": 1245
    },
    {
      "epoch": 4.222972972972973,
      "grad_norm": 0.29691433906555176,
      "learning_rate": 0.00014311496275681783,
      "loss": 0.9105,
      "step": 1250
    },
    {
      "epoch": 4.239864864864865,
      "grad_norm": 0.31907522678375244,
      "learning_rate": 0.0001425821972743318,
      "loss": 0.9051,
      "step": 1255
    },
    {
      "epoch": 4.256756756756757,
      "grad_norm": 0.3177861273288727,
      "learning_rate": 0.00014204795132692144,
      "loss": 0.9059,
      "step": 1260
    },
    {
      "epoch": 4.273648648648648,
      "grad_norm": 0.3413095474243164,
      "learning_rate": 0.00014151224348883692,
      "loss": 0.9068,
      "step": 1265
    },
    {
      "epoch": 4.29054054054054,
      "grad_norm": 0.31278854608535767,
      "learning_rate": 0.00014097509238515432,
      "loss": 0.9178,
      "step": 1270
    },
    {
      "epoch": 4.3074324324324325,
      "grad_norm": 0.3215930461883545,
      "learning_rate": 0.00014043651669112808,
      "loss": 0.9048,
      "step": 1275
    },
    {
      "epoch": 4.324324324324325,
      "grad_norm": 0.32147011160850525,
      "learning_rate": 0.00013989653513154165,
      "loss": 0.9182,
      "step": 1280
    },
    {
      "epoch": 4.341216216216216,
      "grad_norm": 0.30455154180526733,
      "learning_rate": 0.0001393551664800566,
      "loss": 0.9159,
      "step": 1285
    },
    {
      "epoch": 4.358108108108108,
      "grad_norm": 0.310214638710022,
      "learning_rate": 0.00013881242955855974,
      "loss": 0.9157,
      "step": 1290
    },
    {
      "epoch": 4.375,
      "grad_norm": 0.3040444254875183,
      "learning_rate": 0.000138268343236509,
      "loss": 0.9136,
      "step": 1295
    },
    {
      "epoch": 4.391891891891892,
      "grad_norm": 0.32138949632644653,
      "learning_rate": 0.000137722926430277,
      "loss": 0.9198,
      "step": 1300
    },
    {
      "epoch": 4.408783783783784,
      "grad_norm": 0.3029273748397827,
      "learning_rate": 0.00013717619810249378,
      "loss": 0.9207,
      "step": 1305
    },
    {
      "epoch": 4.425675675675675,
      "grad_norm": 0.3084327280521393,
      "learning_rate": 0.00013662817726138728,
      "loss": 0.9128,
      "step": 1310
    },
    {
      "epoch": 4.4425675675675675,
      "grad_norm": 0.2980863153934479,
      "learning_rate": 0.00013607888296012259,
      "loss": 0.919,
      "step": 1315
    },
    {
      "epoch": 4.45945945945946,
      "grad_norm": 0.3012111186981201,
      "learning_rate": 0.00013552833429613938,
      "loss": 0.913,
      "step": 1320
    },
    {
      "epoch": 4.476351351351352,
      "grad_norm": 0.3067188262939453,
      "learning_rate": 0.0001349765504104881,
      "loss": 0.9098,
      "step": 1325
    },
    {
      "epoch": 4.493243243243243,
      "grad_norm": 0.30859634280204773,
      "learning_rate": 0.0001344235504871645,
      "loss": 0.9103,
      "step": 1330
    },
    {
      "epoch": 4.510135135135135,
      "grad_norm": 0.309527724981308,
      "learning_rate": 0.00013386935375244246,
      "loss": 0.9118,
      "step": 1335
    },
    {
      "epoch": 4.527027027027027,
      "grad_norm": 0.29956597089767456,
      "learning_rate": 0.00013331397947420576,
      "loss": 0.9248,
      "step": 1340
    },
    {
      "epoch": 4.543918918918919,
      "grad_norm": 0.30333107709884644,
      "learning_rate": 0.00013275744696127805,
      "loss": 0.9235,
      "step": 1345
    },
    {
      "epoch": 4.5608108108108105,
      "grad_norm": 0.3010920584201813,
      "learning_rate": 0.00013219977556275163,
      "loss": 0.9204,
      "step": 1350
    },
    {
      "epoch": 4.577702702702703,
      "grad_norm": 0.30947473645210266,
      "learning_rate": 0.00013164098466731468,
      "loss": 0.9244,
      "step": 1355
    },
    {
      "epoch": 4.594594594594595,
      "grad_norm": 0.30661630630493164,
      "learning_rate": 0.00013108109370257712,
      "loss": 0.9177,
      "step": 1360
    },
    {
      "epoch": 4.611486486486487,
      "grad_norm": 0.2866823971271515,
      "learning_rate": 0.00013052012213439536,
      "loss": 0.9107,
      "step": 1365
    },
    {
      "epoch": 4.628378378378378,
      "grad_norm": 0.3211285471916199,
      "learning_rate": 0.0001299580894661953,
      "loss": 0.9242,
      "step": 1370
    },
    {
      "epoch": 4.64527027027027,
      "grad_norm": 0.3097619414329529,
      "learning_rate": 0.00012939501523829444,
      "loss": 0.91,
      "step": 1375
    },
    {
      "epoch": 4.662162162162162,
      "grad_norm": 0.30498236417770386,
      "learning_rate": 0.0001288309190272222,
      "loss": 0.9176,
      "step": 1380
    },
    {
      "epoch": 4.679054054054054,
      "grad_norm": 0.31782612204551697,
      "learning_rate": 0.00012826582044503978,
      "loss": 0.91,
      "step": 1385
    },
    {
      "epoch": 4.695945945945946,
      "grad_norm": 0.32527872920036316,
      "learning_rate": 0.00012769973913865794,
      "loss": 0.9119,
      "step": 1390
    },
    {
      "epoch": 4.712837837837838,
      "grad_norm": 0.2965739369392395,
      "learning_rate": 0.000127132694789154,
      "loss": 0.9333,
      "step": 1395
    },
    {
      "epoch": 4.72972972972973,
      "grad_norm": 0.31443119049072266,
      "learning_rate": 0.00012656470711108764,
      "loss": 0.9184,
      "step": 1400
    },
    {
      "epoch": 4.746621621621622,
      "grad_norm": 0.30386343598365784,
      "learning_rate": 0.00012599579585181552,
      "loss": 0.912,
      "step": 1405
    },
    {
      "epoch": 4.763513513513513,
      "grad_norm": 0.2971736788749695,
      "learning_rate": 0.00012542598079080456,
      "loss": 0.9115,
      "step": 1410
    },
    {
      "epoch": 4.780405405405405,
      "grad_norm": 0.29560431838035583,
      "learning_rate": 0.00012485528173894448,
      "loss": 0.9176,
      "step": 1415
    },
    {
      "epoch": 4.797297297297297,
      "grad_norm": 0.30718737840652466,
      "learning_rate": 0.0001242837185378587,
      "loss": 0.9184,
      "step": 1420
    },
    {
      "epoch": 4.8141891891891895,
      "grad_norm": 0.29568740725517273,
      "learning_rate": 0.00012371131105921504,
      "loss": 0.9214,
      "step": 1425
    },
    {
      "epoch": 4.831081081081081,
      "grad_norm": 0.32252946496009827,
      "learning_rate": 0.00012313807920403419,
      "loss": 0.9252,
      "step": 1430
    },
    {
      "epoch": 4.847972972972973,
      "grad_norm": 0.31315141916275024,
      "learning_rate": 0.00012256404290199825,
      "loss": 0.9308,
      "step": 1435
    },
    {
      "epoch": 4.864864864864865,
      "grad_norm": 0.3065871000289917,
      "learning_rate": 0.00012198922211075778,
      "loss": 0.9186,
      "step": 1440
    },
    {
      "epoch": 4.881756756756757,
      "grad_norm": 0.31804540753364563,
      "learning_rate": 0.00012141363681523776,
      "loss": 0.9275,
      "step": 1445
    },
    {
      "epoch": 4.898648648648649,
      "grad_norm": 0.313486784696579,
      "learning_rate": 0.00012083730702694291,
      "loss": 0.9315,
      "step": 1450
    },
    {
      "epoch": 4.91554054054054,
      "grad_norm": 0.31312400102615356,
      "learning_rate": 0.00012026025278326187,
      "loss": 0.934,
      "step": 1455
    },
    {
      "epoch": 4.9324324324324325,
      "grad_norm": 0.321845680475235,
      "learning_rate": 0.00011968249414677055,
      "loss": 0.9266,
      "step": 1460
    },
    {
      "epoch": 4.949324324324325,
      "grad_norm": 0.29238423705101013,
      "learning_rate": 0.00011910405120453476,
      "loss": 0.9203,
      "step": 1465
    },
    {
      "epoch": 4.966216216216216,
      "grad_norm": 0.30449482798576355,
      "learning_rate": 0.00011852494406741165,
      "loss": 0.9254,
      "step": 1470
    },
    {
      "epoch": 4.983108108108108,
      "grad_norm": 0.3126208186149597,
      "learning_rate": 0.00011794519286935055,
      "loss": 0.9181,
      "step": 1475
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.29170361161231995,
      "learning_rate": 0.00011736481776669306,
      "loss": 0.9288,
      "step": 1480
    },
    {
      "epoch": 5.0,
      "eval_loss": 2.0044448375701904,
      "eval_runtime": 0.3932,
      "eval_samples_per_second": 5.087,
      "eval_steps_per_second": 2.543,
      "step": 1480
    },
    {
      "epoch": 5.016891891891892,
      "grad_norm": 0.46076056361198425,
      "learning_rate": 0.0001167838389374722,
      "loss": 0.8221,
      "step": 1485
    },
    {
      "epoch": 5.033783783783784,
      "grad_norm": 0.32739222049713135,
      "learning_rate": 0.00011620227658071087,
      "loss": 0.8178,
      "step": 1490
    },
    {
      "epoch": 5.050675675675675,
      "grad_norm": 0.38803204894065857,
      "learning_rate": 0.00011562015091571963,
      "loss": 0.8143,
      "step": 1495
    },
    {
      "epoch": 5.0675675675675675,
      "grad_norm": 0.32274121046066284,
      "learning_rate": 0.00011503748218139369,
      "loss": 0.821,
      "step": 1500
    },
    {
      "epoch": 5.08445945945946,
      "grad_norm": 0.3647359013557434,
      "learning_rate": 0.00011445429063550926,
      "loss": 0.8265,
      "step": 1505
    },
    {
      "epoch": 5.101351351351352,
      "grad_norm": 0.36681613326072693,
      "learning_rate": 0.00011387059655401932,
      "loss": 0.8248,
      "step": 1510
    },
    {
      "epoch": 5.118243243243243,
      "grad_norm": 0.35085347294807434,
      "learning_rate": 0.00011328642023034857,
      "loss": 0.823,
      "step": 1515
    },
    {
      "epoch": 5.135135135135135,
      "grad_norm": 0.3212147653102875,
      "learning_rate": 0.00011270178197468789,
      "loss": 0.8265,
      "step": 1520
    },
    {
      "epoch": 5.152027027027027,
      "grad_norm": 0.35389629006385803,
      "learning_rate": 0.00011211670211328833,
      "loss": 0.8252,
      "step": 1525
    },
    {
      "epoch": 5.168918918918919,
      "grad_norm": 0.350277841091156,
      "learning_rate": 0.00011153120098775434,
      "loss": 0.8193,
      "step": 1530
    },
    {
      "epoch": 5.1858108108108105,
      "grad_norm": 0.35216981172561646,
      "learning_rate": 0.00011094529895433652,
      "loss": 0.8291,
      "step": 1535
    },
    {
      "epoch": 5.202702702702703,
      "grad_norm": 0.33077818155288696,
      "learning_rate": 0.00011035901638322392,
      "loss": 0.8145,
      "step": 1540
    },
    {
      "epoch": 5.219594594594595,
      "grad_norm": 0.34553956985473633,
      "learning_rate": 0.0001097723736578359,
      "loss": 0.8297,
      "step": 1545
    },
    {
      "epoch": 5.236486486486487,
      "grad_norm": 0.349026083946228,
      "learning_rate": 0.00010918539117411333,
      "loss": 0.8363,
      "step": 1550
    },
    {
      "epoch": 5.253378378378378,
      "grad_norm": 0.34249648451805115,
      "learning_rate": 0.00010859808933980948,
      "loss": 0.8228,
      "step": 1555
    },
    {
      "epoch": 5.27027027027027,
      "grad_norm": 0.3591874837875366,
      "learning_rate": 0.00010801048857378071,
      "loss": 0.8272,
      "step": 1560
    },
    {
      "epoch": 5.287162162162162,
      "grad_norm": 0.3266925513744354,
      "learning_rate": 0.00010742260930527625,
      "loss": 0.8264,
      "step": 1565
    },
    {
      "epoch": 5.304054054054054,
      "grad_norm": 0.3557049632072449,
      "learning_rate": 0.00010683447197322817,
      "loss": 0.8327,
      "step": 1570
    },
    {
      "epoch": 5.320945945945946,
      "grad_norm": 0.34309855103492737,
      "learning_rate": 0.00010624609702554069,
      "loss": 0.8362,
      "step": 1575
    },
    {
      "epoch": 5.337837837837838,
      "grad_norm": 0.33597272634506226,
      "learning_rate": 0.00010565750491837925,
      "loss": 0.8274,
      "step": 1580
    },
    {
      "epoch": 5.35472972972973,
      "grad_norm": 0.33070334792137146,
      "learning_rate": 0.0001050687161154593,
      "loss": 0.8309,
      "step": 1585
    },
    {
      "epoch": 5.371621621621622,
      "grad_norm": 0.34598931670188904,
      "learning_rate": 0.00010447975108733492,
      "loss": 0.846,
      "step": 1590
    },
    {
      "epoch": 5.388513513513513,
      "grad_norm": 0.3528457283973694,
      "learning_rate": 0.00010389063031068698,
      "loss": 0.8199,
      "step": 1595
    },
    {
      "epoch": 5.405405405405405,
      "grad_norm": 0.3506796956062317,
      "learning_rate": 0.00010330137426761135,
      "loss": 0.8377,
      "step": 1600
    },
    {
      "epoch": 5.422297297297297,
      "grad_norm": 0.35415780544281006,
      "learning_rate": 0.00010271200344490674,
      "loss": 0.8357,
      "step": 1605
    },
    {
      "epoch": 5.4391891891891895,
      "grad_norm": 0.33977410197257996,
      "learning_rate": 0.00010212253833336237,
      "loss": 0.8273,
      "step": 1610
    },
    {
      "epoch": 5.456081081081081,
      "grad_norm": 0.3760969638824463,
      "learning_rate": 0.00010153299942704566,
      "loss": 0.8404,
      "step": 1615
    },
    {
      "epoch": 5.472972972972973,
      "grad_norm": 0.3504043519496918,
      "learning_rate": 0.00010094340722258969,
      "loss": 0.8368,
      "step": 1620
    },
    {
      "epoch": 5.489864864864865,
      "grad_norm": 0.3397385776042938,
      "learning_rate": 0.00010035378221848053,
      "loss": 0.8327,
      "step": 1625
    },
    {
      "epoch": 5.506756756756757,
      "grad_norm": 0.33861246705055237,
      "learning_rate": 9.976414491434463e-05,
      "loss": 0.8419,
      "step": 1630
    },
    {
      "epoch": 5.523648648648649,
      "grad_norm": 0.3566323220729828,
      "learning_rate": 9.917451581023607e-05,
      "loss": 0.8366,
      "step": 1635
    },
    {
      "epoch": 5.54054054054054,
      "grad_norm": 0.3398774266242981,
      "learning_rate": 9.858491540592382e-05,
      "loss": 0.8306,
      "step": 1640
    },
    {
      "epoch": 5.5574324324324325,
      "grad_norm": 0.3483969271183014,
      "learning_rate": 9.799536420017906e-05,
      "loss": 0.8333,
      "step": 1645
    },
    {
      "epoch": 5.574324324324325,
      "grad_norm": 0.34190595149993896,
      "learning_rate": 9.740588269006246e-05,
      "loss": 0.838,
      "step": 1650
    },
    {
      "epoch": 5.591216216216216,
      "grad_norm": 0.35382217168807983,
      "learning_rate": 9.681649137021158e-05,
      "loss": 0.8489,
      "step": 1655
    },
    {
      "epoch": 5.608108108108108,
      "grad_norm": 0.3321906328201294,
      "learning_rate": 9.622721073212832e-05,
      "loss": 0.8364,
      "step": 1660
    },
    {
      "epoch": 5.625,
      "grad_norm": 0.34170404076576233,
      "learning_rate": 9.563806126346642e-05,
      "loss": 0.841,
      "step": 1665
    },
    {
      "epoch": 5.641891891891892,
      "grad_norm": 0.34292900562286377,
      "learning_rate": 9.504906344731932e-05,
      "loss": 0.8366,
      "step": 1670
    },
    {
      "epoch": 5.658783783783784,
      "grad_norm": 0.35314562916755676,
      "learning_rate": 9.446023776150787e-05,
      "loss": 0.838,
      "step": 1675
    },
    {
      "epoch": 5.675675675675675,
      "grad_norm": 0.3411477506160736,
      "learning_rate": 9.38716046778684e-05,
      "loss": 0.8441,
      "step": 1680
    },
    {
      "epoch": 5.6925675675675675,
      "grad_norm": 0.3432328701019287,
      "learning_rate": 9.328318466154102e-05,
      "loss": 0.8459,
      "step": 1685
    },
    {
      "epoch": 5.70945945945946,
      "grad_norm": 0.33872732520103455,
      "learning_rate": 9.269499817025814e-05,
      "loss": 0.8388,
      "step": 1690
    },
    {
      "epoch": 5.726351351351351,
      "grad_norm": 0.34312689304351807,
      "learning_rate": 9.210706565363305e-05,
      "loss": 0.8332,
      "step": 1695
    },
    {
      "epoch": 5.743243243243243,
      "grad_norm": 0.3369201123714447,
      "learning_rate": 9.151940755244912e-05,
      "loss": 0.831,
      "step": 1700
    },
    {
      "epoch": 5.760135135135135,
      "grad_norm": 0.34367725253105164,
      "learning_rate": 9.093204429794898e-05,
      "loss": 0.8303,
      "step": 1705
    },
    {
      "epoch": 5.777027027027027,
      "grad_norm": 0.3678775727748871,
      "learning_rate": 9.034499631112437e-05,
      "loss": 0.8413,
      "step": 1710
    },
    {
      "epoch": 5.793918918918919,
      "grad_norm": 0.34643349051475525,
      "learning_rate": 8.975828400200592e-05,
      "loss": 0.845,
      "step": 1715
    },
    {
      "epoch": 5.8108108108108105,
      "grad_norm": 0.35629916191101074,
      "learning_rate": 8.917192776895382e-05,
      "loss": 0.836,
      "step": 1720
    },
    {
      "epoch": 5.827702702702703,
      "grad_norm": 0.3395968973636627,
      "learning_rate": 8.858594799794835e-05,
      "loss": 0.8384,
      "step": 1725
    },
    {
      "epoch": 5.844594594594595,
      "grad_norm": 0.3399130403995514,
      "learning_rate": 8.800036506188129e-05,
      "loss": 0.841,
      "step": 1730
    },
    {
      "epoch": 5.861486486486487,
      "grad_norm": 0.3563048541545868,
      "learning_rate": 8.741519931984766e-05,
      "loss": 0.8388,
      "step": 1735
    },
    {
      "epoch": 5.878378378378378,
      "grad_norm": 0.34680601954460144,
      "learning_rate": 8.683047111643763e-05,
      "loss": 0.8368,
      "step": 1740
    },
    {
      "epoch": 5.89527027027027,
      "grad_norm": 0.3650359511375427,
      "learning_rate": 8.624620078102951e-05,
      "loss": 0.8447,
      "step": 1745
    },
    {
      "epoch": 5.912162162162162,
      "grad_norm": 0.34037554264068604,
      "learning_rate": 8.566240862708274e-05,
      "loss": 0.8355,
      "step": 1750
    },
    {
      "epoch": 5.929054054054054,
      "grad_norm": 0.35734692215919495,
      "learning_rate": 8.507911495143173e-05,
      "loss": 0.8425,
      "step": 1755
    },
    {
      "epoch": 5.945945945945946,
      "grad_norm": 0.3381343483924866,
      "learning_rate": 8.449634003358022e-05,
      "loss": 0.8418,
      "step": 1760
    },
    {
      "epoch": 5.962837837837838,
      "grad_norm": 0.3489098846912384,
      "learning_rate": 8.39141041349961e-05,
      "loss": 0.847,
      "step": 1765
    },
    {
      "epoch": 5.97972972972973,
      "grad_norm": 0.361604243516922,
      "learning_rate": 8.33324274984071e-05,
      "loss": 0.8428,
      "step": 1770
    },
    {
      "epoch": 5.996621621621622,
      "grad_norm": 0.34529900550842285,
      "learning_rate": 8.275133034709699e-05,
      "loss": 0.8437,
      "step": 1775
    },
    {
      "epoch": 6.0,
      "eval_loss": 2.170966863632202,
      "eval_runtime": 0.3935,
      "eval_samples_per_second": 5.083,
      "eval_steps_per_second": 2.541,
      "step": 1776
    },
    {
      "epoch": 6.013513513513513,
      "grad_norm": 0.3619636595249176,
      "learning_rate": 8.217083288420241e-05,
      "loss": 0.7823,
      "step": 1780
    },
    {
      "epoch": 6.030405405405405,
      "grad_norm": 0.33571234345436096,
      "learning_rate": 8.159095529201049e-05,
      "loss": 0.7663,
      "step": 1785
    },
    {
      "epoch": 6.047297297297297,
      "grad_norm": 0.3377952575683594,
      "learning_rate": 8.101171773125716e-05,
      "loss": 0.764,
      "step": 1790
    },
    {
      "epoch": 6.0641891891891895,
      "grad_norm": 0.3851635754108429,
      "learning_rate": 8.043314034042631e-05,
      "loss": 0.7543,
      "step": 1795
    },
    {
      "epoch": 6.081081081081081,
      "grad_norm": 0.3411933481693268,
      "learning_rate": 7.985524323504948e-05,
      "loss": 0.7569,
      "step": 1800
    },
    {
      "epoch": 6.097972972972973,
      "grad_norm": 0.3682069480419159,
      "learning_rate": 7.927804650700659e-05,
      "loss": 0.7546,
      "step": 1805
    },
    {
      "epoch": 6.114864864864865,
      "grad_norm": 0.35545244812965393,
      "learning_rate": 7.870157022382735e-05,
      "loss": 0.7615,
      "step": 1810
    },
    {
      "epoch": 6.131756756756757,
      "grad_norm": 0.39011305570602417,
      "learning_rate": 7.812583442799368e-05,
      "loss": 0.7611,
      "step": 1815
    },
    {
      "epoch": 6.148648648648648,
      "grad_norm": 0.33269399404525757,
      "learning_rate": 7.755085913624274e-05,
      "loss": 0.7599,
      "step": 1820
    },
    {
      "epoch": 6.16554054054054,
      "grad_norm": 0.3615286946296692,
      "learning_rate": 7.697666433887108e-05,
      "loss": 0.7501,
      "step": 1825
    },
    {
      "epoch": 6.1824324324324325,
      "grad_norm": 0.3396786153316498,
      "learning_rate": 7.640326999903967e-05,
      "loss": 0.757,
      "step": 1830
    },
    {
      "epoch": 6.199324324324325,
      "grad_norm": 0.38157907128334045,
      "learning_rate": 7.583069605207975e-05,
      "loss": 0.7506,
      "step": 1835
    },
    {
      "epoch": 6.216216216216216,
      "grad_norm": 0.3560575842857361,
      "learning_rate": 7.525896240479976e-05,
      "loss": 0.754,
      "step": 1840
    },
    {
      "epoch": 6.233108108108108,
      "grad_norm": 0.3762560784816742,
      "learning_rate": 7.468808893479327e-05,
      "loss": 0.7614,
      "step": 1845
    },
    {
      "epoch": 6.25,
      "grad_norm": 0.36987847089767456,
      "learning_rate": 7.411809548974792e-05,
      "loss": 0.7637,
      "step": 1850
    },
    {
      "epoch": 6.266891891891892,
      "grad_norm": 0.406857967376709,
      "learning_rate": 7.354900188675525e-05,
      "loss": 0.761,
      "step": 1855
    },
    {
      "epoch": 6.283783783783784,
      "grad_norm": 0.3850703835487366,
      "learning_rate": 7.29808279116218e-05,
      "loss": 0.7656,
      "step": 1860
    },
    {
      "epoch": 6.300675675675675,
      "grad_norm": 0.34307488799095154,
      "learning_rate": 7.24135933181812e-05,
      "loss": 0.7501,
      "step": 1865
    },
    {
      "epoch": 6.3175675675675675,
      "grad_norm": 0.3922889232635498,
      "learning_rate": 7.184731782760746e-05,
      "loss": 0.7584,
      "step": 1870
    },
    {
      "epoch": 6.33445945945946,
      "grad_norm": 0.36379769444465637,
      "learning_rate": 7.128202112772912e-05,
      "loss": 0.7626,
      "step": 1875
    },
    {
      "epoch": 6.351351351351352,
      "grad_norm": 0.3796177804470062,
      "learning_rate": 7.071772287234497e-05,
      "loss": 0.7739,
      "step": 1880
    },
    {
      "epoch": 6.368243243243243,
      "grad_norm": 0.3752601146697998,
      "learning_rate": 7.015444268054059e-05,
      "loss": 0.7658,
      "step": 1885
    },
    {
      "epoch": 6.385135135135135,
      "grad_norm": 0.3463265597820282,
      "learning_rate": 6.959220013600641e-05,
      "loss": 0.7584,
      "step": 1890
    },
    {
      "epoch": 6.402027027027027,
      "grad_norm": 0.3532774746417999,
      "learning_rate": 6.903101478635662e-05,
      "loss": 0.7715,
      "step": 1895
    },
    {
      "epoch": 6.418918918918919,
      "grad_norm": 0.3608658015727997,
      "learning_rate": 6.847090614244977e-05,
      "loss": 0.7682,
      "step": 1900
    },
    {
      "epoch": 6.4358108108108105,
      "grad_norm": 0.39848268032073975,
      "learning_rate": 6.791189367771025e-05,
      "loss": 0.7658,
      "step": 1905
    },
    {
      "epoch": 6.452702702702703,
      "grad_norm": 0.3448575437068939,
      "learning_rate": 6.735399682745145e-05,
      "loss": 0.7736,
      "step": 1910
    },
    {
      "epoch": 6.469594594594595,
      "grad_norm": 0.3646429181098938,
      "learning_rate": 6.679723498819986e-05,
      "loss": 0.7657,
      "step": 1915
    },
    {
      "epoch": 6.486486486486487,
      "grad_norm": 0.3576849699020386,
      "learning_rate": 6.624162751702076e-05,
      "loss": 0.7741,
      "step": 1920
    },
    {
      "epoch": 6.503378378378378,
      "grad_norm": 0.3550150990486145,
      "learning_rate": 6.568719373084538e-05,
      "loss": 0.7636,
      "step": 1925
    },
    {
      "epoch": 6.52027027027027,
      "grad_norm": 0.3779493570327759,
      "learning_rate": 6.513395290579901e-05,
      "loss": 0.7641,
      "step": 1930
    },
    {
      "epoch": 6.537162162162162,
      "grad_norm": 0.36017805337905884,
      "learning_rate": 6.458192427653112e-05,
      "loss": 0.7676,
      "step": 1935
    },
    {
      "epoch": 6.554054054054054,
      "grad_norm": 0.38434022665023804,
      "learning_rate": 6.403112703554643e-05,
      "loss": 0.7701,
      "step": 1940
    },
    {
      "epoch": 6.570945945945946,
      "grad_norm": 0.358761191368103,
      "learning_rate": 6.348158033253773e-05,
      "loss": 0.7539,
      "step": 1945
    },
    {
      "epoch": 6.587837837837838,
      "grad_norm": 0.37006473541259766,
      "learning_rate": 6.293330327372005e-05,
      "loss": 0.7767,
      "step": 1950
    },
    {
      "epoch": 6.60472972972973,
      "grad_norm": 0.3721785247325897,
      "learning_rate": 6.238631492116644e-05,
      "loss": 0.7715,
      "step": 1955
    },
    {
      "epoch": 6.621621621621622,
      "grad_norm": 0.3626702129840851,
      "learning_rate": 6.184063429214515e-05,
      "loss": 0.766,
      "step": 1960
    },
    {
      "epoch": 6.638513513513513,
      "grad_norm": 0.37497058510780334,
      "learning_rate": 6.129628035845861e-05,
      "loss": 0.7658,
      "step": 1965
    },
    {
      "epoch": 6.655405405405405,
      "grad_norm": 0.36465275287628174,
      "learning_rate": 6.0753272045783625e-05,
      "loss": 0.7666,
      "step": 1970
    },
    {
      "epoch": 6.672297297297297,
      "grad_norm": 0.3648873567581177,
      "learning_rate": 6.021162823301358e-05,
      "loss": 0.7661,
      "step": 1975
    },
    {
      "epoch": 6.6891891891891895,
      "grad_norm": 0.3486686646938324,
      "learning_rate": 5.967136775160187e-05,
      "loss": 0.7638,
      "step": 1980
    },
    {
      "epoch": 6.706081081081081,
      "grad_norm": 0.36590924859046936,
      "learning_rate": 5.913250938490744e-05,
      "loss": 0.7753,
      "step": 1985
    },
    {
      "epoch": 6.722972972972973,
      "grad_norm": 0.36060139536857605,
      "learning_rate": 5.859507186754146e-05,
      "loss": 0.778,
      "step": 1990
    },
    {
      "epoch": 6.739864864864865,
      "grad_norm": 0.4011731743812561,
      "learning_rate": 5.80590738847162e-05,
      "loss": 0.7653,
      "step": 1995
    },
    {
      "epoch": 6.756756756756757,
      "grad_norm": 0.38411641120910645,
      "learning_rate": 5.752453407159522e-05,
      "loss": 0.76,
      "step": 2000
    },
    {
      "epoch": 6.773648648648649,
      "grad_norm": 0.37505170702934265,
      "learning_rate": 5.699147101264566e-05,
      "loss": 0.7709,
      "step": 2005
    },
    {
      "epoch": 6.79054054054054,
      "grad_norm": 0.3904276192188263,
      "learning_rate": 5.645990324099197e-05,
      "loss": 0.7659,
      "step": 2010
    },
    {
      "epoch": 6.8074324324324325,
      "grad_norm": 0.3751082420349121,
      "learning_rate": 5.5929849237771556e-05,
      "loss": 0.7564,
      "step": 2015
    },
    {
      "epoch": 6.824324324324325,
      "grad_norm": 0.3594505488872528,
      "learning_rate": 5.540132743149242e-05,
      "loss": 0.7723,
      "step": 2020
    },
    {
      "epoch": 6.841216216216216,
      "grad_norm": 0.3686336874961853,
      "learning_rate": 5.487435619739214e-05,
      "loss": 0.7645,
      "step": 2025
    },
    {
      "epoch": 6.858108108108108,
      "grad_norm": 0.37959080934524536,
      "learning_rate": 5.434895385679937e-05,
      "loss": 0.761,
      "step": 2030
    },
    {
      "epoch": 6.875,
      "grad_norm": 0.38148415088653564,
      "learning_rate": 5.382513867649663e-05,
      "loss": 0.766,
      "step": 2035
    },
    {
      "epoch": 6.891891891891892,
      "grad_norm": 0.37155023217201233,
      "learning_rate": 5.33029288680852e-05,
      "loss": 0.7753,
      "step": 2040
    },
    {
      "epoch": 6.908783783783784,
      "grad_norm": 0.3691665828227997,
      "learning_rate": 5.2782342587352154e-05,
      "loss": 0.7641,
      "step": 2045
    },
    {
      "epoch": 6.925675675675675,
      "grad_norm": 0.4007939398288727,
      "learning_rate": 5.226339793363898e-05,
      "loss": 0.7717,
      "step": 2050
    },
    {
      "epoch": 6.9425675675675675,
      "grad_norm": 0.36151981353759766,
      "learning_rate": 5.174611294921224e-05,
      "loss": 0.7832,
      "step": 2055
    },
    {
      "epoch": 6.95945945945946,
      "grad_norm": 0.38270819187164307,
      "learning_rate": 5.123050561863657e-05,
      "loss": 0.7619,
      "step": 2060
    },
    {
      "epoch": 6.976351351351351,
      "grad_norm": 0.35164088010787964,
      "learning_rate": 5.071659386814907e-05,
      "loss": 0.7725,
      "step": 2065
    },
    {
      "epoch": 6.993243243243243,
      "grad_norm": 0.3853191137313843,
      "learning_rate": 5.020439556503629e-05,
      "loss": 0.7654,
      "step": 2070
    },
    {
      "epoch": 7.0,
      "eval_loss": 2.40800142288208,
      "eval_runtime": 0.394,
      "eval_samples_per_second": 5.076,
      "eval_steps_per_second": 2.538,
      "step": 2072
    },
    {
      "epoch": 7.010135135135135,
      "grad_norm": 0.3015079200267792,
      "learning_rate": 4.969392851701305e-05,
      "loss": 0.7406,
      "step": 2075
    },
    {
      "epoch": 7.027027027027027,
      "grad_norm": 0.47633570432662964,
      "learning_rate": 4.918521047160308e-05,
      "loss": 0.7101,
      "step": 2080
    },
    {
      "epoch": 7.043918918918919,
      "grad_norm": 0.31147924065589905,
      "learning_rate": 4.8678259115522215e-05,
      "loss": 0.7144,
      "step": 2085
    },
    {
      "epoch": 7.0608108108108105,
      "grad_norm": 0.3377055823802948,
      "learning_rate": 4.817309207406346e-05,
      "loss": 0.7091,
      "step": 2090
    },
    {
      "epoch": 7.077702702702703,
      "grad_norm": 0.3804275393486023,
      "learning_rate": 4.7669726910484e-05,
      "loss": 0.7083,
      "step": 2095
    },
    {
      "epoch": 7.094594594594595,
      "grad_norm": 0.3246239721775055,
      "learning_rate": 4.716818112539485e-05,
      "loss": 0.7076,
      "step": 2100
    },
    {
      "epoch": 7.111486486486487,
      "grad_norm": 0.3758985996246338,
      "learning_rate": 4.666847215615226e-05,
      "loss": 0.7112,
      "step": 2105
    },
    {
      "epoch": 7.128378378378378,
      "grad_norm": 0.3744657337665558,
      "learning_rate": 4.617061737625139e-05,
      "loss": 0.714,
      "step": 2110
    },
    {
      "epoch": 7.14527027027027,
      "grad_norm": 0.35453036427497864,
      "learning_rate": 4.567463409472255e-05,
      "loss": 0.7144,
      "step": 2115
    },
    {
      "epoch": 7.162162162162162,
      "grad_norm": 0.36035045981407166,
      "learning_rate": 4.518053955552903e-05,
      "loss": 0.7153,
      "step": 2120
    },
    {
      "epoch": 7.179054054054054,
      "grad_norm": 0.362409383058548,
      "learning_rate": 4.468835093696796e-05,
      "loss": 0.7179,
      "step": 2125
    },
    {
      "epoch": 7.195945945945946,
      "grad_norm": 0.4178987145423889,
      "learning_rate": 4.419808535107287e-05,
      "loss": 0.7109,
      "step": 2130
    },
    {
      "epoch": 7.212837837837838,
      "grad_norm": 0.36226364970207214,
      "learning_rate": 4.370975984301866e-05,
      "loss": 0.7112,
      "step": 2135
    },
    {
      "epoch": 7.22972972972973,
      "grad_norm": 0.34748539328575134,
      "learning_rate": 4.322339139052921e-05,
      "loss": 0.7115,
      "step": 2140
    },
    {
      "epoch": 7.246621621621622,
      "grad_norm": 0.3634675443172455,
      "learning_rate": 4.273899690328702e-05,
      "loss": 0.7043,
      "step": 2145
    },
    {
      "epoch": 7.263513513513513,
      "grad_norm": 0.3675166070461273,
      "learning_rate": 4.2256593222345185e-05,
      "loss": 0.7124,
      "step": 2150
    },
    {
      "epoch": 7.280405405405405,
      "grad_norm": 0.33852246403694153,
      "learning_rate": 4.177619711954211e-05,
      "loss": 0.7122,
      "step": 2155
    },
    {
      "epoch": 7.297297297297297,
      "grad_norm": 0.34997648000717163,
      "learning_rate": 4.129782529691815e-05,
      "loss": 0.7161,
      "step": 2160
    },
    {
      "epoch": 7.3141891891891895,
      "grad_norm": 0.3947296738624573,
      "learning_rate": 4.082149438613514e-05,
      "loss": 0.715,
      "step": 2165
    },
    {
      "epoch": 7.331081081081081,
      "grad_norm": 0.3766041696071625,
      "learning_rate": 4.034722094789809e-05,
      "loss": 0.7104,
      "step": 2170
    },
    {
      "epoch": 7.347972972972973,
      "grad_norm": 0.39250659942626953,
      "learning_rate": 3.987502147137928e-05,
      "loss": 0.7157,
      "step": 2175
    },
    {
      "epoch": 7.364864864864865,
      "grad_norm": 0.356827050447464,
      "learning_rate": 3.9404912373645185e-05,
      "loss": 0.7104,
      "step": 2180
    },
    {
      "epoch": 7.381756756756757,
      "grad_norm": 0.3731355369091034,
      "learning_rate": 3.893690999908562e-05,
      "loss": 0.7167,
      "step": 2185
    },
    {
      "epoch": 7.398648648648648,
      "grad_norm": 0.3654830753803253,
      "learning_rate": 3.8471030618845375e-05,
      "loss": 0.7151,
      "step": 2190
    },
    {
      "epoch": 7.41554054054054,
      "grad_norm": 0.3466781675815582,
      "learning_rate": 3.800729043025871e-05,
      "loss": 0.7208,
      "step": 2195
    },
    {
      "epoch": 7.4324324324324325,
      "grad_norm": 0.37476223707199097,
      "learning_rate": 3.7545705556286126e-05,
      "loss": 0.7083,
      "step": 2200
    },
    {
      "epoch": 7.449324324324325,
      "grad_norm": 0.361871600151062,
      "learning_rate": 3.708629204495371e-05,
      "loss": 0.7195,
      "step": 2205
    },
    {
      "epoch": 7.466216216216216,
      "grad_norm": 0.3652123510837555,
      "learning_rate": 3.662906586879542e-05,
      "loss": 0.7132,
      "step": 2210
    },
    {
      "epoch": 7.483108108108108,
      "grad_norm": 0.36584657430648804,
      "learning_rate": 3.61740429242975e-05,
      "loss": 0.71,
      "step": 2215
    },
    {
      "epoch": 7.5,
      "grad_norm": 0.34037116169929504,
      "learning_rate": 3.5721239031346066e-05,
      "loss": 0.7175,
      "step": 2220
    },
    {
      "epoch": 7.516891891891892,
      "grad_norm": 0.34989210963249207,
      "learning_rate": 3.5270669932676926e-05,
      "loss": 0.7236,
      "step": 2225
    },
    {
      "epoch": 7.533783783783784,
      "grad_norm": 0.35882651805877686,
      "learning_rate": 3.48223512933282e-05,
      "loss": 0.7159,
      "step": 2230
    },
    {
      "epoch": 7.550675675675675,
      "grad_norm": 0.32638296484947205,
      "learning_rate": 3.437629870009591e-05,
      "loss": 0.7221,
      "step": 2235
    },
    {
      "epoch": 7.5675675675675675,
      "grad_norm": 0.37272724509239197,
      "learning_rate": 3.393252766099187e-05,
      "loss": 0.7132,
      "step": 2240
    },
    {
      "epoch": 7.58445945945946,
      "grad_norm": 0.3713020086288452,
      "learning_rate": 3.349105360470456e-05,
      "loss": 0.7246,
      "step": 2245
    },
    {
      "epoch": 7.601351351351351,
      "grad_norm": 0.35202324390411377,
      "learning_rate": 3.305189188006281e-05,
      "loss": 0.7289,
      "step": 2250
    },
    {
      "epoch": 7.618243243243243,
      "grad_norm": 0.3543793559074402,
      "learning_rate": 3.2615057755502e-05,
      "loss": 0.7129,
      "step": 2255
    },
    {
      "epoch": 7.635135135135135,
      "grad_norm": 0.3830936849117279,
      "learning_rate": 3.218056641853337e-05,
      "loss": 0.7287,
      "step": 2260
    },
    {
      "epoch": 7.652027027027027,
      "grad_norm": 0.36788904666900635,
      "learning_rate": 3.174843297521596e-05,
      "loss": 0.7107,
      "step": 2265
    },
    {
      "epoch": 7.668918918918919,
      "grad_norm": 0.34784045815467834,
      "learning_rate": 3.1318672449631284e-05,
      "loss": 0.7129,
      "step": 2270
    },
    {
      "epoch": 7.6858108108108105,
      "grad_norm": 0.3825985789299011,
      "learning_rate": 3.089129978336118e-05,
      "loss": 0.7048,
      "step": 2275
    },
    {
      "epoch": 7.702702702702703,
      "grad_norm": 0.4050070643424988,
      "learning_rate": 3.0466329834968233e-05,
      "loss": 0.7165,
      "step": 2280
    },
    {
      "epoch": 7.719594594594595,
      "grad_norm": 0.3602808117866516,
      "learning_rate": 3.0043777379479098e-05,
      "loss": 0.7163,
      "step": 2285
    },
    {
      "epoch": 7.736486486486487,
      "grad_norm": 0.35466307401657104,
      "learning_rate": 2.9623657107870996e-05,
      "loss": 0.7149,
      "step": 2290
    },
    {
      "epoch": 7.753378378378378,
      "grad_norm": 0.3452269732952118,
      "learning_rate": 2.9205983626560874e-05,
      "loss": 0.7196,
      "step": 2295
    },
    {
      "epoch": 7.77027027027027,
      "grad_norm": 0.3634475767612457,
      "learning_rate": 2.879077145689746e-05,
      "loss": 0.7153,
      "step": 2300
    },
    {
      "epoch": 7.787162162162162,
      "grad_norm": 0.3627691864967346,
      "learning_rate": 2.8378035034656625e-05,
      "loss": 0.7112,
      "step": 2305
    },
    {
      "epoch": 7.804054054054054,
      "grad_norm": 0.3404904901981354,
      "learning_rate": 2.7967788709539233e-05,
      "loss": 0.7159,
      "step": 2310
    },
    {
      "epoch": 7.820945945945946,
      "grad_norm": 0.38526642322540283,
      "learning_rate": 2.7560046744672495e-05,
      "loss": 0.7218,
      "step": 2315
    },
    {
      "epoch": 7.837837837837838,
      "grad_norm": 0.354755699634552,
      "learning_rate": 2.7154823316113932e-05,
      "loss": 0.7123,
      "step": 2320
    },
    {
      "epoch": 7.85472972972973,
      "grad_norm": 0.3782195746898651,
      "learning_rate": 2.6752132512358475e-05,
      "loss": 0.7091,
      "step": 2325
    },
    {
      "epoch": 7.871621621621622,
      "grad_norm": 0.39233171939849854,
      "learning_rate": 2.6351988333848788e-05,
      "loss": 0.7208,
      "step": 2330
    },
    {
      "epoch": 7.888513513513513,
      "grad_norm": 0.4432124197483063,
      "learning_rate": 2.5954404692488433e-05,
      "loss": 0.7032,
      "step": 2335
    },
    {
      "epoch": 7.905405405405405,
      "grad_norm": 0.3653867542743683,
      "learning_rate": 2.5559395411158115e-05,
      "loss": 0.7246,
      "step": 2340
    },
    {
      "epoch": 7.922297297297297,
      "grad_norm": 0.37708407640457153,
      "learning_rate": 2.5166974223235296e-05,
      "loss": 0.7135,
      "step": 2345
    },
    {
      "epoch": 7.9391891891891895,
      "grad_norm": 0.3550487160682678,
      "learning_rate": 2.4777154772116496e-05,
      "loss": 0.7105,
      "step": 2350
    },
    {
      "epoch": 7.956081081081081,
      "grad_norm": 0.35054445266723633,
      "learning_rate": 2.438995061074314e-05,
      "loss": 0.7179,
      "step": 2355
    },
    {
      "epoch": 7.972972972972973,
      "grad_norm": 0.35555845499038696,
      "learning_rate": 2.4005375201130274e-05,
      "loss": 0.7076,
      "step": 2360
    },
    {
      "epoch": 7.989864864864865,
      "grad_norm": 0.38198524713516235,
      "learning_rate": 2.362344191389846e-05,
      "loss": 0.7117,
      "step": 2365
    },
    {
      "epoch": 8.0,
      "eval_loss": 2.655390977859497,
      "eval_runtime": 0.3941,
      "eval_samples_per_second": 5.074,
      "eval_steps_per_second": 2.537,
      "step": 2368
    },
    {
      "epoch": 8.006756756756756,
      "grad_norm": 0.2672366499900818,
      "learning_rate": 2.324416402780907e-05,
      "loss": 0.7016,
      "step": 2370
    },
    {
      "epoch": 8.02364864864865,
      "grad_norm": 0.3170325756072998,
      "learning_rate": 2.2867554729302542e-05,
      "loss": 0.6812,
      "step": 2375
    },
    {
      "epoch": 8.04054054054054,
      "grad_norm": 0.3713083863258362,
      "learning_rate": 2.249362711203985e-05,
      "loss": 0.6825,
      "step": 2380
    },
    {
      "epoch": 8.057432432432432,
      "grad_norm": 0.3441585898399353,
      "learning_rate": 2.2122394176447416e-05,
      "loss": 0.6786,
      "step": 2385
    },
    {
      "epoch": 8.074324324324325,
      "grad_norm": 0.29649627208709717,
      "learning_rate": 2.1753868829265046e-05,
      "loss": 0.671,
      "step": 2390
    },
    {
      "epoch": 8.091216216216216,
      "grad_norm": 0.31710395216941833,
      "learning_rate": 2.1388063883097152e-05,
      "loss": 0.6788,
      "step": 2395
    },
    {
      "epoch": 8.108108108108109,
      "grad_norm": 0.3464438319206238,
      "learning_rate": 2.102499205596743e-05,
      "loss": 0.6843,
      "step": 2400
    },
    {
      "epoch": 8.125,
      "grad_norm": 0.3463502824306488,
      "learning_rate": 2.0664665970876496e-05,
      "loss": 0.6896,
      "step": 2405
    },
    {
      "epoch": 8.141891891891891,
      "grad_norm": 0.32347431778907776,
      "learning_rate": 2.0307098155363236e-05,
      "loss": 0.6949,
      "step": 2410
    },
    {
      "epoch": 8.158783783783784,
      "grad_norm": 0.30408981442451477,
      "learning_rate": 1.9952301041069122e-05,
      "loss": 0.6808,
      "step": 2415
    },
    {
      "epoch": 8.175675675675675,
      "grad_norm": 0.3631693124771118,
      "learning_rate": 1.9600286963305957e-05,
      "loss": 0.6882,
      "step": 2420
    },
    {
      "epoch": 8.192567567567568,
      "grad_norm": 0.31960511207580566,
      "learning_rate": 1.9251068160627173e-05,
      "loss": 0.6849,
      "step": 2425
    },
    {
      "epoch": 8.20945945945946,
      "grad_norm": 0.3153926134109497,
      "learning_rate": 1.8904656774402208e-05,
      "loss": 0.6768,
      "step": 2430
    },
    {
      "epoch": 8.22635135135135,
      "grad_norm": 0.3084424138069153,
      "learning_rate": 1.8561064848394382e-05,
      "loss": 0.6744,
      "step": 2435
    },
    {
      "epoch": 8.243243243243244,
      "grad_norm": 0.3217174708843231,
      "learning_rate": 1.8220304328342252e-05,
      "loss": 0.6882,
      "step": 2440
    },
    {
      "epoch": 8.260135135135135,
      "grad_norm": 0.3653244972229004,
      "learning_rate": 1.7882387061544182e-05,
      "loss": 0.6812,
      "step": 2445
    },
    {
      "epoch": 8.277027027027026,
      "grad_norm": 0.32076555490493774,
      "learning_rate": 1.754732479644655e-05,
      "loss": 0.6835,
      "step": 2450
    },
    {
      "epoch": 8.29391891891892,
      "grad_norm": 0.35145509243011475,
      "learning_rate": 1.721512918223527e-05,
      "loss": 0.6885,
      "step": 2455
    },
    {
      "epoch": 8.31081081081081,
      "grad_norm": 0.3196760416030884,
      "learning_rate": 1.688581176843066e-05,
      "loss": 0.6814,
      "step": 2460
    },
    {
      "epoch": 8.327702702702704,
      "grad_norm": 0.34739652276039124,
      "learning_rate": 1.6559384004486055e-05,
      "loss": 0.6856,
      "step": 2465
    },
    {
      "epoch": 8.344594594594595,
      "grad_norm": 0.3565291166305542,
      "learning_rate": 1.6235857239389696e-05,
      "loss": 0.6849,
      "step": 2470
    },
    {
      "epoch": 8.361486486486486,
      "grad_norm": 0.3656858205795288,
      "learning_rate": 1.5915242721270074e-05,
      "loss": 0.681,
      "step": 2475
    },
    {
      "epoch": 8.378378378378379,
      "grad_norm": 0.32651442289352417,
      "learning_rate": 1.5597551597004966e-05,
      "loss": 0.683,
      "step": 2480
    },
    {
      "epoch": 8.39527027027027,
      "grad_norm": 0.3386393189430237,
      "learning_rate": 1.5282794911833887e-05,
      "loss": 0.6823,
      "step": 2485
    },
    {
      "epoch": 8.412162162162161,
      "grad_norm": 0.31998586654663086,
      "learning_rate": 1.4970983608973942e-05,
      "loss": 0.6788,
      "step": 2490
    },
    {
      "epoch": 8.429054054054054,
      "grad_norm": 0.34341830015182495,
      "learning_rate": 1.4662128529239572e-05,
      "loss": 0.6944,
      "step": 2495
    },
    {
      "epoch": 8.445945945945946,
      "grad_norm": 0.32450416684150696,
      "learning_rate": 1.4356240410665433e-05,
      "loss": 0.6946,
      "step": 2500
    },
    {
      "epoch": 8.462837837837839,
      "grad_norm": 0.3322451710700989,
      "learning_rate": 1.4053329888133238e-05,
      "loss": 0.683,
      "step": 2505
    },
    {
      "epoch": 8.47972972972973,
      "grad_norm": 0.3628733456134796,
      "learning_rate": 1.3753407493001968e-05,
      "loss": 0.6824,
      "step": 2510
    },
    {
      "epoch": 8.496621621621621,
      "grad_norm": 0.3203790783882141,
      "learning_rate": 1.3456483652741591e-05,
      "loss": 0.6843,
      "step": 2515
    },
    {
      "epoch": 8.513513513513514,
      "grad_norm": 0.3382638096809387,
      "learning_rate": 1.3162568690570743e-05,
      "loss": 0.6882,
      "step": 2520
    },
    {
      "epoch": 8.530405405405405,
      "grad_norm": 0.34006133675575256,
      "learning_rate": 1.287167282509767e-05,
      "loss": 0.6781,
      "step": 2525
    },
    {
      "epoch": 8.547297297297296,
      "grad_norm": 0.33302438259124756,
      "learning_rate": 1.2583806169964961e-05,
      "loss": 0.6818,
      "step": 2530
    },
    {
      "epoch": 8.56418918918919,
      "grad_norm": 0.35714635252952576,
      "learning_rate": 1.2298978733498035e-05,
      "loss": 0.6903,
      "step": 2535
    },
    {
      "epoch": 8.58108108108108,
      "grad_norm": 0.34445202350616455,
      "learning_rate": 1.2017200418357078e-05,
      "loss": 0.6884,
      "step": 2540
    },
    {
      "epoch": 8.597972972972974,
      "grad_norm": 0.35791710019111633,
      "learning_rate": 1.1738481021192704e-05,
      "loss": 0.6805,
      "step": 2545
    },
    {
      "epoch": 8.614864864864865,
      "grad_norm": 0.4606862962245941,
      "learning_rate": 1.14628302323056e-05,
      "loss": 0.6833,
      "step": 2550
    },
    {
      "epoch": 8.631756756756756,
      "grad_norm": 0.3396778702735901,
      "learning_rate": 1.1190257635309275e-05,
      "loss": 0.6788,
      "step": 2555
    },
    {
      "epoch": 8.64864864864865,
      "grad_norm": 0.3137703537940979,
      "learning_rate": 1.0920772706797167e-05,
      "loss": 0.6778,
      "step": 2560
    },
    {
      "epoch": 8.66554054054054,
      "grad_norm": 0.3266281187534332,
      "learning_rate": 1.0654384816012953e-05,
      "loss": 0.6928,
      "step": 2565
    },
    {
      "epoch": 8.682432432432432,
      "grad_norm": 0.33806994557380676,
      "learning_rate": 1.0391103224524956e-05,
      "loss": 0.694,
      "step": 2570
    },
    {
      "epoch": 8.699324324324325,
      "grad_norm": 0.3242711126804352,
      "learning_rate": 1.013093708590408e-05,
      "loss": 0.6769,
      "step": 2575
    },
    {
      "epoch": 8.716216216216216,
      "grad_norm": 0.3551606833934784,
      "learning_rate": 9.873895445405523e-06,
      "loss": 0.6824,
      "step": 2580
    },
    {
      "epoch": 8.733108108108109,
      "grad_norm": 0.34394511580467224,
      "learning_rate": 9.619987239654405e-06,
      "loss": 0.681,
      "step": 2585
    },
    {
      "epoch": 8.75,
      "grad_norm": 0.35514023900032043,
      "learning_rate": 9.369221296335006e-06,
      "loss": 0.6908,
      "step": 2590
    },
    {
      "epoch": 8.766891891891891,
      "grad_norm": 0.31281572580337524,
      "learning_rate": 9.121606333883792e-06,
      "loss": 0.6881,
      "step": 2595
    },
    {
      "epoch": 8.783783783783784,
      "grad_norm": 0.3141974210739136,
      "learning_rate": 8.87715096118642e-06,
      "loss": 0.6797,
      "step": 2600
    },
    {
      "epoch": 8.800675675675675,
      "grad_norm": 0.3446739912033081,
      "learning_rate": 8.635863677278378e-06,
      "loss": 0.6862,
      "step": 2605
    },
    {
      "epoch": 8.817567567567568,
      "grad_norm": 0.3194230794906616,
      "learning_rate": 8.397752871049436e-06,
      "loss": 0.6764,
      "step": 2610
    },
    {
      "epoch": 8.83445945945946,
      "grad_norm": 0.3229275047779083,
      "learning_rate": 8.162826820952097e-06,
      "loss": 0.6868,
      "step": 2615
    },
    {
      "epoch": 8.85135135135135,
      "grad_norm": 0.3260205388069153,
      "learning_rate": 7.931093694713687e-06,
      "loss": 0.6917,
      "step": 2620
    },
    {
      "epoch": 8.868243243243244,
      "grad_norm": 0.3324912190437317,
      "learning_rate": 7.702561549052445e-06,
      "loss": 0.6748,
      "step": 2625
    },
    {
      "epoch": 8.885135135135135,
      "grad_norm": 0.3662506937980652,
      "learning_rate": 7.477238329397418e-06,
      "loss": 0.6918,
      "step": 2630
    },
    {
      "epoch": 8.902027027027026,
      "grad_norm": 0.3210934102535248,
      "learning_rate": 7.255131869612108e-06,
      "loss": 0.694,
      "step": 2635
    },
    {
      "epoch": 8.91891891891892,
      "grad_norm": 0.35362377762794495,
      "learning_rate": 7.03624989172228e-06,
      "loss": 0.678,
      "step": 2640
    },
    {
      "epoch": 8.93581081081081,
      "grad_norm": 0.3684268295764923,
      "learning_rate": 6.820600005647382e-06,
      "loss": 0.6913,
      "step": 2645
    },
    {
      "epoch": 8.952702702702704,
      "grad_norm": 0.3233438730239868,
      "learning_rate": 6.608189708935964e-06,
      "loss": 0.6818,
      "step": 2650
    },
    {
      "epoch": 8.969594594594595,
      "grad_norm": 0.3141653537750244,
      "learning_rate": 6.3990263865050695e-06,
      "loss": 0.6843,
      "step": 2655
    },
    {
      "epoch": 8.986486486486486,
      "grad_norm": 0.32477766275405884,
      "learning_rate": 6.1931173103834115e-06,
      "loss": 0.6916,
      "step": 2660
    },
    {
      "epoch": 9.0,
      "eval_loss": 2.91719913482666,
      "eval_runtime": 0.3931,
      "eval_samples_per_second": 5.088,
      "eval_steps_per_second": 2.544,
      "step": 2664
    },
    {
      "epoch": 9.003378378378379,
      "grad_norm": 0.26982101798057556,
      "learning_rate": 5.9904696394586405e-06,
      "loss": 0.6797,
      "step": 2665
    },
    {
      "epoch": 9.02027027027027,
      "grad_norm": 0.25923973321914673,
      "learning_rate": 5.791090419228351e-06,
      "loss": 0.6622,
      "step": 2670
    },
    {
      "epoch": 9.037162162162161,
      "grad_norm": 0.2826623022556305,
      "learning_rate": 5.594986581555173e-06,
      "loss": 0.6712,
      "step": 2675
    },
    {
      "epoch": 9.054054054054054,
      "grad_norm": 0.3254013657569885,
      "learning_rate": 5.402164944425758e-06,
      "loss": 0.6644,
      "step": 2680
    },
    {
      "epoch": 9.070945945945946,
      "grad_norm": 0.3010416626930237,
      "learning_rate": 5.212632211713797e-06,
      "loss": 0.6741,
      "step": 2685
    },
    {
      "epoch": 9.087837837837839,
      "grad_norm": 0.36080434918403625,
      "learning_rate": 5.026394972946813e-06,
      "loss": 0.6675,
      "step": 2690
    },
    {
      "epoch": 9.10472972972973,
      "grad_norm": 0.2993578314781189,
      "learning_rate": 4.843459703077202e-06,
      "loss": 0.6798,
      "step": 2695
    },
    {
      "epoch": 9.121621621621621,
      "grad_norm": 0.3179502785205841,
      "learning_rate": 4.66383276225707e-06,
      "loss": 0.6769,
      "step": 2700
    },
    {
      "epoch": 9.138513513513514,
      "grad_norm": 0.29940682649612427,
      "learning_rate": 4.487520395617029e-06,
      "loss": 0.6624,
      "step": 2705
    },
    {
      "epoch": 9.155405405405405,
      "grad_norm": 0.340571790933609,
      "learning_rate": 4.314528733049206e-06,
      "loss": 0.6626,
      "step": 2710
    },
    {
      "epoch": 9.172297297297296,
      "grad_norm": 0.3252014219760895,
      "learning_rate": 4.144863788993991e-06,
      "loss": 0.6798,
      "step": 2715
    },
    {
      "epoch": 9.18918918918919,
      "grad_norm": 0.3229629397392273,
      "learning_rate": 3.9785314622310495e-06,
      "loss": 0.675,
      "step": 2720
    },
    {
      "epoch": 9.20608108108108,
      "grad_norm": 0.2807313799858093,
      "learning_rate": 3.815537535674174e-06,
      "loss": 0.6765,
      "step": 2725
    },
    {
      "epoch": 9.222972972972974,
      "grad_norm": 0.2970227599143982,
      "learning_rate": 3.655887676170222e-06,
      "loss": 0.6678,
      "step": 2730
    },
    {
      "epoch": 9.239864864864865,
      "grad_norm": 0.3028378486633301,
      "learning_rate": 3.4995874343021094e-06,
      "loss": 0.6728,
      "step": 2735
    },
    {
      "epoch": 9.256756756756756,
      "grad_norm": 0.31875497102737427,
      "learning_rate": 3.3466422441958634e-06,
      "loss": 0.6761,
      "step": 2740
    },
    {
      "epoch": 9.27364864864865,
      "grad_norm": 0.319791316986084,
      "learning_rate": 3.1970574233316397e-06,
      "loss": 0.6623,
      "step": 2745
    },
    {
      "epoch": 9.29054054054054,
      "grad_norm": 0.2787954807281494,
      "learning_rate": 3.050838172358883e-06,
      "loss": 0.6679,
      "step": 2750
    },
    {
      "epoch": 9.307432432432432,
      "grad_norm": 0.3424752950668335,
      "learning_rate": 2.9079895749154927e-06,
      "loss": 0.6626,
      "step": 2755
    },
    {
      "epoch": 9.324324324324325,
      "grad_norm": 0.302796334028244,
      "learning_rate": 2.7685165974510986e-06,
      "loss": 0.6721,
      "step": 2760
    },
    {
      "epoch": 9.341216216216216,
      "grad_norm": 0.29907652735710144,
      "learning_rate": 2.6324240890544193e-06,
      "loss": 0.6696,
      "step": 2765
    },
    {
      "epoch": 9.358108108108109,
      "grad_norm": 0.31798794865608215,
      "learning_rate": 2.499716781284556e-06,
      "loss": 0.6705,
      "step": 2770
    },
    {
      "epoch": 9.375,
      "grad_norm": 0.3437163829803467,
      "learning_rate": 2.3703992880066638e-06,
      "loss": 0.6726,
      "step": 2775
    },
    {
      "epoch": 9.391891891891891,
      "grad_norm": 0.2871228754520416,
      "learning_rate": 2.2444761052313856e-06,
      "loss": 0.6715,
      "step": 2780
    },
    {
      "epoch": 9.408783783783784,
      "grad_norm": 0.2895369827747345,
      "learning_rate": 2.1219516109586056e-06,
      "loss": 0.6812,
      "step": 2785
    },
    {
      "epoch": 9.425675675675675,
      "grad_norm": 0.29224950075149536,
      "learning_rate": 2.002830065025263e-06,
      "loss": 0.6788,
      "step": 2790
    },
    {
      "epoch": 9.442567567567568,
      "grad_norm": 0.32183554768562317,
      "learning_rate": 1.8871156089572018e-06,
      "loss": 0.6731,
      "step": 2795
    },
    {
      "epoch": 9.45945945945946,
      "grad_norm": 0.30122798681259155,
      "learning_rate": 1.7748122658251876e-06,
      "loss": 0.6724,
      "step": 2800
    },
    {
      "epoch": 9.47635135135135,
      "grad_norm": 0.3141665458679199,
      "learning_rate": 1.665923940105074e-06,
      "loss": 0.6725,
      "step": 2805
    },
    {
      "epoch": 9.493243243243244,
      "grad_norm": 0.3088280260562897,
      "learning_rate": 1.56045441754199e-06,
      "loss": 0.6748,
      "step": 2810
    },
    {
      "epoch": 9.510135135135135,
      "grad_norm": 0.2941664457321167,
      "learning_rate": 1.4584073650187878e-06,
      "loss": 0.6656,
      "step": 2815
    },
    {
      "epoch": 9.527027027027026,
      "grad_norm": 0.3141193687915802,
      "learning_rate": 1.3597863304285475e-06,
      "loss": 0.6732,
      "step": 2820
    },
    {
      "epoch": 9.54391891891892,
      "grad_norm": 0.29874399304389954,
      "learning_rate": 1.2645947425511395e-06,
      "loss": 0.6749,
      "step": 2825
    },
    {
      "epoch": 9.56081081081081,
      "grad_norm": 0.2963665723800659,
      "learning_rate": 1.1728359109341446e-06,
      "loss": 0.6737,
      "step": 2830
    },
    {
      "epoch": 9.577702702702704,
      "grad_norm": 0.2843508720397949,
      "learning_rate": 1.0845130257777114e-06,
      "loss": 0.6758,
      "step": 2835
    },
    {
      "epoch": 9.594594594594595,
      "grad_norm": 0.3049289882183075,
      "learning_rate": 9.996291578236228e-07,
      "loss": 0.6771,
      "step": 2840
    },
    {
      "epoch": 9.611486486486486,
      "grad_norm": 0.2939779460430145,
      "learning_rate": 9.18187258248604e-07,
      "loss": 0.6655,
      "step": 2845
    },
    {
      "epoch": 9.628378378378379,
      "grad_norm": 0.2909776270389557,
      "learning_rate": 8.401901585616823e-07,
      "loss": 0.6745,
      "step": 2850
    },
    {
      "epoch": 9.64527027027027,
      "grad_norm": 0.3179049491882324,
      "learning_rate": 7.656405705057435e-07,
      "loss": 0.6621,
      "step": 2855
    },
    {
      "epoch": 9.662162162162161,
      "grad_norm": 0.31932151317596436,
      "learning_rate": 6.945410859632295e-07,
      "loss": 0.6615,
      "step": 2860
    },
    {
      "epoch": 9.679054054054054,
      "grad_norm": 0.33364373445510864,
      "learning_rate": 6.268941768660886e-07,
      "loss": 0.6752,
      "step": 2865
    },
    {
      "epoch": 9.695945945945946,
      "grad_norm": 0.3177048861980438,
      "learning_rate": 5.627021951097545e-07,
      "loss": 0.6793,
      "step": 2870
    },
    {
      "epoch": 9.712837837837839,
      "grad_norm": 0.3155570328235626,
      "learning_rate": 5.019673724714458e-07,
      "loss": 0.6557,
      "step": 2875
    },
    {
      "epoch": 9.72972972972973,
      "grad_norm": 0.2894674241542816,
      "learning_rate": 4.44691820532539e-07,
      "loss": 0.6679,
      "step": 2880
    },
    {
      "epoch": 9.746621621621621,
      "grad_norm": 0.2958497405052185,
      "learning_rate": 3.908775306051604e-07,
      "loss": 0.667,
      "step": 2885
    },
    {
      "epoch": 9.763513513513514,
      "grad_norm": 0.30099859833717346,
      "learning_rate": 3.405263736629416e-07,
      "loss": 0.6828,
      "step": 2890
    },
    {
      "epoch": 9.780405405405405,
      "grad_norm": 0.28636157512664795,
      "learning_rate": 2.9364010027599364e-07,
      "loss": 0.6692,
      "step": 2895
    },
    {
      "epoch": 9.797297297297296,
      "grad_norm": 0.3025320768356323,
      "learning_rate": 2.5022034055003364e-07,
      "loss": 0.6735,
      "step": 2900
    },
    {
      "epoch": 9.81418918918919,
      "grad_norm": 0.3249851167201996,
      "learning_rate": 2.1026860406970772e-07,
      "loss": 0.6678,
      "step": 2905
    },
    {
      "epoch": 9.83108108108108,
      "grad_norm": 0.29301130771636963,
      "learning_rate": 1.7378627984612207e-07,
      "loss": 0.6773,
      "step": 2910
    },
    {
      "epoch": 9.847972972972974,
      "grad_norm": 0.3139761686325073,
      "learning_rate": 1.4077463626852582e-07,
      "loss": 0.6585,
      "step": 2915
    },
    {
      "epoch": 9.864864864864865,
      "grad_norm": 0.2988782227039337,
      "learning_rate": 1.1123482106021322e-07,
      "loss": 0.6832,
      "step": 2920
    },
    {
      "epoch": 9.881756756756756,
      "grad_norm": 0.34064996242523193,
      "learning_rate": 8.516786123867748e-08,
      "loss": 0.6655,
      "step": 2925
    },
    {
      "epoch": 9.89864864864865,
      "grad_norm": 0.31197425723075867,
      "learning_rate": 6.25746630798063e-08,
      "loss": 0.666,
      "step": 2930
    },
    {
      "epoch": 9.91554054054054,
      "grad_norm": 0.3064212203025818,
      "learning_rate": 4.3456012086462436e-08,
      "loss": 0.6671,
      "step": 2935
    },
    {
      "epoch": 9.932432432432432,
      "grad_norm": 0.3351791501045227,
      "learning_rate": 2.7812572961127824e-08,
      "loss": 0.6746,
      "step": 2940
    },
    {
      "epoch": 9.949324324324325,
      "grad_norm": 0.2936666011810303,
      "learning_rate": 1.564488958279986e-08,
      "loss": 0.6647,
      "step": 2945
    },
    {
      "epoch": 9.966216216216216,
      "grad_norm": 0.29400986433029175,
      "learning_rate": 6.953384988095391e-09,
      "loss": 0.6718,
      "step": 2950
    },
    {
      "epoch": 9.983108108108109,
      "grad_norm": 0.32395681738853455,
      "learning_rate": 1.7383613565291612e-09,
      "loss": 0.6809,
      "step": 2955
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.2938132882118225,
      "learning_rate": 0.0,
      "loss": 0.6652,
      "step": 2960
    },
    {
      "epoch": 10.0,
      "eval_loss": 3.0329792499542236,
      "eval_runtime": 0.4325,
      "eval_samples_per_second": 4.624,
      "eval_steps_per_second": 2.312,
      "step": 2960
    },
    {
      "epoch": 10.0,
      "step": 2960,
      "total_flos": 4.416382035459834e+18,
      "train_loss": 0.922980490487975,
      "train_runtime": 12382.7598,
      "train_samples_per_second": 7.645,
      "train_steps_per_second": 0.239
    }
  ],
  "logging_steps": 5,
  "max_steps": 2960,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.416382035459834e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}