{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9981298423724285, "eval_steps": 250, "global_step": 467, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021373230029388193, "grad_norm": 0.0, "learning_rate": 0.0, "logits": -2.9837684631347656, "logps": -115.88082885742188, "loss": 0.6931, "step": 1 }, { "epoch": 0.010686615014694095, "grad_norm": 8.930464805300087, "learning_rate": 2.127659574468085e-08, "logits": -3.0210494995117188, "logps": -248.44415283203125, "loss": 0.6931, "step": 5 }, { "epoch": 0.02137323002938819, "grad_norm": 8.432519303571977, "learning_rate": 7.446808510638298e-08, "logits": -3.02193021774292, "logps": -275.5105895996094, "loss": 0.6931, "step": 10 }, { "epoch": 0.03205984504408229, "grad_norm": 7.7153009709054565, "learning_rate": 1.1702127659574468e-07, "logits": -2.9250597953796387, "logps": -249.81982421875, "loss": 0.6931, "step": 15 }, { "epoch": 0.04274646005877638, "grad_norm": 7.572152443988097, "learning_rate": 1.702127659574468e-07, "logits": -2.9345571994781494, "logps": -223.3355255126953, "loss": 0.6929, "step": 20 }, { "epoch": 0.053433075073470476, "grad_norm": 7.884724670444082, "learning_rate": 2.2340425531914892e-07, "logits": -3.0166373252868652, "logps": -312.7939147949219, "loss": 0.6923, "step": 25 }, { "epoch": 0.06411969008816458, "grad_norm": 7.729653627105848, "learning_rate": 2.659574468085106e-07, "logits": -3.0231285095214844, "logps": -306.21307373046875, "loss": 0.6919, "step": 30 }, { "epoch": 0.07480630510285867, "grad_norm": 7.98852027810727, "learning_rate": 3.1914893617021275e-07, "logits": -2.9057810306549072, "logps": -247.4704132080078, "loss": 0.69, "step": 35 }, { "epoch": 0.08549292011755276, "grad_norm": 8.191251952750846, "learning_rate": 3.7234042553191484e-07, "logits": -2.913228988647461, "logps": -280.84930419921875, "loss": 0.688, "step": 40 }, { "epoch": 0.09617953513224686, "grad_norm": 8.378312950849653, "learning_rate": 4.25531914893617e-07, "logits": -2.826091766357422, "logps": -269.6914978027344, "loss": 0.688, "step": 45 }, { "epoch": 0.10686615014694095, "grad_norm": 7.92057505263041, "learning_rate": 4.787234042553192e-07, "logits": -2.6941583156585693, "logps": -309.75946044921875, "loss": 0.6851, "step": 50 }, { "epoch": 0.11755276516163506, "grad_norm": 7.937326122203284, "learning_rate": 4.999370587356267e-07, "logits": -2.85343861579895, "logps": -331.48486328125, "loss": 0.6826, "step": 55 }, { "epoch": 0.12823938017632916, "grad_norm": 7.600359120044827, "learning_rate": 4.995525324419337e-07, "logits": -2.7965731620788574, "logps": -286.6440734863281, "loss": 0.6845, "step": 60 }, { "epoch": 0.13892599519102325, "grad_norm": 7.5593698484934855, "learning_rate": 4.988189843662815e-07, "logits": -2.7934298515319824, "logps": -320.76580810546875, "loss": 0.681, "step": 65 }, { "epoch": 0.14961261020571734, "grad_norm": 7.521407260182464, "learning_rate": 4.977374404419837e-07, "logits": -2.6327431201934814, "logps": -297.3265075683594, "loss": 0.6797, "step": 70 }, { "epoch": 0.16029922522041143, "grad_norm": 8.344106265438244, "learning_rate": 4.963094133060148e-07, "logits": -2.5252535343170166, "logps": -288.6630859375, "loss": 0.6768, "step": 75 }, { "epoch": 0.17098584023510552, "grad_norm": 9.03033660276784, "learning_rate": 4.945369001834514e-07, "logits": -2.745051383972168, "logps": -342.77850341796875, "loss": 0.6856, "step": 80 }, { "epoch": 0.18167245524979964, "grad_norm": 8.441584179661763, "learning_rate": 4.924223800941717e-07, "logits": -2.5587422847747803, "logps": -329.79608154296875, "loss": 0.6769, "step": 85 }, { "epoch": 0.19235907026449373, "grad_norm": 13.039024055196084, "learning_rate": 4.899688103857222e-07, "logits": -2.7360036373138428, "logps": -246.222900390625, "loss": 0.6743, "step": 90 }, { "epoch": 0.20304568527918782, "grad_norm": 7.911130981674664, "learning_rate": 4.871796225971999e-07, "logits": -2.5480384826660156, "logps": -324.2531433105469, "loss": 0.678, "step": 95 }, { "epoch": 0.2137323002938819, "grad_norm": 8.032036275865368, "learning_rate": 4.840587176599343e-07, "logits": -2.576904773712158, "logps": -264.0362854003906, "loss": 0.6739, "step": 100 }, { "epoch": 0.224418915308576, "grad_norm": 8.305899304648538, "learning_rate": 4.806104604416823e-07, "logits": -2.6711044311523438, "logps": -349.4509582519531, "loss": 0.6773, "step": 105 }, { "epoch": 0.2351055303232701, "grad_norm": 7.324269587605388, "learning_rate": 4.768396736419662e-07, "logits": -2.653310537338257, "logps": -385.108642578125, "loss": 0.6636, "step": 110 }, { "epoch": 0.2457921453379642, "grad_norm": 8.12305136852524, "learning_rate": 4.7275163104709194e-07, "logits": -2.6198906898498535, "logps": -331.40966796875, "loss": 0.6708, "step": 115 }, { "epoch": 0.2564787603526583, "grad_norm": 6.905493593882431, "learning_rate": 4.683520501542824e-07, "logits": -2.5891175270080566, "logps": -303.38238525390625, "loss": 0.677, "step": 120 }, { "epoch": 0.2671653753673524, "grad_norm": 8.139183964012535, "learning_rate": 4.636470841752404e-07, "logits": -2.561048746109009, "logps": -336.3091125488281, "loss": 0.6752, "step": 125 }, { "epoch": 0.2778519903820465, "grad_norm": 9.432265788996354, "learning_rate": 4.5864331343032565e-07, "logits": -2.6842479705810547, "logps": -333.8880920410156, "loss": 0.6845, "step": 130 }, { "epoch": 0.2885386053967406, "grad_norm": 9.225837503684264, "learning_rate": 4.533477361453819e-07, "logits": -2.682619571685791, "logps": -268.5357971191406, "loss": 0.6731, "step": 135 }, { "epoch": 0.2992252204114347, "grad_norm": 8.537025837100762, "learning_rate": 4.4776775866408533e-07, "logits": -2.633347988128662, "logps": -332.82281494140625, "loss": 0.677, "step": 140 }, { "epoch": 0.30991183542612877, "grad_norm": 10.164838947169544, "learning_rate": 4.4310423980614986e-07, "logits": -2.486520290374756, "logps": -341.2948303222656, "loss": 0.6868, "step": 145 }, { "epoch": 0.32059845044082286, "grad_norm": 8.40914055018219, "learning_rate": 4.370322686513817e-07, "logits": -2.601824998855591, "logps": -305.33953857421875, "loss": 0.6778, "step": 150 }, { "epoch": 0.33128506545551695, "grad_norm": 10.386990038499732, "learning_rate": 4.3069871595684787e-07, "logits": -2.670980453491211, "logps": -345.46063232421875, "loss": 0.6651, "step": 155 }, { "epoch": 0.34197168047021104, "grad_norm": 9.195750501852501, "learning_rate": 4.2411243976869173e-07, "logits": -2.6275370121002197, "logps": -323.1390686035156, "loss": 0.7026, "step": 160 }, { "epoch": 0.3526582954849052, "grad_norm": 8.823892064639073, "learning_rate": 4.172826515897145e-07, "logits": -2.673668146133423, "logps": -332.9339599609375, "loss": 0.6733, "step": 165 }, { "epoch": 0.36334491049959927, "grad_norm": 8.908227684061753, "learning_rate": 4.10218903496256e-07, "logits": -2.6081340312957764, "logps": -293.17620849609375, "loss": 0.6728, "step": 170 }, { "epoch": 0.37403152551429336, "grad_norm": 12.733930614190042, "learning_rate": 4.0293107477875156e-07, "logits": -2.685481309890747, "logps": -310.6298828125, "loss": 0.6754, "step": 175 }, { "epoch": 0.38471814052898745, "grad_norm": 7.791125734450306, "learning_rate": 3.954293581246514e-07, "logits": -2.6595892906188965, "logps": -302.4372863769531, "loss": 0.671, "step": 180 }, { "epoch": 0.39540475554368154, "grad_norm": 9.66885260759438, "learning_rate": 3.877242453630256e-07, "logits": -2.6837239265441895, "logps": -266.3741760253906, "loss": 0.678, "step": 185 }, { "epoch": 0.40609137055837563, "grad_norm": 9.607285136212285, "learning_rate": 3.7982651279079227e-07, "logits": -2.8104248046875, "logps": -334.6470031738281, "loss": 0.6632, "step": 190 }, { "epoch": 0.4167779855730697, "grad_norm": 15.100071189486846, "learning_rate": 3.717472061010918e-07, "logits": -2.7275915145874023, "logps": -393.0242614746094, "loss": 0.6711, "step": 195 }, { "epoch": 0.4274646005877638, "grad_norm": 8.499488078580452, "learning_rate": 3.634976249348867e-07, "logits": -2.664275646209717, "logps": -266.42816162109375, "loss": 0.6614, "step": 200 }, { "epoch": 0.4381512156024579, "grad_norm": 9.516842385793971, "learning_rate": 3.550893070773914e-07, "logits": -2.6723501682281494, "logps": -310.09271240234375, "loss": 0.6788, "step": 205 }, { "epoch": 0.448837830617152, "grad_norm": 8.325313537261211, "learning_rate": 3.465340123214365e-07, "logits": -2.6248607635498047, "logps": -324.2633972167969, "loss": 0.6701, "step": 210 }, { "epoch": 0.45952444563184613, "grad_norm": 8.428994519322705, "learning_rate": 3.378437060203357e-07, "logits": -2.538236141204834, "logps": -325.068359375, "loss": 0.6721, "step": 215 }, { "epoch": 0.4702110606465402, "grad_norm": 11.389237476698737, "learning_rate": 3.2903054235325613e-07, "logits": -2.498873472213745, "logps": -243.1177520751953, "loss": 0.6709, "step": 220 }, { "epoch": 0.4808976756612343, "grad_norm": 9.131041826797441, "learning_rate": 3.201068473265007e-07, "logits": -2.6873362064361572, "logps": -343.89166259765625, "loss": 0.6663, "step": 225 }, { "epoch": 0.4915842906759284, "grad_norm": 9.723212268353656, "learning_rate": 3.110851015344735e-07, "logits": -2.7327117919921875, "logps": -379.72003173828125, "loss": 0.6691, "step": 230 }, { "epoch": 0.5022709056906225, "grad_norm": 9.379555208168856, "learning_rate": 3.0197792270443976e-07, "logits": -2.6129934787750244, "logps": -284.4798583984375, "loss": 0.6758, "step": 235 }, { "epoch": 0.5129575207053166, "grad_norm": 9.664766735366735, "learning_rate": 2.927980480494938e-07, "logits": -2.6431000232696533, "logps": -367.9956359863281, "loss": 0.675, "step": 240 }, { "epoch": 0.5236441357200107, "grad_norm": 10.475237985525043, "learning_rate": 2.8355831645441387e-07, "logits": -2.670294761657715, "logps": -308.0411682128906, "loss": 0.6628, "step": 245 }, { "epoch": 0.5343307507347048, "grad_norm": 11.781792249315561, "learning_rate": 2.74271650519322e-07, "logits": -2.6975350379943848, "logps": -301.78369140625, "loss": 0.6722, "step": 250 }, { "epoch": 0.5343307507347048, "eval_logits": -2.6895368099212646, "eval_logps": -337.5862121582031, "eval_loss": 0.6677098870277405, "eval_runtime": 152.1721, "eval_samples_per_second": 12.933, "eval_steps_per_second": 0.808, "step": 250 }, { "epoch": 0.5450173657493989, "grad_norm": 10.564914594618568, "learning_rate": 2.6495103848625854e-07, "logits": -2.5754435062408447, "logps": -323.0116271972656, "loss": 0.6674, "step": 255 }, { "epoch": 0.555703980764093, "grad_norm": 9.246806479887502, "learning_rate": 2.5560951607395126e-07, "logits": -2.5769031047821045, "logps": -314.10772705078125, "loss": 0.6585, "step": 260 }, { "epoch": 0.566390595778787, "grad_norm": 9.208791503556593, "learning_rate": 2.4626014824618413e-07, "logits": -2.6613810062408447, "logps": -321.3485412597656, "loss": 0.676, "step": 265 }, { "epoch": 0.5770772107934812, "grad_norm": 11.748525348382621, "learning_rate": 2.3691601093926402e-07, "logits": -2.3850035667419434, "logps": -328.6526794433594, "loss": 0.6642, "step": 270 }, { "epoch": 0.5877638258081752, "grad_norm": 9.261892756803006, "learning_rate": 2.2759017277414164e-07, "logits": -2.651516914367676, "logps": -333.8236083984375, "loss": 0.6673, "step": 275 }, { "epoch": 0.5984504408228694, "grad_norm": 8.534866244469283, "learning_rate": 2.1829567677876297e-07, "logits": -2.4086098670959473, "logps": -329.608154296875, "loss": 0.6682, "step": 280 }, { "epoch": 0.6091370558375635, "grad_norm": 10.317866518950332, "learning_rate": 2.0904552214621556e-07, "logits": -2.4922032356262207, "logps": -267.16131591796875, "loss": 0.6687, "step": 285 }, { "epoch": 0.6198236708522575, "grad_norm": 8.474745725279664, "learning_rate": 1.998526460541818e-07, "logits": -2.5886034965515137, "logps": -330.1689453125, "loss": 0.659, "step": 290 }, { "epoch": 0.6305102858669517, "grad_norm": 9.41712854760729, "learning_rate": 1.9072990557112564e-07, "logits": -2.501155138015747, "logps": -281.8408203125, "loss": 0.6736, "step": 295 }, { "epoch": 0.6411969008816457, "grad_norm": 8.663658343885174, "learning_rate": 1.8169005967452e-07, "logits": -2.537830352783203, "logps": -292.0025634765625, "loss": 0.6582, "step": 300 }, { "epoch": 0.6518835158963399, "grad_norm": 9.426928387053838, "learning_rate": 1.7274575140626315e-07, "logits": -2.686964511871338, "logps": -318.72418212890625, "loss": 0.6638, "step": 305 }, { "epoch": 0.6625701309110339, "grad_norm": 11.485762996913175, "learning_rate": 1.6390949019024118e-07, "logits": -2.495753049850464, "logps": -287.70172119140625, "loss": 0.666, "step": 310 }, { "epoch": 0.673256745925728, "grad_norm": 9.088284794504979, "learning_rate": 1.5519363433676791e-07, "logits": -2.664257764816284, "logps": -350.0900573730469, "loss": 0.6556, "step": 315 }, { "epoch": 0.6839433609404221, "grad_norm": 9.483810437875452, "learning_rate": 1.4661037375836987e-07, "logits": -2.5566890239715576, "logps": -316.168212890625, "loss": 0.6783, "step": 320 }, { "epoch": 0.6946299759551162, "grad_norm": 10.184481775084981, "learning_rate": 1.381717129210918e-07, "logits": -2.7735507488250732, "logps": -386.7801208496094, "loss": 0.6583, "step": 325 }, { "epoch": 0.7053165909698104, "grad_norm": 10.117725271674823, "learning_rate": 1.2988945405516565e-07, "logits": -2.6583659648895264, "logps": -343.6266784667969, "loss": 0.6671, "step": 330 }, { "epoch": 0.7160032059845044, "grad_norm": 8.87252387776569, "learning_rate": 1.2177518064852348e-07, "logits": -2.5552642345428467, "logps": -395.2294616699219, "loss": 0.6579, "step": 335 }, { "epoch": 0.7266898209991985, "grad_norm": 10.850399193023186, "learning_rate": 1.1384024124624322e-07, "logits": -2.634152412414551, "logps": -305.1641540527344, "loss": 0.6651, "step": 340 }, { "epoch": 0.7373764360138926, "grad_norm": 9.124407290392456, "learning_rate": 1.0609573357858165e-07, "logits": -2.5021727085113525, "logps": -333.5446472167969, "loss": 0.6588, "step": 345 }, { "epoch": 0.7480630510285867, "grad_norm": 11.09237635980714, "learning_rate": 9.855248903979505e-08, "logits": -2.5584359169006348, "logps": -283.8270568847656, "loss": 0.6628, "step": 350 }, { "epoch": 0.7587496660432808, "grad_norm": 9.722731688636316, "learning_rate": 9.12210575394553e-08, "logits": -2.6663193702697754, "logps": -297.99713134765625, "loss": 0.6759, "step": 355 }, { "epoch": 0.7694362810579749, "grad_norm": 10.127170920190936, "learning_rate": 8.411169274744723e-08, "logits": -2.684257745742798, "logps": -356.31353759765625, "loss": 0.6568, "step": 360 }, { "epoch": 0.7801228960726689, "grad_norm": 9.798211994559338, "learning_rate": 7.723433775328384e-08, "logits": -2.64136004447937, "logps": -322.2811279296875, "loss": 0.6467, "step": 365 }, { "epoch": 0.7908095110873631, "grad_norm": 9.946974721999554, "learning_rate": 7.059861115979701e-08, "logits": -2.452336311340332, "logps": -299.15093994140625, "loss": 0.6614, "step": 370 }, { "epoch": 0.8014961261020572, "grad_norm": 9.584584883376387, "learning_rate": 6.42137936306514e-08, "logits": -2.566228151321411, "logps": -320.6563415527344, "loss": 0.6697, "step": 375 }, { "epoch": 0.8121827411167513, "grad_norm": 9.727606468468506, "learning_rate": 5.808881491049722e-08, "logits": -2.479376792907715, "logps": -332.4024658203125, "loss": 0.6573, "step": 380 }, { "epoch": 0.8228693561314454, "grad_norm": 9.526007033344321, "learning_rate": 5.223224133591475e-08, "logits": -2.4227657318115234, "logps": -306.6590270996094, "loss": 0.6617, "step": 385 }, { "epoch": 0.8335559711461394, "grad_norm": 9.488572830582953, "learning_rate": 4.6652263854618016e-08, "logits": -2.566108226776123, "logps": -332.1501770019531, "loss": 0.661, "step": 390 }, { "epoch": 0.8442425861608336, "grad_norm": 10.051151018954556, "learning_rate": 4.1356686569674335e-08, "logits": -2.591771125793457, "logps": -274.6145324707031, "loss": 0.6534, "step": 395 }, { "epoch": 0.8549292011755276, "grad_norm": 9.814390347137238, "learning_rate": 3.635291582475963e-08, "logits": -2.6314148902893066, "logps": -332.80999755859375, "loss": 0.6655, "step": 400 }, { "epoch": 0.8656158161902218, "grad_norm": 9.553236466348144, "learning_rate": 3.1647949845717585e-08, "logits": -2.5789246559143066, "logps": -342.28094482421875, "loss": 0.6595, "step": 405 }, { "epoch": 0.8763024312049158, "grad_norm": 9.981545726752802, "learning_rate": 2.724836895290805e-08, "logits": -2.460716724395752, "logps": -329.7812805175781, "loss": 0.6672, "step": 410 }, { "epoch": 0.88698904621961, "grad_norm": 9.457968496568347, "learning_rate": 2.3160326358033778e-08, "logits": -2.5351109504699707, "logps": -276.1766662597656, "loss": 0.653, "step": 415 }, { "epoch": 0.897675661234304, "grad_norm": 10.067353183112814, "learning_rate": 2.0118056862137354e-08, "logits": -2.698438882827759, "logps": -332.7668762207031, "loss": 0.6619, "step": 420 }, { "epoch": 0.9083622762489981, "grad_norm": 9.069273483908287, "learning_rate": 1.6604893375699592e-08, "logits": -2.516479253768921, "logps": -315.84356689453125, "loss": 0.6641, "step": 425 }, { "epoch": 0.9190488912636923, "grad_norm": 8.858983110483033, "learning_rate": 1.3418154050208936e-08, "logits": -2.648934841156006, "logps": -333.3450622558594, "loss": 0.6491, "step": 430 }, { "epoch": 0.9297355062783863, "grad_norm": 10.177521918208576, "learning_rate": 1.0562295828767387e-08, "logits": -2.659092426300049, "logps": -308.5892028808594, "loss": 0.6619, "step": 435 }, { "epoch": 0.9404221212930804, "grad_norm": 11.269119195089573, "learning_rate": 8.041312887333396e-09, "logits": -2.692126512527466, "logps": -286.5995788574219, "loss": 0.6705, "step": 440 }, { "epoch": 0.9511087363077745, "grad_norm": 10.03615285565061, "learning_rate": 5.858731048505927e-09, "logits": -2.552562952041626, "logps": -371.7760925292969, "loss": 0.6662, "step": 445 }, { "epoch": 0.9617953513224686, "grad_norm": 9.553853517010364, "learning_rate": 4.0176028503425826e-09, "logits": -2.4844257831573486, "logps": -333.4014892578125, "loss": 0.6587, "step": 450 }, { "epoch": 0.9724819663371627, "grad_norm": 12.99059133186568, "learning_rate": 2.5205032771092592e-09, "logits": -2.505462169647217, "logps": -320.8719787597656, "loss": 0.6561, "step": 455 }, { "epoch": 0.9831685813518568, "grad_norm": 10.396030840409205, "learning_rate": 1.3695261579316775e-09, "logits": -2.4493813514709473, "logps": -252.605712890625, "loss": 0.6659, "step": 460 }, { "epoch": 0.9938551963665508, "grad_norm": 10.381128544746161, "learning_rate": 5.662812383859794e-10, "logits": -2.598538637161255, "logps": -321.62518310546875, "loss": 0.6579, "step": 465 }, { "epoch": 0.9981298423724285, "step": 467, "total_flos": 0.0, "train_loss": 0.6707821693869679, "train_runtime": 12010.3686, "train_samples_per_second": 4.986, "train_steps_per_second": 0.039 } ], "logging_steps": 5, "max_steps": 467, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 125, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }