{ "best_metric": null, "best_model_checkpoint": null, "epoch": 22.448263767099263, "eval_steps": 500, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11, "grad_norm": 0.5909375548362732, "learning_rate": 1.9932584269662923e-05, "loss": 2.0237, "step": 20 }, { "epoch": 0.22, "grad_norm": 0.5826025009155273, "learning_rate": 1.9857677902621722e-05, "loss": 1.9306, "step": 40 }, { "epoch": 0.34, "grad_norm": 0.5491089820861816, "learning_rate": 1.9782771535580525e-05, "loss": 1.7959, "step": 60 }, { "epoch": 0.45, "grad_norm": 1.362810730934143, "learning_rate": 1.970786516853933e-05, "loss": 1.6599, "step": 80 }, { "epoch": 0.56, "grad_norm": 1.4427486658096313, "learning_rate": 1.963295880149813e-05, "loss": 1.5685, "step": 100 }, { "epoch": 0.67, "grad_norm": 0.9993659257888794, "learning_rate": 1.956179775280899e-05, "loss": 1.4621, "step": 120 }, { "epoch": 0.79, "grad_norm": 1.614562749862671, "learning_rate": 1.9486891385767793e-05, "loss": 1.31, "step": 140 }, { "epoch": 0.9, "grad_norm": 1.1975798606872559, "learning_rate": 1.9411985018726593e-05, "loss": 1.2322, "step": 160 }, { "epoch": 1.01, "grad_norm": 0.7684128880500793, "learning_rate": 1.9337078651685396e-05, "loss": 1.1361, "step": 180 }, { "epoch": 1.12, "grad_norm": 0.9336960911750793, "learning_rate": 1.9262172284644195e-05, "loss": 1.0797, "step": 200 }, { "epoch": 1.23, "grad_norm": 0.8471770882606506, "learning_rate": 1.9187265917603e-05, "loss": 1.0368, "step": 220 }, { "epoch": 1.35, "grad_norm": 1.111340045928955, "learning_rate": 1.9112359550561798e-05, "loss": 0.9738, "step": 240 }, { "epoch": 1.46, "grad_norm": 0.8093781471252441, "learning_rate": 1.90374531835206e-05, "loss": 0.9494, "step": 260 }, { "epoch": 1.57, "grad_norm": 0.8438062071800232, "learning_rate": 1.89625468164794e-05, "loss": 0.9276, "step": 280 }, { "epoch": 1.68, "grad_norm": 0.9896701574325562, "learning_rate": 1.8887640449438204e-05, "loss": 0.8656, "step": 300 }, { "epoch": 1.8, "grad_norm": 0.8278244137763977, "learning_rate": 1.8812734082397007e-05, "loss": 0.8431, "step": 320 }, { "epoch": 1.91, "grad_norm": 0.931291937828064, "learning_rate": 1.8737827715355807e-05, "loss": 0.7945, "step": 340 }, { "epoch": 2.02, "grad_norm": 1.21769380569458, "learning_rate": 1.866292134831461e-05, "loss": 0.7647, "step": 360 }, { "epoch": 2.13, "grad_norm": 3.5183286666870117, "learning_rate": 1.858801498127341e-05, "loss": 0.7497, "step": 380 }, { "epoch": 2.24, "grad_norm": 1.1153030395507812, "learning_rate": 1.8513108614232212e-05, "loss": 0.7507, "step": 400 }, { "epoch": 2.36, "grad_norm": 1.0140526294708252, "learning_rate": 1.8438202247191012e-05, "loss": 0.7415, "step": 420 }, { "epoch": 2.47, "grad_norm": 1.4395232200622559, "learning_rate": 1.8363295880149815e-05, "loss": 0.6947, "step": 440 }, { "epoch": 2.58, "grad_norm": 1.4253089427947998, "learning_rate": 1.8288389513108615e-05, "loss": 0.7429, "step": 460 }, { "epoch": 2.69, "grad_norm": 1.3152351379394531, "learning_rate": 1.8213483146067418e-05, "loss": 0.7363, "step": 480 }, { "epoch": 2.81, "grad_norm": 2.5935957431793213, "learning_rate": 1.8138576779026217e-05, "loss": 0.6486, "step": 500 }, { "epoch": 2.92, "grad_norm": 3.929158926010132, "learning_rate": 1.806367041198502e-05, "loss": 0.6395, "step": 520 }, { "epoch": 3.03, "grad_norm": 1.7316572666168213, "learning_rate": 1.7988764044943823e-05, "loss": 0.664, "step": 540 }, { "epoch": 3.14, "grad_norm": 1.3388841152191162, "learning_rate": 1.7913857677902623e-05, "loss": 0.6469, "step": 560 }, { "epoch": 3.25, "grad_norm": 1.5258549451828003, "learning_rate": 1.7838951310861426e-05, "loss": 0.6662, "step": 580 }, { "epoch": 3.37, "grad_norm": 1.5486094951629639, "learning_rate": 1.7764044943820226e-05, "loss": 0.566, "step": 600 }, { "epoch": 3.48, "grad_norm": 1.5657902956008911, "learning_rate": 1.768913857677903e-05, "loss": 0.6166, "step": 620 }, { "epoch": 3.59, "grad_norm": 1.5971391201019287, "learning_rate": 1.761423220973783e-05, "loss": 0.5973, "step": 640 }, { "epoch": 3.7, "grad_norm": 1.333030343055725, "learning_rate": 1.753932584269663e-05, "loss": 0.6117, "step": 660 }, { "epoch": 3.82, "grad_norm": 1.4425445795059204, "learning_rate": 1.746441947565543e-05, "loss": 0.5702, "step": 680 }, { "epoch": 3.93, "grad_norm": 1.4773032665252686, "learning_rate": 1.7389513108614234e-05, "loss": 0.5465, "step": 700 }, { "epoch": 4.04, "grad_norm": 1.3328267335891724, "learning_rate": 1.7314606741573034e-05, "loss": 0.5379, "step": 720 }, { "epoch": 4.15, "grad_norm": 1.6961455345153809, "learning_rate": 1.7239700374531837e-05, "loss": 0.5492, "step": 740 }, { "epoch": 4.27, "grad_norm": 1.4636189937591553, "learning_rate": 1.7164794007490637e-05, "loss": 0.547, "step": 760 }, { "epoch": 4.38, "grad_norm": 2.1686649322509766, "learning_rate": 1.708988764044944e-05, "loss": 0.5424, "step": 780 }, { "epoch": 4.49, "grad_norm": 1.219388723373413, "learning_rate": 1.7014981273408243e-05, "loss": 0.5373, "step": 800 }, { "epoch": 4.6, "grad_norm": 1.5566452741622925, "learning_rate": 1.6940074906367042e-05, "loss": 0.4944, "step": 820 }, { "epoch": 4.71, "grad_norm": 1.598917841911316, "learning_rate": 1.6865168539325845e-05, "loss": 0.5036, "step": 840 }, { "epoch": 4.83, "grad_norm": 1.5281039476394653, "learning_rate": 1.6790262172284645e-05, "loss": 0.5215, "step": 860 }, { "epoch": 4.94, "grad_norm": 1.7123130559921265, "learning_rate": 1.6715355805243448e-05, "loss": 0.5362, "step": 880 }, { "epoch": 5.05, "grad_norm": 1.543447732925415, "learning_rate": 1.6640449438202248e-05, "loss": 0.5379, "step": 900 }, { "epoch": 5.16, "grad_norm": 2.4190192222595215, "learning_rate": 1.656554307116105e-05, "loss": 0.4921, "step": 920 }, { "epoch": 5.28, "grad_norm": 2.190906047821045, "learning_rate": 1.649063670411985e-05, "loss": 0.4652, "step": 940 }, { "epoch": 5.39, "grad_norm": 2.113476514816284, "learning_rate": 1.6415730337078653e-05, "loss": 0.4914, "step": 960 }, { "epoch": 5.5, "grad_norm": 1.8785656690597534, "learning_rate": 1.6340823970037453e-05, "loss": 0.5135, "step": 980 }, { "epoch": 5.61, "grad_norm": 1.3745977878570557, "learning_rate": 1.6265917602996256e-05, "loss": 0.4697, "step": 1000 }, { "epoch": 5.72, "grad_norm": 1.7874308824539185, "learning_rate": 1.6191011235955056e-05, "loss": 0.4625, "step": 1020 }, { "epoch": 5.84, "grad_norm": 1.4448940753936768, "learning_rate": 1.611610486891386e-05, "loss": 0.4764, "step": 1040 }, { "epoch": 5.95, "grad_norm": 2.278655767440796, "learning_rate": 1.6041198501872662e-05, "loss": 0.4221, "step": 1060 }, { "epoch": 6.06, "grad_norm": 1.8602409362792969, "learning_rate": 1.596629213483146e-05, "loss": 0.4731, "step": 1080 }, { "epoch": 6.17, "grad_norm": 1.884373426437378, "learning_rate": 1.5891385767790265e-05, "loss": 0.4241, "step": 1100 }, { "epoch": 6.29, "grad_norm": 2.0259287357330322, "learning_rate": 1.5816479400749064e-05, "loss": 0.4368, "step": 1120 }, { "epoch": 6.4, "grad_norm": 1.812462329864502, "learning_rate": 1.5741573033707867e-05, "loss": 0.442, "step": 1140 }, { "epoch": 6.51, "grad_norm": 1.934327483177185, "learning_rate": 1.5666666666666667e-05, "loss": 0.4195, "step": 1160 }, { "epoch": 6.62, "grad_norm": 1.6152955293655396, "learning_rate": 1.559176029962547e-05, "loss": 0.4374, "step": 1180 }, { "epoch": 6.73, "grad_norm": 2.7782068252563477, "learning_rate": 1.551685393258427e-05, "loss": 0.4231, "step": 1200 }, { "epoch": 6.85, "grad_norm": 2.372976303100586, "learning_rate": 1.5441947565543073e-05, "loss": 0.444, "step": 1220 }, { "epoch": 6.96, "grad_norm": 2.171353816986084, "learning_rate": 1.5367041198501872e-05, "loss": 0.4389, "step": 1240 }, { "epoch": 7.07, "grad_norm": 1.3093984127044678, "learning_rate": 1.5292134831460675e-05, "loss": 0.4301, "step": 1260 }, { "epoch": 7.18, "grad_norm": 2.267932176589966, "learning_rate": 1.5217228464419478e-05, "loss": 0.4046, "step": 1280 }, { "epoch": 7.3, "grad_norm": 1.5326164960861206, "learning_rate": 1.514232209737828e-05, "loss": 0.4068, "step": 1300 }, { "epoch": 7.41, "grad_norm": 3.1525979042053223, "learning_rate": 1.5067415730337081e-05, "loss": 0.3847, "step": 1320 }, { "epoch": 7.52, "grad_norm": 2.081890106201172, "learning_rate": 1.4992509363295882e-05, "loss": 0.4126, "step": 1340 }, { "epoch": 7.63, "grad_norm": 2.5701358318328857, "learning_rate": 1.4917602996254684e-05, "loss": 0.4065, "step": 1360 }, { "epoch": 7.74, "grad_norm": 1.4190051555633545, "learning_rate": 1.4842696629213485e-05, "loss": 0.3979, "step": 1380 }, { "epoch": 7.86, "grad_norm": 1.9085837602615356, "learning_rate": 1.4767790262172286e-05, "loss": 0.3894, "step": 1400 }, { "epoch": 7.97, "grad_norm": 1.7573003768920898, "learning_rate": 1.4692883895131088e-05, "loss": 0.3751, "step": 1420 }, { "epoch": 8.08, "grad_norm": 1.8974506855010986, "learning_rate": 1.4617977528089889e-05, "loss": 0.3936, "step": 1440 }, { "epoch": 8.19, "grad_norm": 1.3843660354614258, "learning_rate": 1.454307116104869e-05, "loss": 0.3848, "step": 1460 }, { "epoch": 8.31, "grad_norm": 1.525007724761963, "learning_rate": 1.4468164794007492e-05, "loss": 0.3552, "step": 1480 }, { "epoch": 8.42, "grad_norm": 2.1665101051330566, "learning_rate": 1.4393258426966291e-05, "loss": 0.3547, "step": 1500 }, { "epoch": 8.53, "grad_norm": 3.3614535331726074, "learning_rate": 1.4318352059925096e-05, "loss": 0.3771, "step": 1520 }, { "epoch": 8.64, "grad_norm": 1.746299386024475, "learning_rate": 1.4243445692883898e-05, "loss": 0.396, "step": 1540 }, { "epoch": 8.75, "grad_norm": 1.9144684076309204, "learning_rate": 1.4168539325842699e-05, "loss": 0.3748, "step": 1560 }, { "epoch": 8.87, "grad_norm": 1.9617277383804321, "learning_rate": 1.40936329588015e-05, "loss": 0.3504, "step": 1580 }, { "epoch": 8.98, "grad_norm": 2.69067645072937, "learning_rate": 1.4018726591760302e-05, "loss": 0.3477, "step": 1600 }, { "epoch": 9.09, "grad_norm": 2.142008066177368, "learning_rate": 1.3943820224719103e-05, "loss": 0.3539, "step": 1620 }, { "epoch": 9.2, "grad_norm": 1.7684266567230225, "learning_rate": 1.3868913857677904e-05, "loss": 0.3576, "step": 1640 }, { "epoch": 9.32, "grad_norm": 1.4222275018692017, "learning_rate": 1.3794007490636706e-05, "loss": 0.3839, "step": 1660 }, { "epoch": 9.43, "grad_norm": 2.0622501373291016, "learning_rate": 1.3719101123595507e-05, "loss": 0.3278, "step": 1680 }, { "epoch": 9.54, "grad_norm": 1.639147400856018, "learning_rate": 1.3644194756554308e-05, "loss": 0.3374, "step": 1700 }, { "epoch": 9.65, "grad_norm": 2.093045473098755, "learning_rate": 1.356928838951311e-05, "loss": 0.3535, "step": 1720 }, { "epoch": 9.76, "grad_norm": 1.3492937088012695, "learning_rate": 1.3494382022471911e-05, "loss": 0.3105, "step": 1740 }, { "epoch": 9.88, "grad_norm": 1.585205316543579, "learning_rate": 1.3419475655430714e-05, "loss": 0.3181, "step": 1760 }, { "epoch": 9.99, "grad_norm": 2.8895344734191895, "learning_rate": 1.3344569288389515e-05, "loss": 0.3473, "step": 1780 }, { "epoch": 10.1, "grad_norm": 1.7224748134613037, "learning_rate": 1.3269662921348317e-05, "loss": 0.3524, "step": 1800 }, { "epoch": 10.21, "grad_norm": 2.1029868125915527, "learning_rate": 1.3194756554307118e-05, "loss": 0.3408, "step": 1820 }, { "epoch": 10.33, "grad_norm": 2.434016227722168, "learning_rate": 1.311985018726592e-05, "loss": 0.3266, "step": 1840 }, { "epoch": 10.44, "grad_norm": 1.953553318977356, "learning_rate": 1.304494382022472e-05, "loss": 0.2844, "step": 1860 }, { "epoch": 10.55, "grad_norm": 2.5946218967437744, "learning_rate": 1.2970037453183522e-05, "loss": 0.3225, "step": 1880 }, { "epoch": 10.66, "grad_norm": 2.5305733680725098, "learning_rate": 1.2895131086142323e-05, "loss": 0.3183, "step": 1900 }, { "epoch": 10.78, "grad_norm": 3.56726336479187, "learning_rate": 1.2820224719101125e-05, "loss": 0.2944, "step": 1920 }, { "epoch": 10.89, "grad_norm": 1.9687740802764893, "learning_rate": 1.2745318352059926e-05, "loss": 0.3411, "step": 1940 }, { "epoch": 11.0, "grad_norm": 1.6027730703353882, "learning_rate": 1.2670411985018727e-05, "loss": 0.2949, "step": 1960 }, { "epoch": 11.11, "grad_norm": 1.8739397525787354, "learning_rate": 1.2595505617977529e-05, "loss": 0.2716, "step": 1980 }, { "epoch": 11.22, "grad_norm": 1.6741198301315308, "learning_rate": 1.2520599250936332e-05, "loss": 0.3334, "step": 2000 }, { "epoch": 11.34, "grad_norm": 1.950945496559143, "learning_rate": 1.2445692883895133e-05, "loss": 0.3291, "step": 2020 }, { "epoch": 11.45, "grad_norm": 1.9362170696258545, "learning_rate": 1.2370786516853935e-05, "loss": 0.2716, "step": 2040 }, { "epoch": 11.56, "grad_norm": 1.6201746463775635, "learning_rate": 1.2295880149812736e-05, "loss": 0.2893, "step": 2060 }, { "epoch": 11.67, "grad_norm": 3.488088607788086, "learning_rate": 1.2220973782771537e-05, "loss": 0.3239, "step": 2080 }, { "epoch": 11.79, "grad_norm": 2.4608683586120605, "learning_rate": 1.2146067415730339e-05, "loss": 0.271, "step": 2100 }, { "epoch": 11.9, "grad_norm": 1.5321098566055298, "learning_rate": 1.207116104868914e-05, "loss": 0.2876, "step": 2120 }, { "epoch": 12.01, "grad_norm": 1.8334771394729614, "learning_rate": 1.1996254681647941e-05, "loss": 0.3066, "step": 2140 }, { "epoch": 12.12, "grad_norm": 1.9506254196166992, "learning_rate": 1.1921348314606743e-05, "loss": 0.3023, "step": 2160 }, { "epoch": 12.23, "grad_norm": 2.9073598384857178, "learning_rate": 1.1846441947565544e-05, "loss": 0.3152, "step": 2180 }, { "epoch": 12.35, "grad_norm": 1.6023261547088623, "learning_rate": 1.1771535580524345e-05, "loss": 0.248, "step": 2200 }, { "epoch": 12.46, "grad_norm": 1.7954633235931396, "learning_rate": 1.1696629213483147e-05, "loss": 0.2666, "step": 2220 }, { "epoch": 12.57, "grad_norm": 2.0331828594207764, "learning_rate": 1.162172284644195e-05, "loss": 0.2878, "step": 2240 }, { "epoch": 12.68, "grad_norm": 1.656420350074768, "learning_rate": 1.1546816479400751e-05, "loss": 0.2805, "step": 2260 }, { "epoch": 12.8, "grad_norm": 1.5245873928070068, "learning_rate": 1.1471910112359552e-05, "loss": 0.2792, "step": 2280 }, { "epoch": 12.91, "grad_norm": 2.6713974475860596, "learning_rate": 1.1397003745318354e-05, "loss": 0.2841, "step": 2300 }, { "epoch": 13.02, "grad_norm": 1.268479347229004, "learning_rate": 1.1322097378277155e-05, "loss": 0.2708, "step": 2320 }, { "epoch": 13.13, "grad_norm": 2.2990434169769287, "learning_rate": 1.1247191011235956e-05, "loss": 0.2649, "step": 2340 }, { "epoch": 13.24, "grad_norm": 2.351956367492676, "learning_rate": 1.1172284644194758e-05, "loss": 0.281, "step": 2360 }, { "epoch": 13.36, "grad_norm": 1.796783208847046, "learning_rate": 1.1097378277153559e-05, "loss": 0.2725, "step": 2380 }, { "epoch": 13.47, "grad_norm": 1.7035847902297974, "learning_rate": 1.102247191011236e-05, "loss": 0.2799, "step": 2400 }, { "epoch": 13.58, "grad_norm": 2.0395431518554688, "learning_rate": 1.0947565543071162e-05, "loss": 0.239, "step": 2420 }, { "epoch": 13.69, "grad_norm": 1.8008232116699219, "learning_rate": 1.0872659176029963e-05, "loss": 0.2553, "step": 2440 }, { "epoch": 13.81, "grad_norm": 2.0559043884277344, "learning_rate": 1.0797752808988765e-05, "loss": 0.2464, "step": 2460 }, { "epoch": 13.92, "grad_norm": 1.8673292398452759, "learning_rate": 1.0722846441947568e-05, "loss": 0.2699, "step": 2480 }, { "epoch": 14.03, "grad_norm": 1.6819398403167725, "learning_rate": 1.0647940074906369e-05, "loss": 0.2566, "step": 2500 }, { "epoch": 14.14, "grad_norm": 1.9703686237335205, "learning_rate": 1.057303370786517e-05, "loss": 0.2807, "step": 2520 }, { "epoch": 14.25, "grad_norm": 2.028834819793701, "learning_rate": 1.0498127340823972e-05, "loss": 0.2392, "step": 2540 }, { "epoch": 14.37, "grad_norm": 2.2455177307128906, "learning_rate": 1.0423220973782773e-05, "loss": 0.247, "step": 2560 }, { "epoch": 14.48, "grad_norm": 1.8078291416168213, "learning_rate": 1.0348314606741574e-05, "loss": 0.2552, "step": 2580 }, { "epoch": 14.59, "grad_norm": 2.166729211807251, "learning_rate": 1.0273408239700376e-05, "loss": 0.2466, "step": 2600 }, { "epoch": 14.7, "grad_norm": 2.710556745529175, "learning_rate": 1.0198501872659177e-05, "loss": 0.2506, "step": 2620 }, { "epoch": 14.82, "grad_norm": 2.1344659328460693, "learning_rate": 1.0123595505617978e-05, "loss": 0.2388, "step": 2640 }, { "epoch": 14.93, "grad_norm": 1.595842719078064, "learning_rate": 1.004868913857678e-05, "loss": 0.2553, "step": 2660 }, { "epoch": 15.04, "grad_norm": 1.5458731651306152, "learning_rate": 9.973782771535581e-06, "loss": 0.2478, "step": 2680 }, { "epoch": 15.15, "grad_norm": 1.9514356851577759, "learning_rate": 9.898876404494382e-06, "loss": 0.234, "step": 2700 }, { "epoch": 15.26, "grad_norm": 2.1551694869995117, "learning_rate": 9.823970037453184e-06, "loss": 0.251, "step": 2720 }, { "epoch": 15.38, "grad_norm": 2.08258318901062, "learning_rate": 9.749063670411985e-06, "loss": 0.2511, "step": 2740 }, { "epoch": 15.49, "grad_norm": 1.581690788269043, "learning_rate": 9.674157303370786e-06, "loss": 0.2185, "step": 2760 }, { "epoch": 15.6, "grad_norm": 2.2121975421905518, "learning_rate": 9.599250936329588e-06, "loss": 0.2161, "step": 2780 }, { "epoch": 15.71, "grad_norm": 1.5077215433120728, "learning_rate": 9.52434456928839e-06, "loss": 0.2308, "step": 2800 }, { "epoch": 15.83, "grad_norm": 2.57951021194458, "learning_rate": 9.449438202247192e-06, "loss": 0.2299, "step": 2820 }, { "epoch": 15.94, "grad_norm": 1.6634414196014404, "learning_rate": 9.374531835205993e-06, "loss": 0.2576, "step": 2840 }, { "epoch": 16.05, "grad_norm": 1.9692113399505615, "learning_rate": 9.299625468164795e-06, "loss": 0.2395, "step": 2860 }, { "epoch": 16.16, "grad_norm": 1.9327415227890015, "learning_rate": 9.224719101123596e-06, "loss": 0.241, "step": 2880 }, { "epoch": 16.27, "grad_norm": 1.7675727605819702, "learning_rate": 9.149812734082398e-06, "loss": 0.2201, "step": 2900 }, { "epoch": 16.39, "grad_norm": 1.9511345624923706, "learning_rate": 9.074906367041199e-06, "loss": 0.2171, "step": 2920 }, { "epoch": 16.5, "grad_norm": 1.7937383651733398, "learning_rate": 9e-06, "loss": 0.2286, "step": 2940 }, { "epoch": 16.61, "grad_norm": 1.79076087474823, "learning_rate": 8.925093632958802e-06, "loss": 0.2479, "step": 2960 }, { "epoch": 16.72, "grad_norm": 2.4045145511627197, "learning_rate": 8.850187265917603e-06, "loss": 0.2153, "step": 2980 }, { "epoch": 16.84, "grad_norm": 2.1934499740600586, "learning_rate": 8.775280898876404e-06, "loss": 0.2361, "step": 3000 }, { "epoch": 16.95, "grad_norm": 1.923170804977417, "learning_rate": 8.700374531835206e-06, "loss": 0.2146, "step": 3020 }, { "epoch": 17.06, "grad_norm": 2.1610753536224365, "learning_rate": 8.625468164794009e-06, "loss": 0.2281, "step": 3040 }, { "epoch": 17.17, "grad_norm": 2.1105706691741943, "learning_rate": 8.55056179775281e-06, "loss": 0.2403, "step": 3060 }, { "epoch": 17.29, "grad_norm": 1.979177474975586, "learning_rate": 8.475655430711611e-06, "loss": 0.1734, "step": 3080 }, { "epoch": 17.4, "grad_norm": 2.040055274963379, "learning_rate": 8.400749063670413e-06, "loss": 0.2393, "step": 3100 }, { "epoch": 17.51, "grad_norm": 1.8687106370925903, "learning_rate": 8.325842696629214e-06, "loss": 0.2346, "step": 3120 }, { "epoch": 17.62, "grad_norm": 1.7447230815887451, "learning_rate": 8.250936329588015e-06, "loss": 0.2279, "step": 3140 }, { "epoch": 17.73, "grad_norm": 2.9035825729370117, "learning_rate": 8.176029962546818e-06, "loss": 0.2049, "step": 3160 }, { "epoch": 17.85, "grad_norm": 2.1024608612060547, "learning_rate": 8.101123595505618e-06, "loss": 0.1962, "step": 3180 }, { "epoch": 17.96, "grad_norm": 2.7913131713867188, "learning_rate": 8.02621722846442e-06, "loss": 0.2081, "step": 3200 }, { "epoch": 18.07, "grad_norm": 2.0668814182281494, "learning_rate": 7.95131086142322e-06, "loss": 0.2304, "step": 3220 }, { "epoch": 18.18, "grad_norm": 1.7872204780578613, "learning_rate": 7.876404494382022e-06, "loss": 0.1804, "step": 3240 }, { "epoch": 18.3, "grad_norm": 2.0718905925750732, "learning_rate": 7.801498127340823e-06, "loss": 0.2232, "step": 3260 }, { "epoch": 18.41, "grad_norm": 3.835952043533325, "learning_rate": 7.726591760299626e-06, "loss": 0.2171, "step": 3280 }, { "epoch": 18.52, "grad_norm": 1.5925731658935547, "learning_rate": 7.651685393258428e-06, "loss": 0.1999, "step": 3300 }, { "epoch": 18.63, "grad_norm": 2.434159994125366, "learning_rate": 7.576779026217229e-06, "loss": 0.1876, "step": 3320 }, { "epoch": 18.74, "grad_norm": 2.3486499786376953, "learning_rate": 7.5018726591760305e-06, "loss": 0.21, "step": 3340 }, { "epoch": 18.86, "grad_norm": 1.4824186563491821, "learning_rate": 7.426966292134832e-06, "loss": 0.2239, "step": 3360 }, { "epoch": 18.97, "grad_norm": 2.062422275543213, "learning_rate": 7.352059925093633e-06, "loss": 0.22, "step": 3380 }, { "epoch": 19.08, "grad_norm": 2.0563416481018066, "learning_rate": 7.277153558052435e-06, "loss": 0.1945, "step": 3400 }, { "epoch": 19.19, "grad_norm": 1.6936135292053223, "learning_rate": 7.202247191011237e-06, "loss": 0.217, "step": 3420 }, { "epoch": 19.31, "grad_norm": 1.9931917190551758, "learning_rate": 7.127340823970038e-06, "loss": 0.2127, "step": 3440 }, { "epoch": 19.42, "grad_norm": 1.5989198684692383, "learning_rate": 7.0524344569288395e-06, "loss": 0.1849, "step": 3460 }, { "epoch": 19.53, "grad_norm": 2.0073723793029785, "learning_rate": 6.977528089887641e-06, "loss": 0.1805, "step": 3480 }, { "epoch": 19.64, "grad_norm": 1.9756735563278198, "learning_rate": 6.902621722846442e-06, "loss": 0.1963, "step": 3500 }, { "epoch": 19.75, "grad_norm": 1.5112028121948242, "learning_rate": 6.827715355805244e-06, "loss": 0.2008, "step": 3520 }, { "epoch": 19.87, "grad_norm": 2.2792975902557373, "learning_rate": 6.752808988764046e-06, "loss": 0.2106, "step": 3540 }, { "epoch": 19.98, "grad_norm": 2.768470048904419, "learning_rate": 6.677902621722847e-06, "loss": 0.2066, "step": 3560 }, { "epoch": 20.09, "grad_norm": 1.6916066408157349, "learning_rate": 6.602996254681648e-06, "loss": 0.1912, "step": 3580 }, { "epoch": 20.2, "grad_norm": 1.7649778127670288, "learning_rate": 6.52808988764045e-06, "loss": 0.2027, "step": 3600 }, { "epoch": 20.32, "grad_norm": 1.9743694067001343, "learning_rate": 6.453183520599251e-06, "loss": 0.2208, "step": 3620 }, { "epoch": 20.43, "grad_norm": 1.827344298362732, "learning_rate": 6.378277153558053e-06, "loss": 0.1757, "step": 3640 }, { "epoch": 20.54, "grad_norm": 2.7847957611083984, "learning_rate": 6.303370786516855e-06, "loss": 0.1931, "step": 3660 }, { "epoch": 20.65, "grad_norm": 1.8572605848312378, "learning_rate": 6.228464419475656e-06, "loss": 0.1902, "step": 3680 }, { "epoch": 20.76, "grad_norm": 1.5343818664550781, "learning_rate": 6.153558052434457e-06, "loss": 0.1916, "step": 3700 }, { "epoch": 20.88, "grad_norm": 1.703688383102417, "learning_rate": 6.078651685393259e-06, "loss": 0.1897, "step": 3720 }, { "epoch": 20.99, "grad_norm": 2.5442187786102295, "learning_rate": 6.00374531835206e-06, "loss": 0.1859, "step": 3740 }, { "epoch": 21.1, "grad_norm": 2.0333402156829834, "learning_rate": 5.928838951310862e-06, "loss": 0.1632, "step": 3760 }, { "epoch": 21.21, "grad_norm": 2.107227087020874, "learning_rate": 5.8539325842696635e-06, "loss": 0.2031, "step": 3780 }, { "epoch": 21.33, "grad_norm": 2.0351223945617676, "learning_rate": 5.779026217228465e-06, "loss": 0.1759, "step": 3800 }, { "epoch": 21.44, "grad_norm": 2.1328284740448, "learning_rate": 5.704119850187266e-06, "loss": 0.1853, "step": 3820 }, { "epoch": 21.55, "grad_norm": 2.0145580768585205, "learning_rate": 5.629213483146068e-06, "loss": 0.1919, "step": 3840 }, { "epoch": 21.66, "grad_norm": 1.8794372081756592, "learning_rate": 5.554307116104869e-06, "loss": 0.1958, "step": 3860 }, { "epoch": 21.77, "grad_norm": 1.8487616777420044, "learning_rate": 5.479400749063671e-06, "loss": 0.207, "step": 3880 }, { "epoch": 21.89, "grad_norm": 2.080965042114258, "learning_rate": 5.4044943820224725e-06, "loss": 0.1715, "step": 3900 }, { "epoch": 22.0, "grad_norm": 2.0303232669830322, "learning_rate": 5.329588014981274e-06, "loss": 0.1873, "step": 3920 }, { "epoch": 22.11, "grad_norm": 2.1078438758850098, "learning_rate": 5.254681647940075e-06, "loss": 0.1869, "step": 3940 }, { "epoch": 22.22, "grad_norm": 1.8502501249313354, "learning_rate": 5.1797752808988765e-06, "loss": 0.1983, "step": 3960 }, { "epoch": 22.34, "grad_norm": 2.209162950515747, "learning_rate": 5.104868913857678e-06, "loss": 0.185, "step": 3980 }, { "epoch": 22.45, "grad_norm": 1.9525928497314453, "learning_rate": 5.02996254681648e-06, "loss": 0.1943, "step": 4000 } ], "logging_steps": 20, "max_steps": 5340, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 500, "total_flos": 1.03965111484416e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }