{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.997726239199636, "eval_steps": 500, "global_step": 3708, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008084482845737962, "grad_norm": 10.217898134986712, "learning_rate": 1.7857142857142858e-07, "loss": 1.7929, "step": 1 }, { "epoch": 0.008084482845737961, "grad_norm": 8.447331849071613, "learning_rate": 1.7857142857142859e-06, "loss": 1.8062, "step": 10 }, { "epoch": 0.016168965691475922, "grad_norm": 3.3131831113560053, "learning_rate": 3.5714285714285718e-06, "loss": 1.6026, "step": 20 }, { "epoch": 0.024253448537213885, "grad_norm": 1.9123288015392683, "learning_rate": 5.357142857142857e-06, "loss": 1.5044, "step": 30 }, { "epoch": 0.032337931382951844, "grad_norm": 1.5061470360359468, "learning_rate": 7.1428571428571436e-06, "loss": 1.46, "step": 40 }, { "epoch": 0.04042241422868981, "grad_norm": 1.3730571252951111, "learning_rate": 8.92857142857143e-06, "loss": 1.4242, "step": 50 }, { "epoch": 0.04850689707442777, "grad_norm": 1.3696793978053872, "learning_rate": 1.0714285714285714e-05, "loss": 1.3962, "step": 60 }, { "epoch": 0.05659137992016573, "grad_norm": 1.3254100387294059, "learning_rate": 1.25e-05, "loss": 1.4019, "step": 70 }, { "epoch": 0.06467586276590369, "grad_norm": 1.300617283886077, "learning_rate": 1.4285714285714287e-05, "loss": 1.38, "step": 80 }, { "epoch": 0.07276034561164166, "grad_norm": 1.2890020177179862, "learning_rate": 1.6071428571428572e-05, "loss": 1.3692, "step": 90 }, { "epoch": 0.08084482845737961, "grad_norm": 1.3617464980600442, "learning_rate": 1.785714285714286e-05, "loss": 1.3717, "step": 100 }, { "epoch": 0.08892931130311758, "grad_norm": 1.3364658238171299, "learning_rate": 1.9642857142857145e-05, "loss": 1.3445, "step": 110 }, { "epoch": 0.09701379414885554, "grad_norm": 1.406654933905743, "learning_rate": 1.999975576461237e-05, "loss": 1.3724, "step": 120 }, { "epoch": 0.1050982769945935, "grad_norm": 1.4611284363694028, "learning_rate": 1.999876357879684e-05, "loss": 1.3446, "step": 130 }, { "epoch": 0.11318275984033146, "grad_norm": 1.3711888479566774, "learning_rate": 1.9997008253510416e-05, "loss": 1.3515, "step": 140 }, { "epoch": 0.12126724268606942, "grad_norm": 1.409811295994504, "learning_rate": 1.9994489922725454e-05, "loss": 1.342, "step": 150 }, { "epoch": 0.12935172553180738, "grad_norm": 1.3918093353079757, "learning_rate": 1.9991208778649485e-05, "loss": 1.3493, "step": 160 }, { "epoch": 0.13743620837754536, "grad_norm": 1.3597096387911873, "learning_rate": 1.998716507171053e-05, "loss": 1.3186, "step": 170 }, { "epoch": 0.14552069122328332, "grad_norm": 1.3532852407233782, "learning_rate": 1.998235911053798e-05, "loss": 1.3426, "step": 180 }, { "epoch": 0.15360517406902127, "grad_norm": 1.395929355438092, "learning_rate": 1.9976791261939064e-05, "loss": 1.338, "step": 190 }, { "epoch": 0.16168965691475923, "grad_norm": 1.3346897807354206, "learning_rate": 1.997046195087082e-05, "loss": 1.3209, "step": 200 }, { "epoch": 0.16977413976049718, "grad_norm": 1.3501622187451188, "learning_rate": 1.996337166040769e-05, "loss": 1.3279, "step": 210 }, { "epoch": 0.17785862260623517, "grad_norm": 1.262079121683393, "learning_rate": 1.995552093170463e-05, "loss": 1.3135, "step": 220 }, { "epoch": 0.18594310545197312, "grad_norm": 1.324883144834464, "learning_rate": 1.994691036395583e-05, "loss": 1.306, "step": 230 }, { "epoch": 0.19402758829771108, "grad_norm": 1.373867783740766, "learning_rate": 1.9937540614348944e-05, "loss": 1.3018, "step": 240 }, { "epoch": 0.20211207114344903, "grad_norm": 1.4020861161362783, "learning_rate": 1.992741239801498e-05, "loss": 1.3203, "step": 250 }, { "epoch": 0.210196553989187, "grad_norm": 1.3484650757297245, "learning_rate": 1.9916526487973678e-05, "loss": 1.2939, "step": 260 }, { "epoch": 0.21828103683492497, "grad_norm": 1.3330965331306333, "learning_rate": 1.9904883715074525e-05, "loss": 1.2795, "step": 270 }, { "epoch": 0.22636551968066293, "grad_norm": 1.3917589397233552, "learning_rate": 1.989248496793335e-05, "loss": 1.269, "step": 280 }, { "epoch": 0.23445000252640089, "grad_norm": 1.3905412148367542, "learning_rate": 1.9879331192864492e-05, "loss": 1.286, "step": 290 }, { "epoch": 0.24253448537213884, "grad_norm": 1.4569325197708967, "learning_rate": 1.9865423393808573e-05, "loss": 1.2944, "step": 300 }, { "epoch": 0.2506189682178768, "grad_norm": 1.3399495909594208, "learning_rate": 1.985076263225588e-05, "loss": 1.3106, "step": 310 }, { "epoch": 0.25870345106361475, "grad_norm": 1.478802579336813, "learning_rate": 1.9835350027165342e-05, "loss": 1.2994, "step": 320 }, { "epoch": 0.26678793390935274, "grad_norm": 1.3105244819439577, "learning_rate": 1.9819186754879137e-05, "loss": 1.2871, "step": 330 }, { "epoch": 0.2748724167550907, "grad_norm": 1.3667119896120177, "learning_rate": 1.9802274049032898e-05, "loss": 1.2893, "step": 340 }, { "epoch": 0.28295689960082865, "grad_norm": 1.5054526064910085, "learning_rate": 1.9784613200461568e-05, "loss": 1.2912, "step": 350 }, { "epoch": 0.29104138244656663, "grad_norm": 1.3163243486049039, "learning_rate": 1.976620555710087e-05, "loss": 1.2761, "step": 360 }, { "epoch": 0.29912586529230456, "grad_norm": 1.322920539242633, "learning_rate": 1.9747052523884435e-05, "loss": 1.2572, "step": 370 }, { "epoch": 0.30721034813804254, "grad_norm": 1.3954468357326724, "learning_rate": 1.972715556263657e-05, "loss": 1.2745, "step": 380 }, { "epoch": 0.3152948309837805, "grad_norm": 1.3451929159695755, "learning_rate": 1.9706516191960687e-05, "loss": 1.2472, "step": 390 }, { "epoch": 0.32337931382951846, "grad_norm": 1.2765565775996142, "learning_rate": 1.9685135987123396e-05, "loss": 1.255, "step": 400 }, { "epoch": 0.33146379667525644, "grad_norm": 1.4632541877317655, "learning_rate": 1.966301657993428e-05, "loss": 1.2565, "step": 410 }, { "epoch": 0.33954827952099437, "grad_norm": 1.3554436136314076, "learning_rate": 1.9640159658621344e-05, "loss": 1.2593, "step": 420 }, { "epoch": 0.34763276236673235, "grad_norm": 1.3154961346767526, "learning_rate": 1.9616566967702164e-05, "loss": 1.2604, "step": 430 }, { "epoch": 0.35571724521247033, "grad_norm": 1.3833700211512812, "learning_rate": 1.9592240307850748e-05, "loss": 1.2625, "step": 440 }, { "epoch": 0.36380172805820826, "grad_norm": 1.2812641775550833, "learning_rate": 1.95671815357601e-05, "loss": 1.2661, "step": 450 }, { "epoch": 0.37188621090394625, "grad_norm": 1.3509908047727408, "learning_rate": 1.954139256400049e-05, "loss": 1.2448, "step": 460 }, { "epoch": 0.3799706937496842, "grad_norm": 1.356891388574271, "learning_rate": 1.951487536087352e-05, "loss": 1.2551, "step": 470 }, { "epoch": 0.38805517659542216, "grad_norm": 1.2921423460134738, "learning_rate": 1.948763195026186e-05, "loss": 1.2503, "step": 480 }, { "epoch": 0.39613965944116014, "grad_norm": 1.3494188641362486, "learning_rate": 1.9459664411474793e-05, "loss": 1.2509, "step": 490 }, { "epoch": 0.40422414228689807, "grad_norm": 1.336605272931222, "learning_rate": 1.9430974879089522e-05, "loss": 1.251, "step": 500 }, { "epoch": 0.41230862513263605, "grad_norm": 1.3167568815144604, "learning_rate": 1.9401565542788238e-05, "loss": 1.2341, "step": 510 }, { "epoch": 0.420393107978374, "grad_norm": 1.3704112316871029, "learning_rate": 1.9371438647191007e-05, "loss": 1.2483, "step": 520 }, { "epoch": 0.42847759082411196, "grad_norm": 1.2971253486447214, "learning_rate": 1.9340596491684443e-05, "loss": 1.2483, "step": 530 }, { "epoch": 0.43656207366984995, "grad_norm": 1.278671915851734, "learning_rate": 1.9309041430246228e-05, "loss": 1.247, "step": 540 }, { "epoch": 0.4446465565155879, "grad_norm": 1.7143062654632688, "learning_rate": 1.927677587126542e-05, "loss": 1.2582, "step": 550 }, { "epoch": 0.45273103936132586, "grad_norm": 3.071400396021207, "learning_rate": 1.924380227735867e-05, "loss": 1.2369, "step": 560 }, { "epoch": 0.46081552220706384, "grad_norm": 1.3043426791795303, "learning_rate": 1.921012316518224e-05, "loss": 1.2564, "step": 570 }, { "epoch": 0.46890000505280177, "grad_norm": 1.4606599266501914, "learning_rate": 1.917574110523994e-05, "loss": 1.2455, "step": 580 }, { "epoch": 0.47698448789853976, "grad_norm": 1.4084512281248918, "learning_rate": 1.914065872168692e-05, "loss": 1.237, "step": 590 }, { "epoch": 0.4850689707442777, "grad_norm": 1.7442614101619536, "learning_rate": 1.910487869212942e-05, "loss": 1.2428, "step": 600 }, { "epoch": 0.49315345359001567, "grad_norm": 3.217405569169141, "learning_rate": 1.9068403747420365e-05, "loss": 1.2406, "step": 610 }, { "epoch": 0.5012379364357537, "grad_norm": 2.028319411000034, "learning_rate": 1.9031236671450963e-05, "loss": 1.2295, "step": 620 }, { "epoch": 0.5093224192814916, "grad_norm": 1.3534016099705948, "learning_rate": 1.899338030093822e-05, "loss": 1.2287, "step": 630 }, { "epoch": 0.5174069021272295, "grad_norm": 16.560363141349324, "learning_rate": 1.8954837525208432e-05, "loss": 1.2239, "step": 640 }, { "epoch": 0.5254913849729675, "grad_norm": 1.6507428977016882, "learning_rate": 1.8915611285976672e-05, "loss": 1.2122, "step": 650 }, { "epoch": 0.5335758678187055, "grad_norm": 1.4599892786481696, "learning_rate": 1.887570457712225e-05, "loss": 1.2448, "step": 660 }, { "epoch": 0.5416603506644434, "grad_norm": 1.3576328654723246, "learning_rate": 1.883512044446023e-05, "loss": 1.235, "step": 670 }, { "epoch": 0.5497448335101814, "grad_norm": 2.7275818350144383, "learning_rate": 1.879386198550895e-05, "loss": 1.2302, "step": 680 }, { "epoch": 0.5578293163559194, "grad_norm": 1.4783688191374078, "learning_rate": 1.8751932349253595e-05, "loss": 1.2183, "step": 690 }, { "epoch": 0.5659137992016573, "grad_norm": 1.3696848099680126, "learning_rate": 1.8709334735905908e-05, "loss": 1.2202, "step": 700 }, { "epoch": 0.5739982820473952, "grad_norm": 1.3843064222445587, "learning_rate": 1.866607239665988e-05, "loss": 1.2292, "step": 710 }, { "epoch": 0.5820827648931333, "grad_norm": 1.3013446345815274, "learning_rate": 1.8622148633443626e-05, "loss": 1.2404, "step": 720 }, { "epoch": 0.5901672477388712, "grad_norm": 1.3389494076775972, "learning_rate": 1.8577566798667397e-05, "loss": 1.2, "step": 730 }, { "epoch": 0.5982517305846091, "grad_norm": 1.2803553653933784, "learning_rate": 1.8532330294967678e-05, "loss": 1.2019, "step": 740 }, { "epoch": 0.6063362134303472, "grad_norm": 1.3940783442430897, "learning_rate": 1.848644257494751e-05, "loss": 1.2111, "step": 750 }, { "epoch": 0.6144206962760851, "grad_norm": 1.2967372912925752, "learning_rate": 1.8439907140912962e-05, "loss": 1.2044, "step": 760 }, { "epoch": 0.622505179121823, "grad_norm": 1.307050777866234, "learning_rate": 1.839272754460583e-05, "loss": 1.211, "step": 770 }, { "epoch": 0.630589661967561, "grad_norm": 1.7851865803650349, "learning_rate": 1.8344907386932552e-05, "loss": 1.2038, "step": 780 }, { "epoch": 0.638674144813299, "grad_norm": 1.8614266164299924, "learning_rate": 1.8296450317689377e-05, "loss": 1.2054, "step": 790 }, { "epoch": 0.6467586276590369, "grad_norm": 1.3262638540650757, "learning_rate": 1.824736003528381e-05, "loss": 1.209, "step": 800 }, { "epoch": 0.654843110504775, "grad_norm": 1.290793353111858, "learning_rate": 1.8197640286452312e-05, "loss": 1.213, "step": 810 }, { "epoch": 0.6629275933505129, "grad_norm": 1.2558226934999566, "learning_rate": 1.814729486597436e-05, "loss": 1.2266, "step": 820 }, { "epoch": 0.6710120761962508, "grad_norm": 1.277465841944589, "learning_rate": 1.8096327616382815e-05, "loss": 1.2167, "step": 830 }, { "epoch": 0.6790965590419887, "grad_norm": 1.298887855615747, "learning_rate": 1.8044742427670627e-05, "loss": 1.2226, "step": 840 }, { "epoch": 0.6871810418877268, "grad_norm": 5.857168222574854, "learning_rate": 1.7992543236993952e-05, "loss": 1.2027, "step": 850 }, { "epoch": 0.6952655247334647, "grad_norm": 1.3361306728189393, "learning_rate": 1.7939734028371663e-05, "loss": 1.207, "step": 860 }, { "epoch": 0.7033500075792026, "grad_norm": 1.3969769044659528, "learning_rate": 1.7886318832381264e-05, "loss": 1.1799, "step": 870 }, { "epoch": 0.7114344904249407, "grad_norm": 1.4266930108547686, "learning_rate": 1.783230172585126e-05, "loss": 1.2111, "step": 880 }, { "epoch": 0.7195189732706786, "grad_norm": 1.3440902999919684, "learning_rate": 1.7777686831550008e-05, "loss": 1.1854, "step": 890 }, { "epoch": 0.7276034561164165, "grad_norm": 1.251718689797153, "learning_rate": 1.7722478317871053e-05, "loss": 1.1803, "step": 900 }, { "epoch": 0.7356879389621546, "grad_norm": 1.2756808323680056, "learning_rate": 1.7666680398514978e-05, "loss": 1.2148, "step": 910 }, { "epoch": 0.7437724218078925, "grad_norm": 1.3774590120848857, "learning_rate": 1.76102973321678e-05, "loss": 1.189, "step": 920 }, { "epoch": 0.7518569046536304, "grad_norm": 1.5207360711907143, "learning_rate": 1.7553333422175933e-05, "loss": 1.1819, "step": 930 }, { "epoch": 0.7599413874993683, "grad_norm": 1.302009300658742, "learning_rate": 1.7495793016217754e-05, "loss": 1.191, "step": 940 }, { "epoch": 0.7680258703451064, "grad_norm": 1.3859954985668783, "learning_rate": 1.743768050597175e-05, "loss": 1.1835, "step": 950 }, { "epoch": 0.7761103531908443, "grad_norm": 1.3435502591474426, "learning_rate": 1.7379000326781348e-05, "loss": 1.2035, "step": 960 }, { "epoch": 0.7841948360365822, "grad_norm": 1.38981939520544, "learning_rate": 1.7319756957316392e-05, "loss": 1.1887, "step": 970 }, { "epoch": 0.7922793188823203, "grad_norm": 1.4015519572670776, "learning_rate": 1.725995491923131e-05, "loss": 1.1843, "step": 980 }, { "epoch": 0.8003638017280582, "grad_norm": 1.4763071143801054, "learning_rate": 1.7199598776820013e-05, "loss": 1.1753, "step": 990 }, { "epoch": 0.8084482845737961, "grad_norm": 1.3577477544239007, "learning_rate": 1.713869313666753e-05, "loss": 1.1966, "step": 1000 }, { "epoch": 0.8165327674195342, "grad_norm": 1.3963231420568032, "learning_rate": 1.7077242647298405e-05, "loss": 1.1985, "step": 1010 }, { "epoch": 0.8246172502652721, "grad_norm": 1.5498623314696613, "learning_rate": 1.7015251998821938e-05, "loss": 1.1785, "step": 1020 }, { "epoch": 0.83270173311101, "grad_norm": 1.3586468512222978, "learning_rate": 1.6952725922574188e-05, "loss": 1.1648, "step": 1030 }, { "epoch": 0.840786215956748, "grad_norm": 1.4300342736321576, "learning_rate": 1.688966919075687e-05, "loss": 1.1666, "step": 1040 }, { "epoch": 0.848870698802486, "grad_norm": 1.5788283624417567, "learning_rate": 1.682608661607313e-05, "loss": 1.1821, "step": 1050 }, { "epoch": 0.8569551816482239, "grad_norm": 1.359570582214726, "learning_rate": 1.6761983051360232e-05, "loss": 1.1958, "step": 1060 }, { "epoch": 0.8650396644939619, "grad_norm": 1.3046392847858388, "learning_rate": 1.6697363389219147e-05, "loss": 1.1557, "step": 1070 }, { "epoch": 0.8731241473396999, "grad_norm": 1.4677129965264875, "learning_rate": 1.6632232561641158e-05, "loss": 1.1593, "step": 1080 }, { "epoch": 0.8812086301854378, "grad_norm": 1.4859252531152671, "learning_rate": 1.6566595539631417e-05, "loss": 1.1753, "step": 1090 }, { "epoch": 0.8892931130311758, "grad_norm": 1.3209365154297203, "learning_rate": 1.6500457332829553e-05, "loss": 1.161, "step": 1100 }, { "epoch": 0.8973775958769138, "grad_norm": 1.3862159117294945, "learning_rate": 1.6433822989127314e-05, "loss": 1.1592, "step": 1110 }, { "epoch": 0.9054620787226517, "grad_norm": 1.4456179949854164, "learning_rate": 1.636669759428329e-05, "loss": 1.1484, "step": 1120 }, { "epoch": 0.9135465615683896, "grad_norm": 1.288756152636894, "learning_rate": 1.6299086271534764e-05, "loss": 1.181, "step": 1130 }, { "epoch": 0.9216310444141277, "grad_norm": 1.2599229391965052, "learning_rate": 1.6230994181206674e-05, "loss": 1.1718, "step": 1140 }, { "epoch": 0.9297155272598656, "grad_norm": 1.4973902946133841, "learning_rate": 1.6162426520317765e-05, "loss": 1.1773, "step": 1150 }, { "epoch": 0.9378000101056035, "grad_norm": 1.3698767908727083, "learning_rate": 1.6093388522183948e-05, "loss": 1.1666, "step": 1160 }, { "epoch": 0.9458844929513415, "grad_norm": 1.386433062647111, "learning_rate": 1.6023885456018852e-05, "loss": 1.1859, "step": 1170 }, { "epoch": 0.9539689757970795, "grad_norm": 1.284904254015402, "learning_rate": 1.595392262653168e-05, "loss": 1.1906, "step": 1180 }, { "epoch": 0.9620534586428174, "grad_norm": 1.4402131637475677, "learning_rate": 1.5883505373522317e-05, "loss": 1.1593, "step": 1190 }, { "epoch": 0.9701379414885554, "grad_norm": 1.6049356540049453, "learning_rate": 1.5812639071473804e-05, "loss": 1.1636, "step": 1200 }, { "epoch": 0.9782224243342934, "grad_norm": 1.505036374645861, "learning_rate": 1.574132912914211e-05, "loss": 1.14, "step": 1210 }, { "epoch": 0.9863069071800313, "grad_norm": 1.6280895974825729, "learning_rate": 1.566958098914334e-05, "loss": 1.1358, "step": 1220 }, { "epoch": 0.9943913900257693, "grad_norm": 1.2574161457807662, "learning_rate": 1.5597400127538324e-05, "loss": 1.1754, "step": 1230 }, { "epoch": 0.9992420797332121, "eval_loss": 1.0555766820907593, "eval_runtime": 476.758, "eval_samples_per_second": 25.514, "eval_steps_per_second": 12.757, "step": 1236 }, { "epoch": 1.0024758728715073, "grad_norm": 2.9356360899500897, "learning_rate": 1.5524792053414676e-05, "loss": 1.1182, "step": 1240 }, { "epoch": 1.0105603557172451, "grad_norm": 1.4115997260524025, "learning_rate": 1.5451762308466302e-05, "loss": 1.0448, "step": 1250 }, { "epoch": 1.0186448385629832, "grad_norm": 1.4408354404654395, "learning_rate": 1.5378316466570466e-05, "loss": 1.027, "step": 1260 }, { "epoch": 1.0267293214087212, "grad_norm": 1.40209737150782, "learning_rate": 1.530446013336235e-05, "loss": 1.0253, "step": 1270 }, { "epoch": 1.034813804254459, "grad_norm": 1.4050923085204698, "learning_rate": 1.5230198945807226e-05, "loss": 1.0596, "step": 1280 }, { "epoch": 1.042898287100197, "grad_norm": 1.3850604464116953, "learning_rate": 1.515553857177022e-05, "loss": 1.0354, "step": 1290 }, { "epoch": 1.050982769945935, "grad_norm": 1.6192982769908866, "learning_rate": 1.5080484709583715e-05, "loss": 1.0338, "step": 1300 }, { "epoch": 1.059067252791673, "grad_norm": 1.5443333411983042, "learning_rate": 1.5005043087612452e-05, "loss": 1.0224, "step": 1310 }, { "epoch": 1.067151735637411, "grad_norm": 1.4795375887873081, "learning_rate": 1.4929219463816302e-05, "loss": 1.0273, "step": 1320 }, { "epoch": 1.075236218483149, "grad_norm": 1.3952469643942318, "learning_rate": 1.4853019625310813e-05, "loss": 1.0165, "step": 1330 }, { "epoch": 1.0833207013288868, "grad_norm": 1.4102438583126526, "learning_rate": 1.4776449387925507e-05, "loss": 1.0323, "step": 1340 }, { "epoch": 1.0914051841746248, "grad_norm": 1.4166513317270177, "learning_rate": 1.4699514595760006e-05, "loss": 1.0343, "step": 1350 }, { "epoch": 1.0994896670203629, "grad_norm": 1.4572773218335806, "learning_rate": 1.4622221120737985e-05, "loss": 1.0449, "step": 1360 }, { "epoch": 1.1075741498661007, "grad_norm": 1.4277575864922984, "learning_rate": 1.4544574862159013e-05, "loss": 1.0157, "step": 1370 }, { "epoch": 1.1156586327118387, "grad_norm": 1.8246683293221693, "learning_rate": 1.446658174624829e-05, "loss": 1.037, "step": 1380 }, { "epoch": 1.1237431155575768, "grad_norm": 1.4515508954548648, "learning_rate": 1.4388247725704338e-05, "loss": 1.0163, "step": 1390 }, { "epoch": 1.1318275984033146, "grad_norm": 1.4472625641065484, "learning_rate": 1.4309578779244678e-05, "loss": 1.0339, "step": 1400 }, { "epoch": 1.1399120812490526, "grad_norm": 1.441284439472294, "learning_rate": 1.423058091114951e-05, "loss": 1.0153, "step": 1410 }, { "epoch": 1.1479965640947905, "grad_norm": 1.4505444065925723, "learning_rate": 1.4151260150803445e-05, "loss": 1.0413, "step": 1420 }, { "epoch": 1.1560810469405285, "grad_norm": 1.5566575848024742, "learning_rate": 1.4071622552235327e-05, "loss": 1.014, "step": 1430 }, { "epoch": 1.1641655297862665, "grad_norm": 1.476527456836737, "learning_rate": 1.399167419365616e-05, "loss": 1.0374, "step": 1440 }, { "epoch": 1.1722500126320043, "grad_norm": 1.7587555981022083, "learning_rate": 1.3911421176995206e-05, "loss": 1.0145, "step": 1450 }, { "epoch": 1.1803344954777424, "grad_norm": 1.5447530212974045, "learning_rate": 1.3830869627434267e-05, "loss": 1.0104, "step": 1460 }, { "epoch": 1.1884189783234804, "grad_norm": 1.368002967716879, "learning_rate": 1.3750025692940174e-05, "loss": 1.0102, "step": 1470 }, { "epoch": 1.1965034611692182, "grad_norm": 1.5132346329088506, "learning_rate": 1.3668895543795581e-05, "loss": 1.0241, "step": 1480 }, { "epoch": 1.2045879440149563, "grad_norm": 1.4535090384504317, "learning_rate": 1.3587485372128e-05, "loss": 1.01, "step": 1490 }, { "epoch": 1.2126724268606943, "grad_norm": 1.6349536867702466, "learning_rate": 1.3505801391437215e-05, "loss": 1.0538, "step": 1500 }, { "epoch": 1.2207569097064321, "grad_norm": 1.608679365926187, "learning_rate": 1.3423849836121043e-05, "loss": 1.0256, "step": 1510 }, { "epoch": 1.2288413925521702, "grad_norm": 1.4875509565909706, "learning_rate": 1.33416369609995e-05, "loss": 1.0365, "step": 1520 }, { "epoch": 1.2369258753979082, "grad_norm": 1.4161399144655036, "learning_rate": 1.325916904083741e-05, "loss": 1.0285, "step": 1530 }, { "epoch": 1.245010358243646, "grad_norm": 1.516547180031239, "learning_rate": 1.3176452369865504e-05, "loss": 0.9972, "step": 1540 }, { "epoch": 1.253094841089384, "grad_norm": 1.4500310981963098, "learning_rate": 1.3093493261300012e-05, "loss": 1.0122, "step": 1550 }, { "epoch": 1.261179323935122, "grad_norm": 1.3787551364346502, "learning_rate": 1.3010298046860821e-05, "loss": 1.0221, "step": 1560 }, { "epoch": 1.26926380678086, "grad_norm": 1.3579456863416077, "learning_rate": 1.2926873076288222e-05, "loss": 1.0213, "step": 1570 }, { "epoch": 1.277348289626598, "grad_norm": 1.4774509503134268, "learning_rate": 1.2843224716858271e-05, "loss": 1.012, "step": 1580 }, { "epoch": 1.285432772472336, "grad_norm": 1.4805342986177266, "learning_rate": 1.2759359352896809e-05, "loss": 1.0193, "step": 1590 }, { "epoch": 1.2935172553180738, "grad_norm": 1.4527468028008124, "learning_rate": 1.2675283385292212e-05, "loss": 1.0431, "step": 1600 }, { "epoch": 1.3016017381638119, "grad_norm": 1.5688075844044822, "learning_rate": 1.259100323100682e-05, "loss": 1.0226, "step": 1610 }, { "epoch": 1.30968622100955, "grad_norm": 1.493324687221304, "learning_rate": 1.2506525322587207e-05, "loss": 0.9966, "step": 1620 }, { "epoch": 1.3177707038552877, "grad_norm": 1.563824009098089, "learning_rate": 1.2421856107673205e-05, "loss": 1.0317, "step": 1630 }, { "epoch": 1.3258551867010258, "grad_norm": 1.4698666764020467, "learning_rate": 1.233700204850581e-05, "loss": 1.0013, "step": 1640 }, { "epoch": 1.3339396695467638, "grad_norm": 1.625463847709757, "learning_rate": 1.2251969621433947e-05, "loss": 1.0233, "step": 1650 }, { "epoch": 1.3420241523925016, "grad_norm": 1.560576858468798, "learning_rate": 1.2166765316420195e-05, "loss": 1.0137, "step": 1660 }, { "epoch": 1.3501086352382397, "grad_norm": 1.6305115869655395, "learning_rate": 1.2081395636545432e-05, "loss": 1.0074, "step": 1670 }, { "epoch": 1.3581931180839777, "grad_norm": 1.683367869903662, "learning_rate": 1.1995867097512504e-05, "loss": 1.0202, "step": 1680 }, { "epoch": 1.3662776009297155, "grad_norm": 1.342629975477622, "learning_rate": 1.191018622714893e-05, "loss": 1.0039, "step": 1690 }, { "epoch": 1.3743620837754535, "grad_norm": 1.4162506108365653, "learning_rate": 1.1824359564908667e-05, "loss": 1.0303, "step": 1700 }, { "epoch": 1.3824465666211916, "grad_norm": 1.4322509952288762, "learning_rate": 1.1738393661373004e-05, "loss": 1.0223, "step": 1710 }, { "epoch": 1.3905310494669294, "grad_norm": 1.4429525488762647, "learning_rate": 1.1652295077750599e-05, "loss": 1.0079, "step": 1720 }, { "epoch": 1.3986155323126674, "grad_norm": 1.5044521870868257, "learning_rate": 1.1566070385376705e-05, "loss": 0.9903, "step": 1730 }, { "epoch": 1.4067000151584053, "grad_norm": 1.4591518605463256, "learning_rate": 1.1479726165211609e-05, "loss": 1.0133, "step": 1740 }, { "epoch": 1.4147844980041433, "grad_norm": 1.38699009818023, "learning_rate": 1.1393269007338375e-05, "loss": 1.0191, "step": 1750 }, { "epoch": 1.4228689808498813, "grad_norm": 1.4248174199771946, "learning_rate": 1.1306705510459852e-05, "loss": 1.0048, "step": 1760 }, { "epoch": 1.4309534636956192, "grad_norm": 1.5368128288739022, "learning_rate": 1.1220042281395042e-05, "loss": 1.0169, "step": 1770 }, { "epoch": 1.4390379465413572, "grad_norm": 1.620365193180215, "learning_rate": 1.1133285934574849e-05, "loss": 0.9982, "step": 1780 }, { "epoch": 1.447122429387095, "grad_norm": 1.4821421519804139, "learning_rate": 1.1046443091537232e-05, "loss": 1.0241, "step": 1790 }, { "epoch": 1.455206912232833, "grad_norm": 1.5012997646705204, "learning_rate": 1.0959520380421831e-05, "loss": 1.0116, "step": 1800 }, { "epoch": 1.463291395078571, "grad_norm": 1.4878335919543981, "learning_rate": 1.0872524435464104e-05, "loss": 0.9993, "step": 1810 }, { "epoch": 1.471375877924309, "grad_norm": 1.3918759318142178, "learning_rate": 1.0785461896488947e-05, "loss": 1.0103, "step": 1820 }, { "epoch": 1.479460360770047, "grad_norm": 1.7724767013914755, "learning_rate": 1.0698339408403944e-05, "loss": 0.9862, "step": 1830 }, { "epoch": 1.487544843615785, "grad_norm": 2.0093844914876717, "learning_rate": 1.06111636206922e-05, "loss": 1.0039, "step": 1840 }, { "epoch": 1.4956293264615228, "grad_norm": 1.4440349729006745, "learning_rate": 1.0523941186904823e-05, "loss": 1.0091, "step": 1850 }, { "epoch": 1.5037138093072608, "grad_norm": 1.5530469064140777, "learning_rate": 1.043667876415311e-05, "loss": 0.9959, "step": 1860 }, { "epoch": 1.5117982921529989, "grad_norm": 1.9710010624543786, "learning_rate": 1.0349383012600448e-05, "loss": 0.9902, "step": 1870 }, { "epoch": 1.5198827749987367, "grad_norm": 1.4874119470603941, "learning_rate": 1.0262060594954e-05, "loss": 0.9889, "step": 1880 }, { "epoch": 1.5279672578444747, "grad_norm": 1.5760932908781828, "learning_rate": 1.0174718175956164e-05, "loss": 0.997, "step": 1890 }, { "epoch": 1.5360517406902128, "grad_norm": 1.5140336706570001, "learning_rate": 1.0087362421875912e-05, "loss": 1.0162, "step": 1900 }, { "epoch": 1.5441362235359506, "grad_norm": 1.4275012742483075, "learning_rate": 1e-05, "loss": 1.0056, "step": 1910 }, { "epoch": 1.5522207063816886, "grad_norm": 1.4479646715349155, "learning_rate": 9.912637578124092e-06, "loss": 0.9831, "step": 1920 }, { "epoch": 1.5603051892274267, "grad_norm": 1.6529106306573094, "learning_rate": 9.825281824043838e-06, "loss": 1.0009, "step": 1930 }, { "epoch": 1.5683896720731645, "grad_norm": 1.4537655155385498, "learning_rate": 9.737939405046002e-06, "loss": 1.0058, "step": 1940 }, { "epoch": 1.5764741549189025, "grad_norm": 1.3881828231981752, "learning_rate": 9.650616987399553e-06, "loss": 0.9752, "step": 1950 }, { "epoch": 1.5845586377646406, "grad_norm": 1.4410127433172688, "learning_rate": 9.563321235846894e-06, "loss": 1.0026, "step": 1960 }, { "epoch": 1.5926431206103784, "grad_norm": 1.6585729752037028, "learning_rate": 9.476058813095182e-06, "loss": 0.9942, "step": 1970 }, { "epoch": 1.6007276034561164, "grad_norm": 1.6572316797520206, "learning_rate": 9.388836379307802e-06, "loss": 0.9968, "step": 1980 }, { "epoch": 1.6088120863018545, "grad_norm": 1.451151024162774, "learning_rate": 9.301660591596059e-06, "loss": 0.9921, "step": 1990 }, { "epoch": 1.6168965691475923, "grad_norm": 1.5042478185497792, "learning_rate": 9.214538103511053e-06, "loss": 0.9959, "step": 2000 }, { "epoch": 1.6249810519933303, "grad_norm": 1.4096442655309245, "learning_rate": 9.127475564535898e-06, "loss": 0.9944, "step": 2010 }, { "epoch": 1.6330655348390684, "grad_norm": 1.3701103693221475, "learning_rate": 9.04047961957817e-06, "loss": 0.9806, "step": 2020 }, { "epoch": 1.6411500176848062, "grad_norm": 1.6771886101217564, "learning_rate": 8.953556908462773e-06, "loss": 0.9986, "step": 2030 }, { "epoch": 1.6492345005305442, "grad_norm": 1.4606744478213272, "learning_rate": 8.866714065425154e-06, "loss": 0.9894, "step": 2040 }, { "epoch": 1.6573189833762823, "grad_norm": 1.5696191298486186, "learning_rate": 8.779957718604956e-06, "loss": 1.0055, "step": 2050 }, { "epoch": 1.66540346622202, "grad_norm": 1.4621439613400917, "learning_rate": 8.693294489540151e-06, "loss": 1.0055, "step": 2060 }, { "epoch": 1.673487949067758, "grad_norm": 1.4224764910826249, "learning_rate": 8.60673099266163e-06, "loss": 0.9687, "step": 2070 }, { "epoch": 1.6815724319134961, "grad_norm": 1.6938323822086323, "learning_rate": 8.520273834788395e-06, "loss": 0.978, "step": 2080 }, { "epoch": 1.689656914759234, "grad_norm": 1.5856717495753165, "learning_rate": 8.4339296146233e-06, "loss": 0.992, "step": 2090 }, { "epoch": 1.697741397604972, "grad_norm": 1.4737528022353619, "learning_rate": 8.3477049222494e-06, "loss": 0.9882, "step": 2100 }, { "epoch": 1.70582588045071, "grad_norm": 1.4413576604331515, "learning_rate": 8.261606338626998e-06, "loss": 0.9717, "step": 2110 }, { "epoch": 1.7139103632964479, "grad_norm": 1.4533604100239785, "learning_rate": 8.17564043509134e-06, "loss": 0.9878, "step": 2120 }, { "epoch": 1.7219948461421857, "grad_norm": 1.4996211527080612, "learning_rate": 8.089813772851073e-06, "loss": 0.9932, "step": 2130 }, { "epoch": 1.730079328987924, "grad_norm": 1.4183735479797297, "learning_rate": 8.004132902487499e-06, "loss": 1.0021, "step": 2140 }, { "epoch": 1.7381638118336618, "grad_norm": 1.4020103234354604, "learning_rate": 7.91860436345457e-06, "loss": 0.9717, "step": 2150 }, { "epoch": 1.7462482946793996, "grad_norm": 1.4529101522297827, "learning_rate": 7.833234683579806e-06, "loss": 0.9844, "step": 2160 }, { "epoch": 1.7543327775251378, "grad_norm": 1.4502465958251158, "learning_rate": 7.748030378566056e-06, "loss": 0.9782, "step": 2170 }, { "epoch": 1.7624172603708756, "grad_norm": 1.4461707858445054, "learning_rate": 7.662997951494193e-06, "loss": 0.9836, "step": 2180 }, { "epoch": 1.7705017432166135, "grad_norm": 1.3966480403360386, "learning_rate": 7.578143892326797e-06, "loss": 1.0089, "step": 2190 }, { "epoch": 1.7785862260623517, "grad_norm": 1.5838575969719086, "learning_rate": 7.493474677412795e-06, "loss": 1.0017, "step": 2200 }, { "epoch": 1.7866707089080895, "grad_norm": 1.6412461821364432, "learning_rate": 7.408996768993184e-06, "loss": 0.9889, "step": 2210 }, { "epoch": 1.7947551917538274, "grad_norm": 1.8686882471940454, "learning_rate": 7.324716614707794e-06, "loss": 0.9814, "step": 2220 }, { "epoch": 1.8028396745995656, "grad_norm": 1.4444454657231485, "learning_rate": 7.240640647103192e-06, "loss": 0.9934, "step": 2230 }, { "epoch": 1.8109241574453034, "grad_norm": 1.5880994051473134, "learning_rate": 7.156775283141733e-06, "loss": 0.9972, "step": 2240 }, { "epoch": 1.8190086402910413, "grad_norm": 1.6179768250952558, "learning_rate": 7.0731269237117775e-06, "loss": 0.9805, "step": 2250 }, { "epoch": 1.8270931231367793, "grad_norm": 1.4161571668846493, "learning_rate": 6.989701953139181e-06, "loss": 0.9695, "step": 2260 }, { "epoch": 1.8351776059825173, "grad_norm": 1.8752619329260358, "learning_rate": 6.906506738699994e-06, "loss": 0.9899, "step": 2270 }, { "epoch": 1.8432620888282552, "grad_norm": 1.8476640791436918, "learning_rate": 6.823547630134497e-06, "loss": 0.9799, "step": 2280 }, { "epoch": 1.8513465716739932, "grad_norm": 1.5003229948984453, "learning_rate": 6.740830959162592e-06, "loss": 0.9948, "step": 2290 }, { "epoch": 1.8594310545197312, "grad_norm": 1.4363919724793655, "learning_rate": 6.658363039000501e-06, "loss": 0.9625, "step": 2300 }, { "epoch": 1.867515537365469, "grad_norm": 1.45857815520064, "learning_rate": 6.57615016387896e-06, "loss": 0.976, "step": 2310 }, { "epoch": 1.875600020211207, "grad_norm": 1.3637017381911254, "learning_rate": 6.4941986085627895e-06, "loss": 0.9608, "step": 2320 }, { "epoch": 1.8836845030569451, "grad_norm": 1.586134857640991, "learning_rate": 6.412514627872003e-06, "loss": 0.9702, "step": 2330 }, { "epoch": 1.891768985902683, "grad_norm": 1.6293874205755696, "learning_rate": 6.331104456204423e-06, "loss": 0.9672, "step": 2340 }, { "epoch": 1.899853468748421, "grad_norm": 1.6185456719315228, "learning_rate": 6.249974307059826e-06, "loss": 0.9683, "step": 2350 }, { "epoch": 1.907937951594159, "grad_norm": 1.5897776438113254, "learning_rate": 6.169130372565737e-06, "loss": 0.9942, "step": 2360 }, { "epoch": 1.9160224344398968, "grad_norm": 1.4621464766459995, "learning_rate": 6.088578823004796e-06, "loss": 0.9552, "step": 2370 }, { "epoch": 1.9241069172856349, "grad_norm": 1.57419066036152, "learning_rate": 6.008325806343842e-06, "loss": 0.9635, "step": 2380 }, { "epoch": 1.932191400131373, "grad_norm": 1.4154240767952921, "learning_rate": 5.9283774477646775e-06, "loss": 0.9661, "step": 2390 }, { "epoch": 1.9402758829771107, "grad_norm": 1.4089774352311322, "learning_rate": 5.848739849196556e-06, "loss": 0.9623, "step": 2400 }, { "epoch": 1.9483603658228488, "grad_norm": 1.4330997113061938, "learning_rate": 5.7694190888504964e-06, "loss": 0.982, "step": 2410 }, { "epoch": 1.9564448486685868, "grad_norm": 1.762833270995275, "learning_rate": 5.690421220755329e-06, "loss": 0.968, "step": 2420 }, { "epoch": 1.9645293315143246, "grad_norm": 1.57370551896378, "learning_rate": 5.611752274295665e-06, "loss": 0.9639, "step": 2430 }, { "epoch": 1.9726138143600627, "grad_norm": 1.4682932578058885, "learning_rate": 5.533418253751714e-06, "loss": 0.9786, "step": 2440 }, { "epoch": 1.9806982972058007, "grad_norm": 1.7633821953728437, "learning_rate": 5.455425137840987e-06, "loss": 0.9618, "step": 2450 }, { "epoch": 1.9887827800515385, "grad_norm": 1.5018261369656176, "learning_rate": 5.377778879262017e-06, "loss": 0.9454, "step": 2460 }, { "epoch": 1.9968672628972766, "grad_norm": 1.5404280086355402, "learning_rate": 5.300485404239999e-06, "loss": 0.9628, "step": 2470 }, { "epoch": 1.999292607750998, "eval_loss": 0.8751075863838196, "eval_runtime": 481.67, "eval_samples_per_second": 25.254, "eval_steps_per_second": 12.627, "step": 2473 }, { "epoch": 2.0049517457430146, "grad_norm": 1.8577507088673693, "learning_rate": 5.223550612074497e-06, "loss": 0.8752, "step": 2480 }, { "epoch": 2.0130362285887524, "grad_norm": 1.5570324756102374, "learning_rate": 5.146980374689192e-06, "loss": 0.8398, "step": 2490 }, { "epoch": 2.0211207114344902, "grad_norm": 1.645225536576169, "learning_rate": 5.070780536183698e-06, "loss": 0.856, "step": 2500 }, { "epoch": 2.0292051942802285, "grad_norm": 1.6698633554870226, "learning_rate": 4.99495691238755e-06, "loss": 0.8365, "step": 2510 }, { "epoch": 2.0372896771259663, "grad_norm": 2.010967933907663, "learning_rate": 4.9195152904162865e-06, "loss": 0.8308, "step": 2520 }, { "epoch": 2.045374159971704, "grad_norm": 1.4592026658551123, "learning_rate": 4.844461428229782e-06, "loss": 0.8387, "step": 2530 }, { "epoch": 2.0534586428174424, "grad_norm": 1.9716723547932462, "learning_rate": 4.769801054192776e-06, "loss": 0.8374, "step": 2540 }, { "epoch": 2.06154312566318, "grad_norm": 1.6334367414667887, "learning_rate": 4.695539866637653e-06, "loss": 0.8587, "step": 2550 }, { "epoch": 2.069627608508918, "grad_norm": 1.713926689166813, "learning_rate": 4.6216835334295385e-06, "loss": 0.8376, "step": 2560 }, { "epoch": 2.0777120913546563, "grad_norm": 1.5714175555320091, "learning_rate": 4.548237691533699e-06, "loss": 0.8346, "step": 2570 }, { "epoch": 2.085796574200394, "grad_norm": 1.4811489223457255, "learning_rate": 4.475207946585328e-06, "loss": 0.8473, "step": 2580 }, { "epoch": 2.093881057046132, "grad_norm": 1.4400201402098334, "learning_rate": 4.402599872461678e-06, "loss": 0.8309, "step": 2590 }, { "epoch": 2.10196553989187, "grad_norm": 1.5527150219002093, "learning_rate": 4.330419010856661e-06, "loss": 0.8312, "step": 2600 }, { "epoch": 2.110050022737608, "grad_norm": 1.4540137626455856, "learning_rate": 4.258670870857894e-06, "loss": 0.8461, "step": 2610 }, { "epoch": 2.118134505583346, "grad_norm": 1.5200526871374724, "learning_rate": 4.187360928526198e-06, "loss": 0.8353, "step": 2620 }, { "epoch": 2.126218988429084, "grad_norm": 1.487656190760893, "learning_rate": 4.116494626477684e-06, "loss": 0.842, "step": 2630 }, { "epoch": 2.134303471274822, "grad_norm": 1.4541876796717628, "learning_rate": 4.046077373468325e-06, "loss": 0.8285, "step": 2640 }, { "epoch": 2.1423879541205597, "grad_norm": 1.515080712913025, "learning_rate": 3.976114543981148e-06, "loss": 0.8278, "step": 2650 }, { "epoch": 2.150472436966298, "grad_norm": 1.5925627792233104, "learning_rate": 3.906611477816054e-06, "loss": 0.8382, "step": 2660 }, { "epoch": 2.158556919812036, "grad_norm": 1.4749306746231339, "learning_rate": 3.837573479682236e-06, "loss": 0.8453, "step": 2670 }, { "epoch": 2.1666414026577736, "grad_norm": 1.888042329530717, "learning_rate": 3.769005818793329e-06, "loss": 0.854, "step": 2680 }, { "epoch": 2.174725885503512, "grad_norm": 1.598037794600047, "learning_rate": 3.7009137284652386e-06, "loss": 0.8519, "step": 2690 }, { "epoch": 2.1828103683492497, "grad_norm": 1.5540837615094885, "learning_rate": 3.633302405716712e-06, "loss": 0.8397, "step": 2700 }, { "epoch": 2.1908948511949875, "grad_norm": 1.430485289060877, "learning_rate": 3.5661770108726914e-06, "loss": 0.8271, "step": 2710 }, { "epoch": 2.1989793340407258, "grad_norm": 2.401835949374892, "learning_rate": 3.4995426671704493e-06, "loss": 0.8335, "step": 2720 }, { "epoch": 2.2070638168864636, "grad_norm": 1.506353292247366, "learning_rate": 3.433404460368587e-06, "loss": 0.828, "step": 2730 }, { "epoch": 2.2151482997322014, "grad_norm": 1.4406717845115946, "learning_rate": 3.3677674383588476e-06, "loss": 0.8315, "step": 2740 }, { "epoch": 2.2232327825779397, "grad_norm": 1.5393945850323205, "learning_rate": 3.302636610780855e-06, "loss": 0.8504, "step": 2750 }, { "epoch": 2.2313172654236775, "grad_norm": 1.7257558230682333, "learning_rate": 3.238016948639772e-06, "loss": 0.8232, "step": 2760 }, { "epoch": 2.2394017482694153, "grad_norm": 1.8326756661400847, "learning_rate": 3.1739133839268698e-06, "loss": 0.8154, "step": 2770 }, { "epoch": 2.2474862311151536, "grad_norm": 1.5269518503128512, "learning_rate": 3.110330809243134e-06, "loss": 0.8317, "step": 2780 }, { "epoch": 2.2555707139608914, "grad_norm": 1.504166909878008, "learning_rate": 3.0472740774258157e-06, "loss": 0.8368, "step": 2790 }, { "epoch": 2.263655196806629, "grad_norm": 1.480047137104623, "learning_rate": 2.9847480011780607e-06, "loss": 0.8409, "step": 2800 }, { "epoch": 2.2717396796523674, "grad_norm": 1.492023552078346, "learning_rate": 2.922757352701595e-06, "loss": 0.8243, "step": 2810 }, { "epoch": 2.2798241624981053, "grad_norm": 1.467055149697424, "learning_rate": 2.861306863332475e-06, "loss": 0.8289, "step": 2820 }, { "epoch": 2.287908645343843, "grad_norm": 1.504514345406056, "learning_rate": 2.8004012231799905e-06, "loss": 0.8375, "step": 2830 }, { "epoch": 2.295993128189581, "grad_norm": 1.5091792435489357, "learning_rate": 2.740045080768694e-06, "loss": 0.8233, "step": 2840 }, { "epoch": 2.304077611035319, "grad_norm": 1.4619080284602382, "learning_rate": 2.6802430426836113e-06, "loss": 0.8356, "step": 2850 }, { "epoch": 2.312162093881057, "grad_norm": 1.4085751552174153, "learning_rate": 2.620999673218656e-06, "loss": 0.8156, "step": 2860 }, { "epoch": 2.3202465767267952, "grad_norm": 1.4755258769825808, "learning_rate": 2.5623194940282526e-06, "loss": 0.8353, "step": 2870 }, { "epoch": 2.328331059572533, "grad_norm": 1.5852343601430656, "learning_rate": 2.504206983782248e-06, "loss": 0.8133, "step": 2880 }, { "epoch": 2.336415542418271, "grad_norm": 1.4903107631764194, "learning_rate": 2.446666577824068e-06, "loss": 0.8459, "step": 2890 }, { "epoch": 2.3445000252640087, "grad_norm": 1.523719484539125, "learning_rate": 2.389702667832202e-06, "loss": 0.8285, "step": 2900 }, { "epoch": 2.352584508109747, "grad_norm": 1.457321496284554, "learning_rate": 2.3333196014850246e-06, "loss": 0.8304, "step": 2910 }, { "epoch": 2.3606689909554848, "grad_norm": 1.537434676857527, "learning_rate": 2.277521682128947e-06, "loss": 0.829, "step": 2920 }, { "epoch": 2.3687534738012226, "grad_norm": 1.4707817420987006, "learning_rate": 2.2223131684499932e-06, "loss": 0.8372, "step": 2930 }, { "epoch": 2.376837956646961, "grad_norm": 1.46749047915079, "learning_rate": 2.1676982741487427e-06, "loss": 0.8222, "step": 2940 }, { "epoch": 2.3849224394926987, "grad_norm": 1.518122852634397, "learning_rate": 2.113681167618736e-06, "loss": 0.8401, "step": 2950 }, { "epoch": 2.3930069223384365, "grad_norm": 1.8575848589445734, "learning_rate": 2.060265971628338e-06, "loss": 0.8339, "step": 2960 }, { "epoch": 2.4010914051841747, "grad_norm": 1.5601145654381285, "learning_rate": 2.0074567630060514e-06, "loss": 0.8154, "step": 2970 }, { "epoch": 2.4091758880299126, "grad_norm": 1.530898387002521, "learning_rate": 1.955257572329379e-06, "loss": 0.823, "step": 2980 }, { "epoch": 2.4172603708756504, "grad_norm": 1.6224545445427798, "learning_rate": 1.9036723836171899e-06, "loss": 0.8145, "step": 2990 }, { "epoch": 2.4253448537213886, "grad_norm": 1.4013679708594033, "learning_rate": 1.8527051340256397e-06, "loss": 0.8215, "step": 3000 }, { "epoch": 2.4334293365671265, "grad_norm": 1.5692785609667004, "learning_rate": 1.8023597135476923e-06, "loss": 0.8241, "step": 3010 }, { "epoch": 2.4415138194128643, "grad_norm": 1.5126974695662643, "learning_rate": 1.752639964716193e-06, "loss": 0.8421, "step": 3020 }, { "epoch": 2.4495983022586025, "grad_norm": 1.6242742569822604, "learning_rate": 1.7035496823106247e-06, "loss": 0.8141, "step": 3030 }, { "epoch": 2.4576827851043404, "grad_norm": 1.4628790110692993, "learning_rate": 1.6550926130674527e-06, "loss": 0.8184, "step": 3040 }, { "epoch": 2.465767267950078, "grad_norm": 1.4807837431822446, "learning_rate": 1.607272455394172e-06, "loss": 0.8202, "step": 3050 }, { "epoch": 2.4738517507958164, "grad_norm": 1.5539937903441552, "learning_rate": 1.5600928590870402e-06, "loss": 0.8391, "step": 3060 }, { "epoch": 2.4819362336415542, "grad_norm": 1.6677495360703212, "learning_rate": 1.5135574250524898e-06, "loss": 0.8436, "step": 3070 }, { "epoch": 2.490020716487292, "grad_norm": 1.53769857798961, "learning_rate": 1.467669705032323e-06, "loss": 0.8263, "step": 3080 }, { "epoch": 2.4981051993330303, "grad_norm": 1.4732928239069325, "learning_rate": 1.422433201332607e-06, "loss": 0.8284, "step": 3090 }, { "epoch": 2.506189682178768, "grad_norm": 1.5928757648188723, "learning_rate": 1.3778513665563786e-06, "loss": 0.8319, "step": 3100 }, { "epoch": 2.514274165024506, "grad_norm": 1.4230928346180836, "learning_rate": 1.3339276033401283e-06, "loss": 0.8052, "step": 3110 }, { "epoch": 2.522358647870244, "grad_norm": 1.4772661299744003, "learning_rate": 1.290665264094093e-06, "loss": 0.8241, "step": 3120 }, { "epoch": 2.530443130715982, "grad_norm": 1.522091825661006, "learning_rate": 1.2480676507464051e-06, "loss": 0.8106, "step": 3130 }, { "epoch": 2.53852761356172, "grad_norm": 1.525599170654266, "learning_rate": 1.2061380144910572e-06, "loss": 0.8166, "step": 3140 }, { "epoch": 2.5466120964074577, "grad_norm": 1.4929327017491605, "learning_rate": 1.1648795555397719e-06, "loss": 0.8251, "step": 3150 }, { "epoch": 2.554696579253196, "grad_norm": 1.5920001415947864, "learning_rate": 1.1242954228777513e-06, "loss": 0.8268, "step": 3160 }, { "epoch": 2.5627810620989337, "grad_norm": 1.5252651359986042, "learning_rate": 1.08438871402333e-06, "loss": 0.831, "step": 3170 }, { "epoch": 2.570865544944672, "grad_norm": 1.6461347768103347, "learning_rate": 1.04516247479157e-06, "loss": 0.8239, "step": 3180 }, { "epoch": 2.57895002779041, "grad_norm": 1.490863354097273, "learning_rate": 1.006619699061785e-06, "loss": 0.823, "step": 3190 }, { "epoch": 2.5870345106361476, "grad_norm": 1.5158841203253022, "learning_rate": 9.687633285490395e-07, "loss": 0.8333, "step": 3200 }, { "epoch": 2.5951189934818855, "grad_norm": 1.4861408651974157, "learning_rate": 9.315962525796374e-07, "loss": 0.8178, "step": 3210 }, { "epoch": 2.6032034763276237, "grad_norm": 1.4847726389856295, "learning_rate": 8.951213078705811e-07, "loss": 0.8244, "step": 3220 }, { "epoch": 2.6112879591733615, "grad_norm": 1.4579228976188288, "learning_rate": 8.593412783130805e-07, "loss": 0.8116, "step": 3230 }, { "epoch": 2.6193724420191, "grad_norm": 1.4309284818257009, "learning_rate": 8.24258894760066e-07, "loss": 0.8233, "step": 3240 }, { "epoch": 2.6274569248648376, "grad_norm": 1.481662266621092, "learning_rate": 7.898768348177643e-07, "loss": 0.8393, "step": 3250 }, { "epoch": 2.6355414077105754, "grad_norm": 1.42582017885812, "learning_rate": 7.561977226413341e-07, "loss": 0.8344, "step": 3260 }, { "epoch": 2.6436258905563133, "grad_norm": 1.4203791210214531, "learning_rate": 7.23224128734582e-07, "loss": 0.821, "step": 3270 }, { "epoch": 2.6517103734020515, "grad_norm": 1.4780417621137758, "learning_rate": 6.909585697537758e-07, "loss": 0.8353, "step": 3280 }, { "epoch": 2.6597948562477893, "grad_norm": 1.4466612391449976, "learning_rate": 6.594035083155581e-07, "loss": 0.8268, "step": 3290 }, { "epoch": 2.6678793390935276, "grad_norm": 1.4584592752103582, "learning_rate": 6.285613528089962e-07, "loss": 0.8164, "step": 3300 }, { "epoch": 2.6759638219392654, "grad_norm": 1.487514724946772, "learning_rate": 5.98434457211765e-07, "loss": 0.8027, "step": 3310 }, { "epoch": 2.6840483047850032, "grad_norm": 1.4294666752405771, "learning_rate": 5.690251209104802e-07, "loss": 0.8105, "step": 3320 }, { "epoch": 2.692132787630741, "grad_norm": 1.4638925402226952, "learning_rate": 5.403355885252104e-07, "loss": 0.8135, "step": 3330 }, { "epoch": 2.7002172704764793, "grad_norm": 1.4458763488108235, "learning_rate": 5.123680497381444e-07, "loss": 0.8102, "step": 3340 }, { "epoch": 2.708301753322217, "grad_norm": 1.4903596037049076, "learning_rate": 4.851246391264819e-07, "loss": 0.8152, "step": 3350 }, { "epoch": 2.7163862361679554, "grad_norm": 1.4429528216246368, "learning_rate": 4.5860743599951186e-07, "loss": 0.8121, "step": 3360 }, { "epoch": 2.724470719013693, "grad_norm": 1.452035259914063, "learning_rate": 4.328184642399036e-07, "loss": 0.821, "step": 3370 }, { "epoch": 2.732555201859431, "grad_norm": 1.5303877229228735, "learning_rate": 4.077596921492533e-07, "loss": 0.8145, "step": 3380 }, { "epoch": 2.740639684705169, "grad_norm": 1.4449405328561624, "learning_rate": 3.834330322978397e-07, "loss": 0.8214, "step": 3390 }, { "epoch": 2.748724167550907, "grad_norm": 1.4371584227135465, "learning_rate": 3.598403413786611e-07, "loss": 0.8131, "step": 3400 }, { "epoch": 2.756808650396645, "grad_norm": 1.4632980675092546, "learning_rate": 3.3698342006572294e-07, "loss": 0.8244, "step": 3410 }, { "epoch": 2.764893133242383, "grad_norm": 1.4500755832832954, "learning_rate": 3.148640128766056e-07, "loss": 0.823, "step": 3420 }, { "epoch": 2.772977616088121, "grad_norm": 1.4751477866660623, "learning_rate": 2.934838080393154e-07, "loss": 0.8211, "step": 3430 }, { "epoch": 2.781062098933859, "grad_norm": 1.4653755137740456, "learning_rate": 2.7284443736343203e-07, "loss": 0.8024, "step": 3440 }, { "epoch": 2.7891465817795966, "grad_norm": 1.4089563044736344, "learning_rate": 2.52947476115567e-07, "loss": 0.8228, "step": 3450 }, { "epoch": 2.797231064625335, "grad_norm": 1.460696621649454, "learning_rate": 2.3379444289913344e-07, "loss": 0.8184, "step": 3460 }, { "epoch": 2.8053155474710727, "grad_norm": 1.4693334824298931, "learning_rate": 2.153867995384351e-07, "loss": 0.8224, "step": 3470 }, { "epoch": 2.8134000303168105, "grad_norm": 1.4469954005038157, "learning_rate": 1.9772595096710477e-07, "loss": 0.8373, "step": 3480 }, { "epoch": 2.821484513162549, "grad_norm": 1.4331150676229163, "learning_rate": 1.8081324512086663e-07, "loss": 0.8185, "step": 3490 }, { "epoch": 2.8295689960082866, "grad_norm": 1.5335384382024873, "learning_rate": 1.6464997283466067e-07, "loss": 0.8124, "step": 3500 }, { "epoch": 2.8376534788540244, "grad_norm": 1.4445147972537609, "learning_rate": 1.492373677441228e-07, "loss": 0.8145, "step": 3510 }, { "epoch": 2.8457379616997627, "grad_norm": 1.4976188260457166, "learning_rate": 1.3457660619142887e-07, "loss": 0.8163, "step": 3520 }, { "epoch": 2.8538224445455005, "grad_norm": 1.439452743377751, "learning_rate": 1.2066880713550888e-07, "loss": 0.829, "step": 3530 }, { "epoch": 2.8619069273912383, "grad_norm": 1.524984754735583, "learning_rate": 1.0751503206665071e-07, "loss": 0.8236, "step": 3540 }, { "epoch": 2.8699914102369766, "grad_norm": 1.448229914768272, "learning_rate": 9.511628492547609e-08, "loss": 0.8223, "step": 3550 }, { "epoch": 2.8780758930827144, "grad_norm": 1.4915344957228824, "learning_rate": 8.347351202632525e-08, "loss": 0.843, "step": 3560 }, { "epoch": 2.886160375928452, "grad_norm": 1.4891660841319714, "learning_rate": 7.258760198502246e-08, "loss": 0.8173, "step": 3570 }, { "epoch": 2.89424485877419, "grad_norm": 1.4485487573496472, "learning_rate": 6.245938565105803e-08, "loss": 0.8299, "step": 3580 }, { "epoch": 2.9023293416199283, "grad_norm": 1.452602418516034, "learning_rate": 5.308963604417572e-08, "loss": 0.8216, "step": 3590 }, { "epoch": 2.910413824465666, "grad_norm": 1.4554407329371093, "learning_rate": 4.447906829537219e-08, "loss": 0.8284, "step": 3600 }, { "epoch": 2.9184983073114044, "grad_norm": 1.4918607001029844, "learning_rate": 3.6628339592313935e-08, "loss": 0.8012, "step": 3610 }, { "epoch": 2.926582790157142, "grad_norm": 1.4229324193215207, "learning_rate": 2.95380491291819e-08, "loss": 0.8401, "step": 3620 }, { "epoch": 2.93466727300288, "grad_norm": 1.4288366788035922, "learning_rate": 2.320873806093804e-08, "loss": 0.8228, "step": 3630 }, { "epoch": 2.942751755848618, "grad_norm": 1.4724134547959333, "learning_rate": 1.764088946201947e-08, "loss": 0.8064, "step": 3640 }, { "epoch": 2.950836238694356, "grad_norm": 1.4984479737935563, "learning_rate": 1.2834928289472415e-08, "loss": 0.81, "step": 3650 }, { "epoch": 2.958920721540094, "grad_norm": 1.4666816312445612, "learning_rate": 8.79122135051591e-09, "loss": 0.822, "step": 3660 }, { "epoch": 2.967005204385832, "grad_norm": 1.445201429621803, "learning_rate": 5.510077274547554e-09, "loss": 0.8271, "step": 3670 }, { "epoch": 2.97508968723157, "grad_norm": 1.4460059967392547, "learning_rate": 2.9917464895856673e-09, "loss": 0.8389, "step": 3680 }, { "epoch": 2.983174170077308, "grad_norm": 1.435390156627942, "learning_rate": 1.2364212031579226e-09, "loss": 0.8294, "step": 3690 }, { "epoch": 2.9912586529230456, "grad_norm": 1.5066669703721747, "learning_rate": 2.442353876297432e-10, "loss": 0.801, "step": 3700 }, { "epoch": 2.997726239199636, "eval_loss": 0.8224219083786011, "eval_runtime": 474.463, "eval_samples_per_second": 25.637, "eval_steps_per_second": 12.819, "step": 3708 }, { "epoch": 2.997726239199636, "step": 3708, "total_flos": 0.0, "train_loss": 1.0273753281164324, "train_runtime": 58675.1239, "train_samples_per_second": 8.095, "train_steps_per_second": 0.063 } ], "logging_steps": 10, "max_steps": 3708, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }