diff --git "a/checkpoint-1695/trainer_state.json" "b/checkpoint-1695/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1695/trainer_state.json" @@ -0,0 +1,11982 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9986731534719153, + "eval_steps": 142, + "global_step": 1695, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 4.680586814880371, + "learning_rate": 1e-05, + "loss": 3.3182, + "step": 1 + }, + { + "epoch": 0.0, + "eval_loss": 3.3362529277801514, + "eval_runtime": 14.4477, + "eval_samples_per_second": 33.016, + "eval_steps_per_second": 8.306, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 4.609802722930908, + "learning_rate": 2e-05, + "loss": 3.2788, + "step": 2 + }, + { + "epoch": 0.01, + "grad_norm": 4.793943405151367, + "learning_rate": 3e-05, + "loss": 3.3432, + "step": 3 + }, + { + "epoch": 0.01, + "grad_norm": 4.687256336212158, + "learning_rate": 4e-05, + "loss": 3.2521, + "step": 4 + }, + { + "epoch": 0.01, + "grad_norm": 4.676945209503174, + "learning_rate": 5e-05, + "loss": 3.1085, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 4.490086078643799, + "learning_rate": 6e-05, + "loss": 2.8093, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 4.042544364929199, + "learning_rate": 7e-05, + "loss": 2.3501, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 3.4973549842834473, + "learning_rate": 8e-05, + "loss": 1.6118, + "step": 8 + }, + { + "epoch": 0.02, + "grad_norm": 2.9255049228668213, + "learning_rate": 9e-05, + "loss": 0.9938, + "step": 9 + }, + { + "epoch": 0.02, + "grad_norm": 1.9458708763122559, + "learning_rate": 0.0001, + "loss": 0.4821, + "step": 10 + }, + { + "epoch": 0.02, + "grad_norm": 1.3317277431488037, + "learning_rate": 9.999991309598974e-05, + "loss": 0.3336, + "step": 11 + }, + { + "epoch": 0.02, + "grad_norm": 0.7918155193328857, + "learning_rate": 9.999965238426104e-05, + "loss": 0.1707, + "step": 12 + }, + { + "epoch": 0.02, + "grad_norm": 0.779201090335846, + "learning_rate": 9.999921786572015e-05, + "loss": 0.1089, + "step": 13 + }, + { + "epoch": 0.02, + "grad_norm": 1.8333582878112793, + "learning_rate": 9.999860954187756e-05, + "loss": 0.1829, + "step": 14 + }, + { + "epoch": 0.03, + "grad_norm": 0.7508969902992249, + "learning_rate": 9.999782741484788e-05, + "loss": 0.1284, + "step": 15 + }, + { + "epoch": 0.03, + "grad_norm": 0.6024438142776489, + "learning_rate": 9.999687148734995e-05, + "loss": 0.1321, + "step": 16 + }, + { + "epoch": 0.03, + "grad_norm": 0.2141278237104416, + "learning_rate": 9.999574176270667e-05, + "loss": 0.1294, + "step": 17 + }, + { + "epoch": 0.03, + "grad_norm": 0.8326414227485657, + "learning_rate": 9.999443824484519e-05, + "loss": 0.1414, + "step": 18 + }, + { + "epoch": 0.03, + "grad_norm": 0.25254812836647034, + "learning_rate": 9.999296093829672e-05, + "loss": 0.1389, + "step": 19 + }, + { + "epoch": 0.04, + "grad_norm": 0.24321849644184113, + "learning_rate": 9.999130984819662e-05, + "loss": 0.1354, + "step": 20 + }, + { + "epoch": 0.04, + "grad_norm": 0.27592459321022034, + "learning_rate": 9.998948498028435e-05, + "loss": 0.1032, + "step": 21 + }, + { + "epoch": 0.04, + "grad_norm": 0.5619893670082092, + "learning_rate": 9.998748634090344e-05, + "loss": 0.1264, + "step": 22 + }, + { + "epoch": 0.04, + "grad_norm": 0.7668437361717224, + "learning_rate": 9.998531393700148e-05, + "loss": 0.1223, + "step": 23 + }, + { + "epoch": 0.04, + "grad_norm": 0.38381776213645935, + "learning_rate": 9.99829677761301e-05, + "loss": 0.0988, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 1.8528074026107788, + "learning_rate": 9.998044786644491e-05, + "loss": 0.1421, + "step": 25 + }, + { + "epoch": 0.05, + "grad_norm": 2.61940336227417, + "learning_rate": 9.997775421670556e-05, + "loss": 0.2738, + "step": 26 + }, + { + "epoch": 0.05, + "grad_norm": 0.5642948150634766, + "learning_rate": 9.997488683627559e-05, + "loss": 0.1113, + "step": 27 + }, + { + "epoch": 0.05, + "grad_norm": 0.31713128089904785, + "learning_rate": 9.997184573512245e-05, + "loss": 0.0593, + "step": 28 + }, + { + "epoch": 0.05, + "grad_norm": 1.540770411491394, + "learning_rate": 9.996863092381752e-05, + "loss": 0.2008, + "step": 29 + }, + { + "epoch": 0.05, + "grad_norm": 0.5343081951141357, + "learning_rate": 9.9965242413536e-05, + "loss": 0.1141, + "step": 30 + }, + { + "epoch": 0.05, + "grad_norm": 0.3453178107738495, + "learning_rate": 9.99616802160569e-05, + "loss": 0.1137, + "step": 31 + }, + { + "epoch": 0.06, + "grad_norm": 0.24307988584041595, + "learning_rate": 9.995794434376297e-05, + "loss": 0.0971, + "step": 32 + }, + { + "epoch": 0.06, + "grad_norm": 0.1228247806429863, + "learning_rate": 9.995403480964072e-05, + "loss": 0.1246, + "step": 33 + }, + { + "epoch": 0.06, + "grad_norm": 0.28919702768325806, + "learning_rate": 9.994995162728029e-05, + "loss": 0.1485, + "step": 34 + }, + { + "epoch": 0.06, + "grad_norm": 0.16266010701656342, + "learning_rate": 9.994569481087552e-05, + "loss": 0.1196, + "step": 35 + }, + { + "epoch": 0.06, + "grad_norm": 0.13942277431488037, + "learning_rate": 9.994126437522375e-05, + "loss": 0.1266, + "step": 36 + }, + { + "epoch": 0.07, + "grad_norm": 0.22137008607387543, + "learning_rate": 9.99366603357259e-05, + "loss": 0.1226, + "step": 37 + }, + { + "epoch": 0.07, + "grad_norm": 0.4418635070323944, + "learning_rate": 9.993188270838635e-05, + "loss": 0.1577, + "step": 38 + }, + { + "epoch": 0.07, + "grad_norm": 0.1568412482738495, + "learning_rate": 9.992693150981292e-05, + "loss": 0.1205, + "step": 39 + }, + { + "epoch": 0.07, + "grad_norm": 0.33117881417274475, + "learning_rate": 9.992180675721672e-05, + "loss": 0.1179, + "step": 40 + }, + { + "epoch": 0.07, + "grad_norm": 0.18135391175746918, + "learning_rate": 9.991650846841226e-05, + "loss": 0.098, + "step": 41 + }, + { + "epoch": 0.07, + "grad_norm": 0.23688584566116333, + "learning_rate": 9.99110366618172e-05, + "loss": 0.0839, + "step": 42 + }, + { + "epoch": 0.08, + "grad_norm": 0.40056225657463074, + "learning_rate": 9.990539135645245e-05, + "loss": 0.1648, + "step": 43 + }, + { + "epoch": 0.08, + "grad_norm": 0.16901437938213348, + "learning_rate": 9.9899572571942e-05, + "loss": 0.052, + "step": 44 + }, + { + "epoch": 0.08, + "grad_norm": 0.6529514193534851, + "learning_rate": 9.989358032851284e-05, + "loss": 0.1448, + "step": 45 + }, + { + "epoch": 0.08, + "grad_norm": 0.25050362944602966, + "learning_rate": 9.9887414646995e-05, + "loss": 0.083, + "step": 46 + }, + { + "epoch": 0.08, + "grad_norm": 0.38230955600738525, + "learning_rate": 9.988107554882138e-05, + "loss": 0.0912, + "step": 47 + }, + { + "epoch": 0.08, + "grad_norm": 0.24738825857639313, + "learning_rate": 9.987456305602769e-05, + "loss": 0.1337, + "step": 48 + }, + { + "epoch": 0.09, + "grad_norm": 0.22692906856536865, + "learning_rate": 9.986787719125241e-05, + "loss": 0.0924, + "step": 49 + }, + { + "epoch": 0.09, + "grad_norm": 0.6348592638969421, + "learning_rate": 9.986101797773667e-05, + "loss": 0.1401, + "step": 50 + }, + { + "epoch": 0.09, + "grad_norm": 0.27844250202178955, + "learning_rate": 9.985398543932421e-05, + "loss": 0.1028, + "step": 51 + }, + { + "epoch": 0.09, + "grad_norm": 0.4353552460670471, + "learning_rate": 9.984677960046123e-05, + "loss": 0.1245, + "step": 52 + }, + { + "epoch": 0.09, + "grad_norm": 0.3049747943878174, + "learning_rate": 9.98394004861964e-05, + "loss": 0.0936, + "step": 53 + }, + { + "epoch": 0.1, + "grad_norm": 0.34531188011169434, + "learning_rate": 9.983184812218072e-05, + "loss": 0.0775, + "step": 54 + }, + { + "epoch": 0.1, + "grad_norm": 0.252056360244751, + "learning_rate": 9.98241225346674e-05, + "loss": 0.1082, + "step": 55 + }, + { + "epoch": 0.1, + "grad_norm": 0.14545601606369019, + "learning_rate": 9.981622375051183e-05, + "loss": 0.0857, + "step": 56 + }, + { + "epoch": 0.1, + "grad_norm": 0.313376784324646, + "learning_rate": 9.980815179717145e-05, + "loss": 0.0798, + "step": 57 + }, + { + "epoch": 0.1, + "grad_norm": 0.3164367079734802, + "learning_rate": 9.979990670270564e-05, + "loss": 0.1103, + "step": 58 + }, + { + "epoch": 0.1, + "grad_norm": 0.30412447452545166, + "learning_rate": 9.979148849577572e-05, + "loss": 0.0889, + "step": 59 + }, + { + "epoch": 0.11, + "grad_norm": 0.2582318186759949, + "learning_rate": 9.978289720564471e-05, + "loss": 0.0844, + "step": 60 + }, + { + "epoch": 0.11, + "grad_norm": 0.2912735939025879, + "learning_rate": 9.977413286217728e-05, + "loss": 0.077, + "step": 61 + }, + { + "epoch": 0.11, + "grad_norm": 0.3888057768344879, + "learning_rate": 9.976519549583974e-05, + "loss": 0.1386, + "step": 62 + }, + { + "epoch": 0.11, + "grad_norm": 0.42121654748916626, + "learning_rate": 9.975608513769976e-05, + "loss": 0.0823, + "step": 63 + }, + { + "epoch": 0.11, + "grad_norm": 0.4052259624004364, + "learning_rate": 9.974680181942645e-05, + "loss": 0.0846, + "step": 64 + }, + { + "epoch": 0.11, + "grad_norm": 0.2273682802915573, + "learning_rate": 9.973734557329009e-05, + "loss": 0.0589, + "step": 65 + }, + { + "epoch": 0.12, + "grad_norm": 0.5051669478416443, + "learning_rate": 9.972771643216212e-05, + "loss": 0.1111, + "step": 66 + }, + { + "epoch": 0.12, + "grad_norm": 0.2672370970249176, + "learning_rate": 9.971791442951497e-05, + "loss": 0.0819, + "step": 67 + }, + { + "epoch": 0.12, + "grad_norm": 0.28557881712913513, + "learning_rate": 9.970793959942198e-05, + "loss": 0.0912, + "step": 68 + }, + { + "epoch": 0.12, + "grad_norm": 0.34148702025413513, + "learning_rate": 9.969779197655726e-05, + "loss": 0.1036, + "step": 69 + }, + { + "epoch": 0.12, + "grad_norm": 0.3550778925418854, + "learning_rate": 9.968747159619556e-05, + "loss": 0.0833, + "step": 70 + }, + { + "epoch": 0.13, + "grad_norm": 0.3434258699417114, + "learning_rate": 9.967697849421221e-05, + "loss": 0.1186, + "step": 71 + }, + { + "epoch": 0.13, + "grad_norm": 0.23545867204666138, + "learning_rate": 9.966631270708287e-05, + "loss": 0.1185, + "step": 72 + }, + { + "epoch": 0.13, + "grad_norm": 0.28094542026519775, + "learning_rate": 9.965547427188357e-05, + "loss": 0.052, + "step": 73 + }, + { + "epoch": 0.13, + "grad_norm": 0.13517600297927856, + "learning_rate": 9.964446322629043e-05, + "loss": 0.0695, + "step": 74 + }, + { + "epoch": 0.13, + "grad_norm": 0.16696467995643616, + "learning_rate": 9.963327960857962e-05, + "loss": 0.1003, + "step": 75 + }, + { + "epoch": 0.13, + "grad_norm": 0.18569788336753845, + "learning_rate": 9.962192345762717e-05, + "loss": 0.0495, + "step": 76 + }, + { + "epoch": 0.14, + "grad_norm": 0.19817449152469635, + "learning_rate": 9.961039481290888e-05, + "loss": 0.067, + "step": 77 + }, + { + "epoch": 0.14, + "grad_norm": 0.14672966301441193, + "learning_rate": 9.959869371450021e-05, + "loss": 0.0737, + "step": 78 + }, + { + "epoch": 0.14, + "grad_norm": 0.302121639251709, + "learning_rate": 9.958682020307601e-05, + "loss": 0.0779, + "step": 79 + }, + { + "epoch": 0.14, + "grad_norm": 0.35970303416252136, + "learning_rate": 9.957477431991054e-05, + "loss": 0.134, + "step": 80 + }, + { + "epoch": 0.14, + "grad_norm": 0.308292955160141, + "learning_rate": 9.956255610687719e-05, + "loss": 0.1006, + "step": 81 + }, + { + "epoch": 0.15, + "grad_norm": 0.27124735713005066, + "learning_rate": 9.955016560644847e-05, + "loss": 0.0572, + "step": 82 + }, + { + "epoch": 0.15, + "grad_norm": 0.1820615977048874, + "learning_rate": 9.953760286169571e-05, + "loss": 0.0595, + "step": 83 + }, + { + "epoch": 0.15, + "grad_norm": 0.36385104060173035, + "learning_rate": 9.952486791628905e-05, + "loss": 0.0874, + "step": 84 + }, + { + "epoch": 0.15, + "grad_norm": 0.848340630531311, + "learning_rate": 9.95119608144972e-05, + "loss": 0.1178, + "step": 85 + }, + { + "epoch": 0.15, + "grad_norm": 0.7947489023208618, + "learning_rate": 9.94988816011873e-05, + "loss": 0.1115, + "step": 86 + }, + { + "epoch": 0.15, + "grad_norm": 0.33932074904441833, + "learning_rate": 9.94856303218248e-05, + "loss": 0.0546, + "step": 87 + }, + { + "epoch": 0.16, + "grad_norm": 0.26873940229415894, + "learning_rate": 9.947220702247329e-05, + "loss": 0.0873, + "step": 88 + }, + { + "epoch": 0.16, + "grad_norm": 0.3373044431209564, + "learning_rate": 9.945861174979429e-05, + "loss": 0.1051, + "step": 89 + }, + { + "epoch": 0.16, + "grad_norm": 0.24391719698905945, + "learning_rate": 9.944484455104717e-05, + "loss": 0.0986, + "step": 90 + }, + { + "epoch": 0.16, + "grad_norm": 0.4301680326461792, + "learning_rate": 9.943090547408888e-05, + "loss": 0.1524, + "step": 91 + }, + { + "epoch": 0.16, + "grad_norm": 0.4246341288089752, + "learning_rate": 9.941679456737394e-05, + "loss": 0.1619, + "step": 92 + }, + { + "epoch": 0.16, + "grad_norm": 0.2680893838405609, + "learning_rate": 9.940251187995411e-05, + "loss": 0.1187, + "step": 93 + }, + { + "epoch": 0.17, + "grad_norm": 0.18920297920703888, + "learning_rate": 9.938805746147827e-05, + "loss": 0.105, + "step": 94 + }, + { + "epoch": 0.17, + "grad_norm": 0.22168701887130737, + "learning_rate": 9.937343136219233e-05, + "loss": 0.0856, + "step": 95 + }, + { + "epoch": 0.17, + "grad_norm": 0.2235199213027954, + "learning_rate": 9.935863363293896e-05, + "loss": 0.1026, + "step": 96 + }, + { + "epoch": 0.17, + "grad_norm": 0.16578496992588043, + "learning_rate": 9.93436643251574e-05, + "loss": 0.0777, + "step": 97 + }, + { + "epoch": 0.17, + "grad_norm": 0.15994016826152802, + "learning_rate": 9.932852349088342e-05, + "loss": 0.0957, + "step": 98 + }, + { + "epoch": 0.18, + "grad_norm": 0.18692170083522797, + "learning_rate": 9.931321118274897e-05, + "loss": 0.0913, + "step": 99 + }, + { + "epoch": 0.18, + "grad_norm": 0.15477485954761505, + "learning_rate": 9.929772745398206e-05, + "loss": 0.0911, + "step": 100 + }, + { + "epoch": 0.18, + "grad_norm": 0.28473320603370667, + "learning_rate": 9.928207235840664e-05, + "loss": 0.1283, + "step": 101 + }, + { + "epoch": 0.18, + "grad_norm": 0.38557159900665283, + "learning_rate": 9.926624595044234e-05, + "loss": 0.1125, + "step": 102 + }, + { + "epoch": 0.18, + "grad_norm": 0.13523289561271667, + "learning_rate": 9.925024828510427e-05, + "loss": 0.0555, + "step": 103 + }, + { + "epoch": 0.18, + "grad_norm": 0.15305563807487488, + "learning_rate": 9.923407941800291e-05, + "loss": 0.1003, + "step": 104 + }, + { + "epoch": 0.19, + "grad_norm": 0.21130621433258057, + "learning_rate": 9.921773940534382e-05, + "loss": 0.0945, + "step": 105 + }, + { + "epoch": 0.19, + "grad_norm": 0.2301904559135437, + "learning_rate": 9.920122830392748e-05, + "loss": 0.1019, + "step": 106 + }, + { + "epoch": 0.19, + "grad_norm": 0.16425654292106628, + "learning_rate": 9.918454617114918e-05, + "loss": 0.0781, + "step": 107 + }, + { + "epoch": 0.19, + "grad_norm": 0.2672991454601288, + "learning_rate": 9.916769306499866e-05, + "loss": 0.085, + "step": 108 + }, + { + "epoch": 0.19, + "grad_norm": 0.3746218681335449, + "learning_rate": 9.915066904406e-05, + "loss": 0.1698, + "step": 109 + }, + { + "epoch": 0.19, + "grad_norm": 0.1691233068704605, + "learning_rate": 9.913347416751148e-05, + "loss": 0.046, + "step": 110 + }, + { + "epoch": 0.2, + "grad_norm": 0.3089153468608856, + "learning_rate": 9.91161084951252e-05, + "loss": 0.131, + "step": 111 + }, + { + "epoch": 0.2, + "grad_norm": 0.1581045240163803, + "learning_rate": 9.909857208726705e-05, + "loss": 0.0654, + "step": 112 + }, + { + "epoch": 0.2, + "grad_norm": 0.2545772194862366, + "learning_rate": 9.908086500489637e-05, + "loss": 0.1021, + "step": 113 + }, + { + "epoch": 0.2, + "grad_norm": 0.2257249355316162, + "learning_rate": 9.906298730956586e-05, + "loss": 0.0636, + "step": 114 + }, + { + "epoch": 0.2, + "grad_norm": 0.17862719297409058, + "learning_rate": 9.904493906342123e-05, + "loss": 0.0942, + "step": 115 + }, + { + "epoch": 0.21, + "grad_norm": 0.23423053324222565, + "learning_rate": 9.902672032920106e-05, + "loss": 0.0676, + "step": 116 + }, + { + "epoch": 0.21, + "grad_norm": 0.26653358340263367, + "learning_rate": 9.900833117023664e-05, + "loss": 0.0918, + "step": 117 + }, + { + "epoch": 0.21, + "grad_norm": 0.4517073631286621, + "learning_rate": 9.89897716504516e-05, + "loss": 0.1102, + "step": 118 + }, + { + "epoch": 0.21, + "grad_norm": 0.20187437534332275, + "learning_rate": 9.897104183436183e-05, + "loss": 0.0713, + "step": 119 + }, + { + "epoch": 0.21, + "grad_norm": 0.5759711861610413, + "learning_rate": 9.895214178707516e-05, + "loss": 0.0837, + "step": 120 + }, + { + "epoch": 0.21, + "grad_norm": 0.43704915046691895, + "learning_rate": 9.89330715742912e-05, + "loss": 0.0868, + "step": 121 + }, + { + "epoch": 0.22, + "grad_norm": 0.30784374475479126, + "learning_rate": 9.891383126230104e-05, + "loss": 0.1171, + "step": 122 + }, + { + "epoch": 0.22, + "grad_norm": 0.23538921773433685, + "learning_rate": 9.889442091798712e-05, + "loss": 0.055, + "step": 123 + }, + { + "epoch": 0.22, + "grad_norm": 0.27727362513542175, + "learning_rate": 9.887484060882291e-05, + "loss": 0.041, + "step": 124 + }, + { + "epoch": 0.22, + "grad_norm": 0.21666617691516876, + "learning_rate": 9.885509040287268e-05, + "loss": 0.0624, + "step": 125 + }, + { + "epoch": 0.22, + "grad_norm": 0.2829636335372925, + "learning_rate": 9.883517036879132e-05, + "loss": 0.0946, + "step": 126 + }, + { + "epoch": 0.22, + "grad_norm": 0.34035512804985046, + "learning_rate": 9.88150805758241e-05, + "loss": 0.0635, + "step": 127 + }, + { + "epoch": 0.23, + "grad_norm": 0.44064444303512573, + "learning_rate": 9.879482109380634e-05, + "loss": 0.0931, + "step": 128 + }, + { + "epoch": 0.23, + "grad_norm": 0.5164741277694702, + "learning_rate": 9.877439199316323e-05, + "loss": 0.0891, + "step": 129 + }, + { + "epoch": 0.23, + "grad_norm": 0.5549228191375732, + "learning_rate": 9.875379334490962e-05, + "loss": 0.1144, + "step": 130 + }, + { + "epoch": 0.23, + "grad_norm": 0.41133901476860046, + "learning_rate": 9.873302522064972e-05, + "loss": 0.1022, + "step": 131 + }, + { + "epoch": 0.23, + "grad_norm": 0.5204330682754517, + "learning_rate": 9.871208769257685e-05, + "loss": 0.0867, + "step": 132 + }, + { + "epoch": 0.24, + "grad_norm": 0.3383274972438812, + "learning_rate": 9.869098083347323e-05, + "loss": 0.0558, + "step": 133 + }, + { + "epoch": 0.24, + "grad_norm": 0.7084139585494995, + "learning_rate": 9.866970471670967e-05, + "loss": 0.1208, + "step": 134 + }, + { + "epoch": 0.24, + "grad_norm": 0.2974587380886078, + "learning_rate": 9.864825941624537e-05, + "loss": 0.1199, + "step": 135 + }, + { + "epoch": 0.24, + "grad_norm": 0.1811504065990448, + "learning_rate": 9.862664500662764e-05, + "loss": 0.1025, + "step": 136 + }, + { + "epoch": 0.24, + "grad_norm": 0.23748300969600677, + "learning_rate": 9.860486156299164e-05, + "loss": 0.0864, + "step": 137 + }, + { + "epoch": 0.24, + "grad_norm": 0.21784676611423492, + "learning_rate": 9.85829091610601e-05, + "loss": 0.095, + "step": 138 + }, + { + "epoch": 0.25, + "grad_norm": 0.26979225873947144, + "learning_rate": 9.856078787714309e-05, + "loss": 0.0864, + "step": 139 + }, + { + "epoch": 0.25, + "grad_norm": 0.1479984074831009, + "learning_rate": 9.853849778813777e-05, + "loss": 0.0904, + "step": 140 + }, + { + "epoch": 0.25, + "grad_norm": 0.17924343049526215, + "learning_rate": 9.851603897152803e-05, + "loss": 0.0752, + "step": 141 + }, + { + "epoch": 0.25, + "grad_norm": 0.16448016464710236, + "learning_rate": 9.849341150538434e-05, + "loss": 0.0781, + "step": 142 + }, + { + "epoch": 0.25, + "eval_loss": 0.0848281979560852, + "eval_runtime": 14.6961, + "eval_samples_per_second": 32.458, + "eval_steps_per_second": 8.165, + "step": 142 + }, + { + "epoch": 0.25, + "grad_norm": 0.14405055344104767, + "learning_rate": 9.847061546836339e-05, + "loss": 0.1007, + "step": 143 + }, + { + "epoch": 0.25, + "grad_norm": 0.20907168090343475, + "learning_rate": 9.844765093970787e-05, + "loss": 0.1126, + "step": 144 + }, + { + "epoch": 0.26, + "grad_norm": 0.1777975857257843, + "learning_rate": 9.842451799924616e-05, + "loss": 0.0928, + "step": 145 + }, + { + "epoch": 0.26, + "grad_norm": 0.1817995309829712, + "learning_rate": 9.840121672739208e-05, + "loss": 0.046, + "step": 146 + }, + { + "epoch": 0.26, + "grad_norm": 0.2099136859178543, + "learning_rate": 9.837774720514457e-05, + "loss": 0.1032, + "step": 147 + }, + { + "epoch": 0.26, + "grad_norm": 0.19467169046401978, + "learning_rate": 9.835410951408748e-05, + "loss": 0.0913, + "step": 148 + }, + { + "epoch": 0.26, + "grad_norm": 0.24700500071048737, + "learning_rate": 9.833030373638919e-05, + "loss": 0.1101, + "step": 149 + }, + { + "epoch": 0.27, + "grad_norm": 0.22854459285736084, + "learning_rate": 9.830632995480242e-05, + "loss": 0.0729, + "step": 150 + }, + { + "epoch": 0.27, + "grad_norm": 0.206742063164711, + "learning_rate": 9.828218825266388e-05, + "loss": 0.0861, + "step": 151 + }, + { + "epoch": 0.27, + "grad_norm": 0.40378740429878235, + "learning_rate": 9.8257878713894e-05, + "loss": 0.0948, + "step": 152 + }, + { + "epoch": 0.27, + "grad_norm": 0.5055291652679443, + "learning_rate": 9.823340142299662e-05, + "loss": 0.193, + "step": 153 + }, + { + "epoch": 0.27, + "grad_norm": 0.3036790192127228, + "learning_rate": 9.820875646505874e-05, + "loss": 0.0859, + "step": 154 + }, + { + "epoch": 0.27, + "grad_norm": 0.1878231018781662, + "learning_rate": 9.818394392575019e-05, + "loss": 0.0702, + "step": 155 + }, + { + "epoch": 0.28, + "grad_norm": 0.17990007996559143, + "learning_rate": 9.815896389132333e-05, + "loss": 0.0967, + "step": 156 + }, + { + "epoch": 0.28, + "grad_norm": 0.22680750489234924, + "learning_rate": 9.813381644861277e-05, + "loss": 0.0959, + "step": 157 + }, + { + "epoch": 0.28, + "grad_norm": 0.299663782119751, + "learning_rate": 9.810850168503506e-05, + "loss": 0.0801, + "step": 158 + }, + { + "epoch": 0.28, + "grad_norm": 0.3132835924625397, + "learning_rate": 9.808301968858837e-05, + "loss": 0.1151, + "step": 159 + }, + { + "epoch": 0.28, + "grad_norm": 0.16891297698020935, + "learning_rate": 9.805737054785222e-05, + "loss": 0.0799, + "step": 160 + }, + { + "epoch": 0.28, + "grad_norm": 0.23542983829975128, + "learning_rate": 9.803155435198712e-05, + "loss": 0.0645, + "step": 161 + }, + { + "epoch": 0.29, + "grad_norm": 0.1784803569316864, + "learning_rate": 9.800557119073433e-05, + "loss": 0.0475, + "step": 162 + }, + { + "epoch": 0.29, + "grad_norm": 0.37001606822013855, + "learning_rate": 9.797942115441545e-05, + "loss": 0.1331, + "step": 163 + }, + { + "epoch": 0.29, + "grad_norm": 0.18926851451396942, + "learning_rate": 9.795310433393226e-05, + "loss": 0.0744, + "step": 164 + }, + { + "epoch": 0.29, + "grad_norm": 0.15572589635849, + "learning_rate": 9.792662082076618e-05, + "loss": 0.0551, + "step": 165 + }, + { + "epoch": 0.29, + "grad_norm": 0.2562514841556549, + "learning_rate": 9.789997070697821e-05, + "loss": 0.106, + "step": 166 + }, + { + "epoch": 0.3, + "grad_norm": 0.4756919741630554, + "learning_rate": 9.787315408520838e-05, + "loss": 0.1229, + "step": 167 + }, + { + "epoch": 0.3, + "grad_norm": 0.2322833091020584, + "learning_rate": 9.78461710486756e-05, + "loss": 0.1212, + "step": 168 + }, + { + "epoch": 0.3, + "grad_norm": 0.28180065751075745, + "learning_rate": 9.78190216911772e-05, + "loss": 0.0855, + "step": 169 + }, + { + "epoch": 0.3, + "grad_norm": 0.2058788239955902, + "learning_rate": 9.779170610708872e-05, + "loss": 0.0445, + "step": 170 + }, + { + "epoch": 0.3, + "grad_norm": 0.16885802149772644, + "learning_rate": 9.776422439136352e-05, + "loss": 0.0619, + "step": 171 + }, + { + "epoch": 0.3, + "grad_norm": 0.2562006413936615, + "learning_rate": 9.773657663953243e-05, + "loss": 0.1111, + "step": 172 + }, + { + "epoch": 0.31, + "grad_norm": 0.2394249439239502, + "learning_rate": 9.770876294770349e-05, + "loss": 0.0562, + "step": 173 + }, + { + "epoch": 0.31, + "grad_norm": 0.214800164103508, + "learning_rate": 9.768078341256155e-05, + "loss": 0.0428, + "step": 174 + }, + { + "epoch": 0.31, + "grad_norm": 0.30056923627853394, + "learning_rate": 9.765263813136796e-05, + "loss": 0.1173, + "step": 175 + }, + { + "epoch": 0.31, + "grad_norm": 0.3174525499343872, + "learning_rate": 9.762432720196024e-05, + "loss": 0.0871, + "step": 176 + }, + { + "epoch": 0.31, + "grad_norm": 0.31560906767845154, + "learning_rate": 9.75958507227517e-05, + "loss": 0.1133, + "step": 177 + }, + { + "epoch": 0.31, + "grad_norm": 0.21752357482910156, + "learning_rate": 9.756720879273117e-05, + "loss": 0.0421, + "step": 178 + }, + { + "epoch": 0.32, + "grad_norm": 0.3215218484401703, + "learning_rate": 9.753840151146259e-05, + "loss": 0.0596, + "step": 179 + }, + { + "epoch": 0.32, + "grad_norm": 0.3161137104034424, + "learning_rate": 9.750942897908468e-05, + "loss": 0.122, + "step": 180 + }, + { + "epoch": 0.32, + "grad_norm": 0.4206744134426117, + "learning_rate": 9.748029129631062e-05, + "loss": 0.0966, + "step": 181 + }, + { + "epoch": 0.32, + "grad_norm": 0.28242579102516174, + "learning_rate": 9.745098856442768e-05, + "loss": 0.0853, + "step": 182 + }, + { + "epoch": 0.32, + "grad_norm": 0.24647079408168793, + "learning_rate": 9.742152088529684e-05, + "loss": 0.1077, + "step": 183 + }, + { + "epoch": 0.33, + "grad_norm": 0.29940977692604065, + "learning_rate": 9.739188836135247e-05, + "loss": 0.0837, + "step": 184 + }, + { + "epoch": 0.33, + "grad_norm": 0.21811984479427338, + "learning_rate": 9.7362091095602e-05, + "loss": 0.1, + "step": 185 + }, + { + "epoch": 0.33, + "grad_norm": 0.24434742331504822, + "learning_rate": 9.733212919162549e-05, + "loss": 0.0839, + "step": 186 + }, + { + "epoch": 0.33, + "grad_norm": 0.1656690537929535, + "learning_rate": 9.730200275357535e-05, + "loss": 0.0894, + "step": 187 + }, + { + "epoch": 0.33, + "grad_norm": 0.16984042525291443, + "learning_rate": 9.727171188617587e-05, + "loss": 0.0732, + "step": 188 + }, + { + "epoch": 0.33, + "grad_norm": 0.19889003038406372, + "learning_rate": 9.7241256694723e-05, + "loss": 0.0832, + "step": 189 + }, + { + "epoch": 0.34, + "grad_norm": 0.1639273464679718, + "learning_rate": 9.721063728508383e-05, + "loss": 0.0912, + "step": 190 + }, + { + "epoch": 0.34, + "grad_norm": 0.26211172342300415, + "learning_rate": 9.717985376369639e-05, + "loss": 0.0986, + "step": 191 + }, + { + "epoch": 0.34, + "grad_norm": 0.12403538823127747, + "learning_rate": 9.714890623756912e-05, + "loss": 0.0844, + "step": 192 + }, + { + "epoch": 0.34, + "grad_norm": 0.1882586032152176, + "learning_rate": 9.711779481428057e-05, + "loss": 0.1163, + "step": 193 + }, + { + "epoch": 0.34, + "grad_norm": 0.1780715435743332, + "learning_rate": 9.708651960197904e-05, + "loss": 0.1038, + "step": 194 + }, + { + "epoch": 0.34, + "grad_norm": 0.1291002333164215, + "learning_rate": 9.705508070938218e-05, + "loss": 0.0746, + "step": 195 + }, + { + "epoch": 0.35, + "grad_norm": 0.2384466975927353, + "learning_rate": 9.702347824577666e-05, + "loss": 0.0909, + "step": 196 + }, + { + "epoch": 0.35, + "grad_norm": 0.25463247299194336, + "learning_rate": 9.699171232101768e-05, + "loss": 0.0977, + "step": 197 + }, + { + "epoch": 0.35, + "grad_norm": 0.19303986430168152, + "learning_rate": 9.69597830455287e-05, + "loss": 0.1137, + "step": 198 + }, + { + "epoch": 0.35, + "grad_norm": 0.21899022161960602, + "learning_rate": 9.692769053030099e-05, + "loss": 0.0671, + "step": 199 + }, + { + "epoch": 0.35, + "grad_norm": 0.16923530399799347, + "learning_rate": 9.689543488689332e-05, + "loss": 0.0776, + "step": 200 + }, + { + "epoch": 0.36, + "grad_norm": 0.260955810546875, + "learning_rate": 9.686301622743144e-05, + "loss": 0.092, + "step": 201 + }, + { + "epoch": 0.36, + "grad_norm": 0.1771455854177475, + "learning_rate": 9.683043466460782e-05, + "loss": 0.06, + "step": 202 + }, + { + "epoch": 0.36, + "grad_norm": 0.18851810693740845, + "learning_rate": 9.67976903116812e-05, + "loss": 0.0965, + "step": 203 + }, + { + "epoch": 0.36, + "grad_norm": 0.589522659778595, + "learning_rate": 9.676478328247622e-05, + "loss": 0.1673, + "step": 204 + }, + { + "epoch": 0.36, + "grad_norm": 0.21747058629989624, + "learning_rate": 9.673171369138296e-05, + "loss": 0.0997, + "step": 205 + }, + { + "epoch": 0.36, + "grad_norm": 0.4168107807636261, + "learning_rate": 9.669848165335666e-05, + "loss": 0.0795, + "step": 206 + }, + { + "epoch": 0.37, + "grad_norm": 0.6520416140556335, + "learning_rate": 9.666508728391719e-05, + "loss": 0.1177, + "step": 207 + }, + { + "epoch": 0.37, + "grad_norm": 0.3752453029155731, + "learning_rate": 9.663153069914875e-05, + "loss": 0.0871, + "step": 208 + }, + { + "epoch": 0.37, + "grad_norm": 0.26546868681907654, + "learning_rate": 9.65978120156994e-05, + "loss": 0.0647, + "step": 209 + }, + { + "epoch": 0.37, + "grad_norm": 0.20044176280498505, + "learning_rate": 9.656393135078068e-05, + "loss": 0.1072, + "step": 210 + }, + { + "epoch": 0.37, + "grad_norm": 0.25033503770828247, + "learning_rate": 9.652988882216724e-05, + "loss": 0.1326, + "step": 211 + }, + { + "epoch": 0.38, + "grad_norm": 0.24569682776927948, + "learning_rate": 9.649568454819637e-05, + "loss": 0.0931, + "step": 212 + }, + { + "epoch": 0.38, + "grad_norm": 0.5623157620429993, + "learning_rate": 9.64613186477676e-05, + "loss": 0.2157, + "step": 213 + }, + { + "epoch": 0.38, + "grad_norm": 0.15052182972431183, + "learning_rate": 9.642679124034233e-05, + "loss": 0.1236, + "step": 214 + }, + { + "epoch": 0.38, + "grad_norm": 0.14209671318531036, + "learning_rate": 9.639210244594334e-05, + "loss": 0.0971, + "step": 215 + }, + { + "epoch": 0.38, + "grad_norm": 0.1627768725156784, + "learning_rate": 9.635725238515445e-05, + "loss": 0.1161, + "step": 216 + }, + { + "epoch": 0.38, + "grad_norm": 0.10190293937921524, + "learning_rate": 9.63222411791201e-05, + "loss": 0.0999, + "step": 217 + }, + { + "epoch": 0.39, + "grad_norm": 0.11575043201446533, + "learning_rate": 9.62870689495448e-05, + "loss": 0.0986, + "step": 218 + }, + { + "epoch": 0.39, + "grad_norm": 0.09842410683631897, + "learning_rate": 9.62517358186929e-05, + "loss": 0.1176, + "step": 219 + }, + { + "epoch": 0.39, + "grad_norm": 0.14816004037857056, + "learning_rate": 9.621624190938803e-05, + "loss": 0.0833, + "step": 220 + }, + { + "epoch": 0.39, + "grad_norm": 0.11311839520931244, + "learning_rate": 9.618058734501269e-05, + "loss": 0.0815, + "step": 221 + }, + { + "epoch": 0.39, + "grad_norm": 0.17481163144111633, + "learning_rate": 9.614477224950789e-05, + "loss": 0.0678, + "step": 222 + }, + { + "epoch": 0.39, + "grad_norm": 0.2277013659477234, + "learning_rate": 9.610879674737264e-05, + "loss": 0.0941, + "step": 223 + }, + { + "epoch": 0.4, + "grad_norm": 0.14689870178699493, + "learning_rate": 9.607266096366352e-05, + "loss": 0.0991, + "step": 224 + }, + { + "epoch": 0.4, + "grad_norm": 0.24558769166469574, + "learning_rate": 9.603636502399436e-05, + "loss": 0.0878, + "step": 225 + }, + { + "epoch": 0.4, + "grad_norm": 0.1541660875082016, + "learning_rate": 9.599990905453567e-05, + "loss": 0.0784, + "step": 226 + }, + { + "epoch": 0.4, + "grad_norm": 0.12188339233398438, + "learning_rate": 9.59632931820142e-05, + "loss": 0.0464, + "step": 227 + }, + { + "epoch": 0.4, + "grad_norm": 0.32710394263267517, + "learning_rate": 9.592651753371265e-05, + "loss": 0.0541, + "step": 228 + }, + { + "epoch": 0.41, + "grad_norm": 0.3118465840816498, + "learning_rate": 9.588958223746903e-05, + "loss": 0.0845, + "step": 229 + }, + { + "epoch": 0.41, + "grad_norm": 0.26805219054222107, + "learning_rate": 9.585248742167639e-05, + "loss": 0.0485, + "step": 230 + }, + { + "epoch": 0.41, + "grad_norm": 0.7972936630249023, + "learning_rate": 9.581523321528223e-05, + "loss": 0.1013, + "step": 231 + }, + { + "epoch": 0.41, + "grad_norm": 0.6285438537597656, + "learning_rate": 9.577781974778817e-05, + "loss": 0.0767, + "step": 232 + }, + { + "epoch": 0.41, + "grad_norm": 0.6384493708610535, + "learning_rate": 9.57402471492494e-05, + "loss": 0.1855, + "step": 233 + }, + { + "epoch": 0.41, + "grad_norm": 0.5759001970291138, + "learning_rate": 9.570251555027432e-05, + "loss": 0.1585, + "step": 234 + }, + { + "epoch": 0.42, + "grad_norm": 0.42002353072166443, + "learning_rate": 9.566462508202402e-05, + "loss": 0.1479, + "step": 235 + }, + { + "epoch": 0.42, + "grad_norm": 0.18405884504318237, + "learning_rate": 9.562657587621184e-05, + "loss": 0.09, + "step": 236 + }, + { + "epoch": 0.42, + "grad_norm": 0.20893922448158264, + "learning_rate": 9.558836806510291e-05, + "loss": 0.0685, + "step": 237 + }, + { + "epoch": 0.42, + "grad_norm": 0.31388092041015625, + "learning_rate": 9.555000178151374e-05, + "loss": 0.0983, + "step": 238 + }, + { + "epoch": 0.42, + "grad_norm": 0.20344533026218414, + "learning_rate": 9.551147715881166e-05, + "loss": 0.0944, + "step": 239 + }, + { + "epoch": 0.42, + "grad_norm": 0.1582648903131485, + "learning_rate": 9.547279433091446e-05, + "loss": 0.0662, + "step": 240 + }, + { + "epoch": 0.43, + "grad_norm": 0.16737405955791473, + "learning_rate": 9.543395343228983e-05, + "loss": 0.1565, + "step": 241 + }, + { + "epoch": 0.43, + "grad_norm": 0.21974924206733704, + "learning_rate": 9.539495459795499e-05, + "loss": 0.1243, + "step": 242 + }, + { + "epoch": 0.43, + "grad_norm": 0.1147058829665184, + "learning_rate": 9.535579796347612e-05, + "loss": 0.0727, + "step": 243 + }, + { + "epoch": 0.43, + "grad_norm": 0.13460345566272736, + "learning_rate": 9.531648366496799e-05, + "loss": 0.0691, + "step": 244 + }, + { + "epoch": 0.43, + "grad_norm": 0.1404263824224472, + "learning_rate": 9.527701183909336e-05, + "loss": 0.0975, + "step": 245 + }, + { + "epoch": 0.44, + "grad_norm": 0.17380090057849884, + "learning_rate": 9.523738262306269e-05, + "loss": 0.0873, + "step": 246 + }, + { + "epoch": 0.44, + "grad_norm": 0.13862797617912292, + "learning_rate": 9.519759615463346e-05, + "loss": 0.0738, + "step": 247 + }, + { + "epoch": 0.44, + "grad_norm": 0.17551685869693756, + "learning_rate": 9.51576525721098e-05, + "loss": 0.0676, + "step": 248 + }, + { + "epoch": 0.44, + "grad_norm": 0.20715269446372986, + "learning_rate": 9.511755201434205e-05, + "loss": 0.0737, + "step": 249 + }, + { + "epoch": 0.44, + "grad_norm": 0.14763356745243073, + "learning_rate": 9.507729462072614e-05, + "loss": 0.07, + "step": 250 + }, + { + "epoch": 0.44, + "grad_norm": 0.317452073097229, + "learning_rate": 9.503688053120327e-05, + "loss": 0.1252, + "step": 251 + }, + { + "epoch": 0.45, + "grad_norm": 0.21908459067344666, + "learning_rate": 9.499630988625925e-05, + "loss": 0.0877, + "step": 252 + }, + { + "epoch": 0.45, + "grad_norm": 0.3233601450920105, + "learning_rate": 9.49555828269242e-05, + "loss": 0.0891, + "step": 253 + }, + { + "epoch": 0.45, + "grad_norm": 0.4098372161388397, + "learning_rate": 9.491469949477187e-05, + "loss": 0.0805, + "step": 254 + }, + { + "epoch": 0.45, + "grad_norm": 0.40573808550834656, + "learning_rate": 9.487366003191931e-05, + "loss": 0.1284, + "step": 255 + }, + { + "epoch": 0.45, + "grad_norm": 0.3391616940498352, + "learning_rate": 9.483246458102625e-05, + "loss": 0.0901, + "step": 256 + }, + { + "epoch": 0.45, + "grad_norm": 0.1822938323020935, + "learning_rate": 9.479111328529473e-05, + "loss": 0.0398, + "step": 257 + }, + { + "epoch": 0.46, + "grad_norm": 0.4700302183628082, + "learning_rate": 9.474960628846843e-05, + "loss": 0.1509, + "step": 258 + }, + { + "epoch": 0.46, + "grad_norm": 0.20210890471935272, + "learning_rate": 9.470794373483236e-05, + "loss": 0.0765, + "step": 259 + }, + { + "epoch": 0.46, + "grad_norm": 0.28329914808273315, + "learning_rate": 9.466612576921223e-05, + "loss": 0.0666, + "step": 260 + }, + { + "epoch": 0.46, + "grad_norm": 0.41083166003227234, + "learning_rate": 9.462415253697401e-05, + "loss": 0.1248, + "step": 261 + }, + { + "epoch": 0.46, + "grad_norm": 0.17644570767879486, + "learning_rate": 9.458202418402338e-05, + "loss": 0.0532, + "step": 262 + }, + { + "epoch": 0.47, + "grad_norm": 0.2667219936847687, + "learning_rate": 9.453974085680526e-05, + "loss": 0.0937, + "step": 263 + }, + { + "epoch": 0.47, + "grad_norm": 0.20900332927703857, + "learning_rate": 9.449730270230326e-05, + "loss": 0.0853, + "step": 264 + }, + { + "epoch": 0.47, + "grad_norm": 0.26425743103027344, + "learning_rate": 9.445470986803922e-05, + "loss": 0.12, + "step": 265 + }, + { + "epoch": 0.47, + "grad_norm": 0.1956167221069336, + "learning_rate": 9.441196250207267e-05, + "loss": 0.0965, + "step": 266 + }, + { + "epoch": 0.47, + "grad_norm": 0.21896903216838837, + "learning_rate": 9.436906075300032e-05, + "loss": 0.0867, + "step": 267 + }, + { + "epoch": 0.47, + "grad_norm": 0.2082919031381607, + "learning_rate": 9.432600476995551e-05, + "loss": 0.0847, + "step": 268 + }, + { + "epoch": 0.48, + "grad_norm": 0.1674569696187973, + "learning_rate": 9.428279470260776e-05, + "loss": 0.0846, + "step": 269 + }, + { + "epoch": 0.48, + "grad_norm": 0.23109744489192963, + "learning_rate": 9.423943070116218e-05, + "loss": 0.136, + "step": 270 + }, + { + "epoch": 0.48, + "grad_norm": 0.21344415843486786, + "learning_rate": 9.4195912916359e-05, + "loss": 0.1091, + "step": 271 + }, + { + "epoch": 0.48, + "grad_norm": 0.16391590237617493, + "learning_rate": 9.415224149947306e-05, + "loss": 0.0901, + "step": 272 + }, + { + "epoch": 0.48, + "grad_norm": 0.2023243010044098, + "learning_rate": 9.410841660231315e-05, + "loss": 0.0635, + "step": 273 + }, + { + "epoch": 0.48, + "grad_norm": 0.1723608821630478, + "learning_rate": 9.406443837722168e-05, + "loss": 0.1001, + "step": 274 + }, + { + "epoch": 0.49, + "grad_norm": 0.1470147669315338, + "learning_rate": 9.402030697707398e-05, + "loss": 0.0721, + "step": 275 + }, + { + "epoch": 0.49, + "grad_norm": 0.15082985162734985, + "learning_rate": 9.397602255527791e-05, + "loss": 0.0698, + "step": 276 + }, + { + "epoch": 0.49, + "grad_norm": 0.16322006285190582, + "learning_rate": 9.393158526577323e-05, + "loss": 0.0809, + "step": 277 + }, + { + "epoch": 0.49, + "grad_norm": 0.10098633915185928, + "learning_rate": 9.388699526303105e-05, + "loss": 0.0386, + "step": 278 + }, + { + "epoch": 0.49, + "grad_norm": 0.19049708545207977, + "learning_rate": 9.38422527020534e-05, + "loss": 0.0559, + "step": 279 + }, + { + "epoch": 0.5, + "grad_norm": 0.22742775082588196, + "learning_rate": 9.37973577383726e-05, + "loss": 0.0802, + "step": 280 + }, + { + "epoch": 0.5, + "grad_norm": 0.2055177539587021, + "learning_rate": 9.375231052805072e-05, + "loss": 0.1048, + "step": 281 + }, + { + "epoch": 0.5, + "grad_norm": 0.1366245150566101, + "learning_rate": 9.370711122767913e-05, + "loss": 0.0204, + "step": 282 + }, + { + "epoch": 0.5, + "grad_norm": 0.3235447406768799, + "learning_rate": 9.36617599943778e-05, + "loss": 0.0974, + "step": 283 + }, + { + "epoch": 0.5, + "grad_norm": 0.09579204767942429, + "learning_rate": 9.361625698579493e-05, + "loss": 0.0151, + "step": 284 + }, + { + "epoch": 0.5, + "eval_loss": 0.07987037301063538, + "eval_runtime": 14.6437, + "eval_samples_per_second": 32.574, + "eval_steps_per_second": 8.195, + "step": 284 + }, + { + "epoch": 0.5, + "grad_norm": 0.22850771248340607, + "learning_rate": 9.357060236010625e-05, + "loss": 0.0458, + "step": 285 + }, + { + "epoch": 0.51, + "grad_norm": 0.4980478882789612, + "learning_rate": 9.352479627601457e-05, + "loss": 0.1306, + "step": 286 + }, + { + "epoch": 0.51, + "grad_norm": 0.18770304322242737, + "learning_rate": 9.347883889274923e-05, + "loss": 0.0218, + "step": 287 + }, + { + "epoch": 0.51, + "grad_norm": 0.6386083364486694, + "learning_rate": 9.34327303700654e-05, + "loss": 0.0912, + "step": 288 + }, + { + "epoch": 0.51, + "grad_norm": 0.4997164309024811, + "learning_rate": 9.338647086824372e-05, + "loss": 0.1083, + "step": 289 + }, + { + "epoch": 0.51, + "grad_norm": 0.31682559847831726, + "learning_rate": 9.334006054808966e-05, + "loss": 0.0947, + "step": 290 + }, + { + "epoch": 0.51, + "grad_norm": 0.28325051069259644, + "learning_rate": 9.329349957093292e-05, + "loss": 0.0794, + "step": 291 + }, + { + "epoch": 0.52, + "grad_norm": 0.5778185725212097, + "learning_rate": 9.324678809862695e-05, + "loss": 0.1223, + "step": 292 + }, + { + "epoch": 0.52, + "grad_norm": 0.2953624725341797, + "learning_rate": 9.319992629354828e-05, + "loss": 0.0747, + "step": 293 + }, + { + "epoch": 0.52, + "grad_norm": 0.28283196687698364, + "learning_rate": 9.31529143185961e-05, + "loss": 0.1099, + "step": 294 + }, + { + "epoch": 0.52, + "grad_norm": 0.36138102412223816, + "learning_rate": 9.310575233719154e-05, + "loss": 0.1303, + "step": 295 + }, + { + "epoch": 0.52, + "grad_norm": 0.16202205419540405, + "learning_rate": 9.305844051327725e-05, + "loss": 0.0805, + "step": 296 + }, + { + "epoch": 0.53, + "grad_norm": 0.11523901671171188, + "learning_rate": 9.30109790113167e-05, + "loss": 0.0775, + "step": 297 + }, + { + "epoch": 0.53, + "grad_norm": 0.10756238549947739, + "learning_rate": 9.296336799629369e-05, + "loss": 0.0795, + "step": 298 + }, + { + "epoch": 0.53, + "grad_norm": 0.18142195045948029, + "learning_rate": 9.291560763371173e-05, + "loss": 0.0833, + "step": 299 + }, + { + "epoch": 0.53, + "grad_norm": 0.14596430957317352, + "learning_rate": 9.28676980895935e-05, + "loss": 0.0904, + "step": 300 + }, + { + "epoch": 0.53, + "grad_norm": 0.10054739564657211, + "learning_rate": 9.28196395304803e-05, + "loss": 0.0898, + "step": 301 + }, + { + "epoch": 0.53, + "grad_norm": 0.24579764902591705, + "learning_rate": 9.277143212343134e-05, + "loss": 0.145, + "step": 302 + }, + { + "epoch": 0.54, + "grad_norm": 0.13506978750228882, + "learning_rate": 9.272307603602334e-05, + "loss": 0.0847, + "step": 303 + }, + { + "epoch": 0.54, + "grad_norm": 0.17480792105197906, + "learning_rate": 9.267457143634979e-05, + "loss": 0.125, + "step": 304 + }, + { + "epoch": 0.54, + "grad_norm": 0.31458401679992676, + "learning_rate": 9.262591849302048e-05, + "loss": 0.1047, + "step": 305 + }, + { + "epoch": 0.54, + "grad_norm": 0.17494355142116547, + "learning_rate": 9.257711737516082e-05, + "loss": 0.0576, + "step": 306 + }, + { + "epoch": 0.54, + "grad_norm": 0.2996468245983124, + "learning_rate": 9.252816825241134e-05, + "loss": 0.1012, + "step": 307 + }, + { + "epoch": 0.54, + "grad_norm": 0.19122976064682007, + "learning_rate": 9.247907129492707e-05, + "loss": 0.0878, + "step": 308 + }, + { + "epoch": 0.55, + "grad_norm": 0.16079925000667572, + "learning_rate": 9.242982667337685e-05, + "loss": 0.0778, + "step": 309 + }, + { + "epoch": 0.55, + "grad_norm": 0.2628028392791748, + "learning_rate": 9.238043455894293e-05, + "loss": 0.0987, + "step": 310 + }, + { + "epoch": 0.55, + "grad_norm": 0.318097859621048, + "learning_rate": 9.23308951233202e-05, + "loss": 0.1108, + "step": 311 + }, + { + "epoch": 0.55, + "grad_norm": 0.2207389920949936, + "learning_rate": 9.228120853871571e-05, + "loss": 0.0826, + "step": 312 + }, + { + "epoch": 0.55, + "grad_norm": 0.34375905990600586, + "learning_rate": 9.223137497784797e-05, + "loss": 0.1174, + "step": 313 + }, + { + "epoch": 0.56, + "grad_norm": 0.16714760661125183, + "learning_rate": 9.218139461394644e-05, + "loss": 0.0883, + "step": 314 + }, + { + "epoch": 0.56, + "grad_norm": 0.24213539063930511, + "learning_rate": 9.213126762075088e-05, + "loss": 0.0686, + "step": 315 + }, + { + "epoch": 0.56, + "grad_norm": 0.2654499113559723, + "learning_rate": 9.208099417251077e-05, + "loss": 0.1185, + "step": 316 + }, + { + "epoch": 0.56, + "grad_norm": 0.1353083997964859, + "learning_rate": 9.203057444398469e-05, + "loss": 0.0806, + "step": 317 + }, + { + "epoch": 0.56, + "grad_norm": 0.30304938554763794, + "learning_rate": 9.198000861043967e-05, + "loss": 0.0817, + "step": 318 + }, + { + "epoch": 0.56, + "grad_norm": 0.14495517313480377, + "learning_rate": 9.192929684765067e-05, + "loss": 0.0436, + "step": 319 + }, + { + "epoch": 0.57, + "grad_norm": 0.2180556207895279, + "learning_rate": 9.187843933189995e-05, + "loss": 0.1255, + "step": 320 + }, + { + "epoch": 0.57, + "grad_norm": 0.3051697611808777, + "learning_rate": 9.182743623997634e-05, + "loss": 0.1241, + "step": 321 + }, + { + "epoch": 0.57, + "grad_norm": 0.42936787009239197, + "learning_rate": 9.17762877491748e-05, + "loss": 0.1847, + "step": 322 + }, + { + "epoch": 0.57, + "grad_norm": 0.20895107090473175, + "learning_rate": 9.172499403729566e-05, + "loss": 0.0939, + "step": 323 + }, + { + "epoch": 0.57, + "grad_norm": 0.15273532271385193, + "learning_rate": 9.167355528264414e-05, + "loss": 0.1012, + "step": 324 + }, + { + "epoch": 0.57, + "grad_norm": 0.15428248047828674, + "learning_rate": 9.162197166402956e-05, + "loss": 0.061, + "step": 325 + }, + { + "epoch": 0.58, + "grad_norm": 0.13089029490947723, + "learning_rate": 9.157024336076487e-05, + "loss": 0.089, + "step": 326 + }, + { + "epoch": 0.58, + "grad_norm": 0.12000248581171036, + "learning_rate": 9.151837055266594e-05, + "loss": 0.0813, + "step": 327 + }, + { + "epoch": 0.58, + "grad_norm": 0.12965545058250427, + "learning_rate": 9.146635342005099e-05, + "loss": 0.113, + "step": 328 + }, + { + "epoch": 0.58, + "grad_norm": 0.12225235253572464, + "learning_rate": 9.14141921437399e-05, + "loss": 0.0968, + "step": 329 + }, + { + "epoch": 0.58, + "grad_norm": 0.11869696527719498, + "learning_rate": 9.136188690505363e-05, + "loss": 0.0752, + "step": 330 + }, + { + "epoch": 0.59, + "grad_norm": 0.22600843012332916, + "learning_rate": 9.130943788581359e-05, + "loss": 0.1049, + "step": 331 + }, + { + "epoch": 0.59, + "grad_norm": 0.13381795585155487, + "learning_rate": 9.125684526834099e-05, + "loss": 0.0917, + "step": 332 + }, + { + "epoch": 0.59, + "grad_norm": 0.12936879694461823, + "learning_rate": 9.120410923545619e-05, + "loss": 0.0782, + "step": 333 + }, + { + "epoch": 0.59, + "grad_norm": 0.14804388582706451, + "learning_rate": 9.115122997047811e-05, + "loss": 0.0959, + "step": 334 + }, + { + "epoch": 0.59, + "grad_norm": 0.18504676222801208, + "learning_rate": 9.109820765722357e-05, + "loss": 0.1126, + "step": 335 + }, + { + "epoch": 0.59, + "grad_norm": 0.1957363486289978, + "learning_rate": 9.10450424800066e-05, + "loss": 0.101, + "step": 336 + }, + { + "epoch": 0.6, + "grad_norm": 0.15677915513515472, + "learning_rate": 9.099173462363792e-05, + "loss": 0.0775, + "step": 337 + }, + { + "epoch": 0.6, + "grad_norm": 0.124906025826931, + "learning_rate": 9.093828427342418e-05, + "loss": 0.07, + "step": 338 + }, + { + "epoch": 0.6, + "grad_norm": 0.12451624125242233, + "learning_rate": 9.088469161516735e-05, + "loss": 0.0588, + "step": 339 + }, + { + "epoch": 0.6, + "grad_norm": 0.509678304195404, + "learning_rate": 9.083095683516414e-05, + "loss": 0.1563, + "step": 340 + }, + { + "epoch": 0.6, + "grad_norm": 0.2245551496744156, + "learning_rate": 9.077708012020524e-05, + "loss": 0.1029, + "step": 341 + }, + { + "epoch": 0.61, + "grad_norm": 0.2735763192176819, + "learning_rate": 9.072306165757476e-05, + "loss": 0.0958, + "step": 342 + }, + { + "epoch": 0.61, + "grad_norm": 0.2062731385231018, + "learning_rate": 9.066890163504955e-05, + "loss": 0.0638, + "step": 343 + }, + { + "epoch": 0.61, + "grad_norm": 0.1664024293422699, + "learning_rate": 9.061460024089853e-05, + "loss": 0.0555, + "step": 344 + }, + { + "epoch": 0.61, + "grad_norm": 0.15788845717906952, + "learning_rate": 9.056015766388205e-05, + "loss": 0.0509, + "step": 345 + }, + { + "epoch": 0.61, + "grad_norm": 0.185616135597229, + "learning_rate": 9.050557409325125e-05, + "loss": 0.1196, + "step": 346 + }, + { + "epoch": 0.61, + "grad_norm": 0.24650661647319794, + "learning_rate": 9.045084971874738e-05, + "loss": 0.0723, + "step": 347 + }, + { + "epoch": 0.62, + "grad_norm": 0.23959776759147644, + "learning_rate": 9.039598473060113e-05, + "loss": 0.1139, + "step": 348 + }, + { + "epoch": 0.62, + "grad_norm": 0.24370582401752472, + "learning_rate": 9.034097931953201e-05, + "loss": 0.0559, + "step": 349 + }, + { + "epoch": 0.62, + "grad_norm": 0.11590461432933807, + "learning_rate": 9.028583367674765e-05, + "loss": 0.0285, + "step": 350 + }, + { + "epoch": 0.62, + "grad_norm": 0.21419131755828857, + "learning_rate": 9.023054799394316e-05, + "loss": 0.0686, + "step": 351 + }, + { + "epoch": 0.62, + "grad_norm": 0.2115790992975235, + "learning_rate": 9.017512246330042e-05, + "loss": 0.071, + "step": 352 + }, + { + "epoch": 0.62, + "grad_norm": 0.2025454044342041, + "learning_rate": 9.011955727748748e-05, + "loss": 0.0993, + "step": 353 + }, + { + "epoch": 0.63, + "grad_norm": 0.22743502259254456, + "learning_rate": 9.006385262965786e-05, + "loss": 0.0705, + "step": 354 + }, + { + "epoch": 0.63, + "grad_norm": 0.16963045299053192, + "learning_rate": 9.00080087134498e-05, + "loss": 0.0569, + "step": 355 + }, + { + "epoch": 0.63, + "grad_norm": 0.12319042533636093, + "learning_rate": 8.995202572298576e-05, + "loss": 0.0427, + "step": 356 + }, + { + "epoch": 0.63, + "grad_norm": 0.15424852073192596, + "learning_rate": 8.989590385287155e-05, + "loss": 0.0564, + "step": 357 + }, + { + "epoch": 0.63, + "grad_norm": 0.306594580411911, + "learning_rate": 8.983964329819583e-05, + "loss": 0.095, + "step": 358 + }, + { + "epoch": 0.64, + "grad_norm": 0.13876177370548248, + "learning_rate": 8.978324425452931e-05, + "loss": 0.0641, + "step": 359 + }, + { + "epoch": 0.64, + "grad_norm": 0.19561870396137238, + "learning_rate": 8.972670691792409e-05, + "loss": 0.0635, + "step": 360 + }, + { + "epoch": 0.64, + "grad_norm": 0.3458711504936218, + "learning_rate": 8.967003148491304e-05, + "loss": 0.1328, + "step": 361 + }, + { + "epoch": 0.64, + "grad_norm": 0.1129189059138298, + "learning_rate": 8.961321815250905e-05, + "loss": 0.0205, + "step": 362 + }, + { + "epoch": 0.64, + "grad_norm": 0.3680332601070404, + "learning_rate": 8.955626711820438e-05, + "loss": 0.1302, + "step": 363 + }, + { + "epoch": 0.64, + "grad_norm": 0.2695287764072418, + "learning_rate": 8.949917857996996e-05, + "loss": 0.0511, + "step": 364 + }, + { + "epoch": 0.65, + "grad_norm": 0.17332953214645386, + "learning_rate": 8.94419527362547e-05, + "loss": 0.0494, + "step": 365 + }, + { + "epoch": 0.65, + "grad_norm": 0.11610284447669983, + "learning_rate": 8.938458978598483e-05, + "loss": 0.0381, + "step": 366 + }, + { + "epoch": 0.65, + "grad_norm": 0.23595061898231506, + "learning_rate": 8.932708992856315e-05, + "loss": 0.0802, + "step": 367 + }, + { + "epoch": 0.65, + "grad_norm": 0.26876452565193176, + "learning_rate": 8.926945336386838e-05, + "loss": 0.0461, + "step": 368 + }, + { + "epoch": 0.65, + "grad_norm": 0.19504375755786896, + "learning_rate": 8.921168029225448e-05, + "loss": 0.0317, + "step": 369 + }, + { + "epoch": 0.65, + "grad_norm": 0.4416268467903137, + "learning_rate": 8.915377091454992e-05, + "loss": 0.0952, + "step": 370 + }, + { + "epoch": 0.66, + "grad_norm": 0.32519325613975525, + "learning_rate": 8.909572543205698e-05, + "loss": 0.1027, + "step": 371 + }, + { + "epoch": 0.66, + "grad_norm": 0.3939536511898041, + "learning_rate": 8.903754404655106e-05, + "loss": 0.1718, + "step": 372 + }, + { + "epoch": 0.66, + "grad_norm": 0.20514678955078125, + "learning_rate": 8.897922696027999e-05, + "loss": 0.06, + "step": 373 + }, + { + "epoch": 0.66, + "grad_norm": 0.3049127459526062, + "learning_rate": 8.892077437596332e-05, + "loss": 0.1014, + "step": 374 + }, + { + "epoch": 0.66, + "grad_norm": 0.19251297414302826, + "learning_rate": 8.88621864967916e-05, + "loss": 0.048, + "step": 375 + }, + { + "epoch": 0.67, + "grad_norm": 0.21226820349693298, + "learning_rate": 8.880346352642575e-05, + "loss": 0.0652, + "step": 376 + }, + { + "epoch": 0.67, + "grad_norm": 0.42633509635925293, + "learning_rate": 8.874460566899616e-05, + "loss": 0.1083, + "step": 377 + }, + { + "epoch": 0.67, + "grad_norm": 0.24073313176631927, + "learning_rate": 8.868561312910221e-05, + "loss": 0.0851, + "step": 378 + }, + { + "epoch": 0.67, + "grad_norm": 0.3047339618206024, + "learning_rate": 8.862648611181145e-05, + "loss": 0.086, + "step": 379 + }, + { + "epoch": 0.67, + "grad_norm": 0.2227114737033844, + "learning_rate": 8.856722482265886e-05, + "loss": 0.1196, + "step": 380 + }, + { + "epoch": 0.67, + "grad_norm": 0.1860799938440323, + "learning_rate": 8.850782946764619e-05, + "loss": 0.0779, + "step": 381 + }, + { + "epoch": 0.68, + "grad_norm": 0.2109043002128601, + "learning_rate": 8.844830025324122e-05, + "loss": 0.076, + "step": 382 + }, + { + "epoch": 0.68, + "grad_norm": 0.1998620331287384, + "learning_rate": 8.838863738637706e-05, + "loss": 0.1027, + "step": 383 + }, + { + "epoch": 0.68, + "grad_norm": 0.12607474625110626, + "learning_rate": 8.832884107445139e-05, + "loss": 0.0436, + "step": 384 + }, + { + "epoch": 0.68, + "grad_norm": 0.2890150845050812, + "learning_rate": 8.826891152532579e-05, + "loss": 0.0966, + "step": 385 + }, + { + "epoch": 0.68, + "grad_norm": 0.4496447443962097, + "learning_rate": 8.820884894732497e-05, + "loss": 0.1575, + "step": 386 + }, + { + "epoch": 0.68, + "grad_norm": 0.19411596655845642, + "learning_rate": 8.814865354923613e-05, + "loss": 0.1201, + "step": 387 + }, + { + "epoch": 0.69, + "grad_norm": 0.19513021409511566, + "learning_rate": 8.808832554030808e-05, + "loss": 0.0747, + "step": 388 + }, + { + "epoch": 0.69, + "grad_norm": 0.14038780331611633, + "learning_rate": 8.802786513025068e-05, + "loss": 0.0608, + "step": 389 + }, + { + "epoch": 0.69, + "grad_norm": 0.14907363057136536, + "learning_rate": 8.796727252923402e-05, + "loss": 0.0843, + "step": 390 + }, + { + "epoch": 0.69, + "grad_norm": 0.18512780964374542, + "learning_rate": 8.790654794788769e-05, + "loss": 0.0988, + "step": 391 + }, + { + "epoch": 0.69, + "grad_norm": 0.17880797386169434, + "learning_rate": 8.784569159730007e-05, + "loss": 0.079, + "step": 392 + }, + { + "epoch": 0.7, + "grad_norm": 0.16263402998447418, + "learning_rate": 8.778470368901762e-05, + "loss": 0.0704, + "step": 393 + }, + { + "epoch": 0.7, + "grad_norm": 0.27071598172187805, + "learning_rate": 8.772358443504405e-05, + "loss": 0.0983, + "step": 394 + }, + { + "epoch": 0.7, + "grad_norm": 0.23446398973464966, + "learning_rate": 8.766233404783974e-05, + "loss": 0.0577, + "step": 395 + }, + { + "epoch": 0.7, + "grad_norm": 0.2932927906513214, + "learning_rate": 8.760095274032083e-05, + "loss": 0.0946, + "step": 396 + }, + { + "epoch": 0.7, + "grad_norm": 0.29224956035614014, + "learning_rate": 8.75394407258586e-05, + "loss": 0.078, + "step": 397 + }, + { + "epoch": 0.7, + "grad_norm": 0.15467233955860138, + "learning_rate": 8.747779821827868e-05, + "loss": 0.0779, + "step": 398 + }, + { + "epoch": 0.71, + "grad_norm": 0.1883499026298523, + "learning_rate": 8.741602543186032e-05, + "loss": 0.0721, + "step": 399 + }, + { + "epoch": 0.71, + "grad_norm": 0.25179481506347656, + "learning_rate": 8.735412258133562e-05, + "loss": 0.0875, + "step": 400 + }, + { + "epoch": 0.71, + "grad_norm": 0.23851999640464783, + "learning_rate": 8.729208988188881e-05, + "loss": 0.0959, + "step": 401 + }, + { + "epoch": 0.71, + "grad_norm": 0.2662704885005951, + "learning_rate": 8.722992754915554e-05, + "loss": 0.1025, + "step": 402 + }, + { + "epoch": 0.71, + "grad_norm": 0.17909982800483704, + "learning_rate": 8.716763579922204e-05, + "loss": 0.0504, + "step": 403 + }, + { + "epoch": 0.71, + "grad_norm": 0.17002324759960175, + "learning_rate": 8.710521484862439e-05, + "loss": 0.0856, + "step": 404 + }, + { + "epoch": 0.72, + "grad_norm": 0.2229025810956955, + "learning_rate": 8.704266491434788e-05, + "loss": 0.0591, + "step": 405 + }, + { + "epoch": 0.72, + "grad_norm": 0.1444559544324875, + "learning_rate": 8.697998621382607e-05, + "loss": 0.0297, + "step": 406 + }, + { + "epoch": 0.72, + "grad_norm": 0.2677093744277954, + "learning_rate": 8.69171789649402e-05, + "loss": 0.0543, + "step": 407 + }, + { + "epoch": 0.72, + "grad_norm": 0.5875506401062012, + "learning_rate": 8.685424338601834e-05, + "loss": 0.1199, + "step": 408 + }, + { + "epoch": 0.72, + "grad_norm": 0.34576529264450073, + "learning_rate": 8.679117969583464e-05, + "loss": 0.1003, + "step": 409 + }, + { + "epoch": 0.73, + "grad_norm": 0.2765222489833832, + "learning_rate": 8.672798811360863e-05, + "loss": 0.0358, + "step": 410 + }, + { + "epoch": 0.73, + "grad_norm": 0.17154745757579803, + "learning_rate": 8.666466885900438e-05, + "loss": 0.0736, + "step": 411 + }, + { + "epoch": 0.73, + "grad_norm": 0.1607416719198227, + "learning_rate": 8.660122215212977e-05, + "loss": 0.0678, + "step": 412 + }, + { + "epoch": 0.73, + "grad_norm": 0.1216413602232933, + "learning_rate": 8.653764821353573e-05, + "loss": 0.0341, + "step": 413 + }, + { + "epoch": 0.73, + "grad_norm": 0.3713608980178833, + "learning_rate": 8.647394726421547e-05, + "loss": 0.118, + "step": 414 + }, + { + "epoch": 0.73, + "grad_norm": 0.48576387763023376, + "learning_rate": 8.641011952560371e-05, + "loss": 0.0931, + "step": 415 + }, + { + "epoch": 0.74, + "grad_norm": 0.14704179763793945, + "learning_rate": 8.63461652195759e-05, + "loss": 0.0285, + "step": 416 + }, + { + "epoch": 0.74, + "grad_norm": 0.3306657671928406, + "learning_rate": 8.628208456844747e-05, + "loss": 0.0737, + "step": 417 + }, + { + "epoch": 0.74, + "grad_norm": 0.39767885208129883, + "learning_rate": 8.621787779497305e-05, + "loss": 0.097, + "step": 418 + }, + { + "epoch": 0.74, + "grad_norm": 0.3198534846305847, + "learning_rate": 8.615354512234569e-05, + "loss": 0.0731, + "step": 419 + }, + { + "epoch": 0.74, + "grad_norm": 0.19191338121891022, + "learning_rate": 8.608908677419606e-05, + "loss": 0.0697, + "step": 420 + }, + { + "epoch": 0.74, + "grad_norm": 0.26490989327430725, + "learning_rate": 8.602450297459172e-05, + "loss": 0.1214, + "step": 421 + }, + { + "epoch": 0.75, + "grad_norm": 0.3545917868614197, + "learning_rate": 8.595979394803634e-05, + "loss": 0.0933, + "step": 422 + }, + { + "epoch": 0.75, + "grad_norm": 0.28239014744758606, + "learning_rate": 8.589495991946885e-05, + "loss": 0.0707, + "step": 423 + }, + { + "epoch": 0.75, + "grad_norm": 0.48272502422332764, + "learning_rate": 8.583000111426276e-05, + "loss": 0.0831, + "step": 424 + }, + { + "epoch": 0.75, + "grad_norm": 0.15938633680343628, + "learning_rate": 8.576491775822527e-05, + "loss": 0.0899, + "step": 425 + }, + { + "epoch": 0.75, + "grad_norm": 0.2616162896156311, + "learning_rate": 8.569971007759657e-05, + "loss": 0.118, + "step": 426 + }, + { + "epoch": 0.75, + "eval_loss": 0.0783080980181694, + "eval_runtime": 14.6414, + "eval_samples_per_second": 32.579, + "eval_steps_per_second": 8.196, + "step": 426 + }, + { + "epoch": 0.76, + "grad_norm": 0.19521737098693848, + "learning_rate": 8.563437829904903e-05, + "loss": 0.0814, + "step": 427 + }, + { + "epoch": 0.76, + "grad_norm": 0.194011390209198, + "learning_rate": 8.55689226496864e-05, + "loss": 0.0799, + "step": 428 + }, + { + "epoch": 0.76, + "grad_norm": 0.2743787169456482, + "learning_rate": 8.550334335704298e-05, + "loss": 0.0869, + "step": 429 + }, + { + "epoch": 0.76, + "grad_norm": 0.1369010955095291, + "learning_rate": 8.543764064908295e-05, + "loss": 0.0435, + "step": 430 + }, + { + "epoch": 0.76, + "grad_norm": 0.24237819015979767, + "learning_rate": 8.537181475419944e-05, + "loss": 0.1148, + "step": 431 + }, + { + "epoch": 0.76, + "grad_norm": 0.14511409401893616, + "learning_rate": 8.530586590121383e-05, + "loss": 0.0764, + "step": 432 + }, + { + "epoch": 0.77, + "grad_norm": 0.15356196463108063, + "learning_rate": 8.523979431937492e-05, + "loss": 0.05, + "step": 433 + }, + { + "epoch": 0.77, + "grad_norm": 0.21860916912555695, + "learning_rate": 8.51736002383581e-05, + "loss": 0.0971, + "step": 434 + }, + { + "epoch": 0.77, + "grad_norm": 0.23724305629730225, + "learning_rate": 8.510728388826463e-05, + "loss": 0.1049, + "step": 435 + }, + { + "epoch": 0.77, + "grad_norm": 0.25301918387413025, + "learning_rate": 8.50408454996208e-05, + "loss": 0.0848, + "step": 436 + }, + { + "epoch": 0.77, + "grad_norm": 0.22409550845623016, + "learning_rate": 8.497428530337706e-05, + "loss": 0.101, + "step": 437 + }, + { + "epoch": 0.77, + "grad_norm": 0.1324710100889206, + "learning_rate": 8.490760353090737e-05, + "loss": 0.0723, + "step": 438 + }, + { + "epoch": 0.78, + "grad_norm": 0.1362515389919281, + "learning_rate": 8.484080041400826e-05, + "loss": 0.0709, + "step": 439 + }, + { + "epoch": 0.78, + "grad_norm": 0.16375669836997986, + "learning_rate": 8.477387618489807e-05, + "loss": 0.0405, + "step": 440 + }, + { + "epoch": 0.78, + "grad_norm": 0.21752700209617615, + "learning_rate": 8.470683107621616e-05, + "loss": 0.0455, + "step": 441 + }, + { + "epoch": 0.78, + "grad_norm": 0.1989530771970749, + "learning_rate": 8.463966532102207e-05, + "loss": 0.0704, + "step": 442 + }, + { + "epoch": 0.78, + "grad_norm": 0.192123144865036, + "learning_rate": 8.457237915279476e-05, + "loss": 0.063, + "step": 443 + }, + { + "epoch": 0.79, + "grad_norm": 0.1221012994647026, + "learning_rate": 8.450497280543174e-05, + "loss": 0.0302, + "step": 444 + }, + { + "epoch": 0.79, + "grad_norm": 0.5705539584159851, + "learning_rate": 8.443744651324827e-05, + "loss": 0.1531, + "step": 445 + }, + { + "epoch": 0.79, + "grad_norm": 0.21490426361560822, + "learning_rate": 8.436980051097659e-05, + "loss": 0.0626, + "step": 446 + }, + { + "epoch": 0.79, + "grad_norm": 0.2654309570789337, + "learning_rate": 8.430203503376505e-05, + "loss": 0.0838, + "step": 447 + }, + { + "epoch": 0.79, + "grad_norm": 0.22457195818424225, + "learning_rate": 8.423415031717733e-05, + "loss": 0.0309, + "step": 448 + }, + { + "epoch": 0.79, + "grad_norm": 0.10934180021286011, + "learning_rate": 8.416614659719157e-05, + "loss": 0.0132, + "step": 449 + }, + { + "epoch": 0.8, + "grad_norm": 0.32622861862182617, + "learning_rate": 8.409802411019963e-05, + "loss": 0.107, + "step": 450 + }, + { + "epoch": 0.8, + "grad_norm": 0.24298590421676636, + "learning_rate": 8.40297830930062e-05, + "loss": 0.1268, + "step": 451 + }, + { + "epoch": 0.8, + "grad_norm": 0.29994437098503113, + "learning_rate": 8.396142378282798e-05, + "loss": 0.0747, + "step": 452 + }, + { + "epoch": 0.8, + "grad_norm": 0.16668649017810822, + "learning_rate": 8.389294641729293e-05, + "loss": 0.0479, + "step": 453 + }, + { + "epoch": 0.8, + "grad_norm": 0.26706060767173767, + "learning_rate": 8.382435123443934e-05, + "loss": 0.1116, + "step": 454 + }, + { + "epoch": 0.8, + "grad_norm": 0.1750030219554901, + "learning_rate": 8.375563847271506e-05, + "loss": 0.0597, + "step": 455 + }, + { + "epoch": 0.81, + "grad_norm": 0.2318125069141388, + "learning_rate": 8.36868083709767e-05, + "loss": 0.0909, + "step": 456 + }, + { + "epoch": 0.81, + "grad_norm": 0.1834569126367569, + "learning_rate": 8.361786116848872e-05, + "loss": 0.0813, + "step": 457 + }, + { + "epoch": 0.81, + "grad_norm": 0.27685895562171936, + "learning_rate": 8.354879710492264e-05, + "loss": 0.1301, + "step": 458 + }, + { + "epoch": 0.81, + "grad_norm": 0.16120545566082, + "learning_rate": 8.347961642035624e-05, + "loss": 0.0717, + "step": 459 + }, + { + "epoch": 0.81, + "grad_norm": 0.17625439167022705, + "learning_rate": 8.341031935527267e-05, + "loss": 0.0867, + "step": 460 + }, + { + "epoch": 0.82, + "grad_norm": 0.2321135252714157, + "learning_rate": 8.334090615055966e-05, + "loss": 0.1122, + "step": 461 + }, + { + "epoch": 0.82, + "grad_norm": 0.12720270454883575, + "learning_rate": 8.327137704750862e-05, + "loss": 0.0375, + "step": 462 + }, + { + "epoch": 0.82, + "grad_norm": 0.2046743929386139, + "learning_rate": 8.320173228781389e-05, + "loss": 0.0808, + "step": 463 + }, + { + "epoch": 0.82, + "grad_norm": 0.16513489186763763, + "learning_rate": 8.313197211357181e-05, + "loss": 0.0825, + "step": 464 + }, + { + "epoch": 0.82, + "grad_norm": 0.12492749094963074, + "learning_rate": 8.306209676727994e-05, + "loss": 0.0876, + "step": 465 + }, + { + "epoch": 0.82, + "grad_norm": 0.1343008577823639, + "learning_rate": 8.299210649183619e-05, + "loss": 0.0852, + "step": 466 + }, + { + "epoch": 0.83, + "grad_norm": 0.13951613008975983, + "learning_rate": 8.2922001530538e-05, + "loss": 0.1003, + "step": 467 + }, + { + "epoch": 0.83, + "grad_norm": 0.16553768515586853, + "learning_rate": 8.285178212708143e-05, + "loss": 0.0662, + "step": 468 + }, + { + "epoch": 0.83, + "grad_norm": 0.15311822295188904, + "learning_rate": 8.278144852556042e-05, + "loss": 0.0785, + "step": 469 + }, + { + "epoch": 0.83, + "grad_norm": 0.1948017179965973, + "learning_rate": 8.271100097046584e-05, + "loss": 0.0898, + "step": 470 + }, + { + "epoch": 0.83, + "grad_norm": 0.11078551411628723, + "learning_rate": 8.264043970668469e-05, + "loss": 0.0386, + "step": 471 + }, + { + "epoch": 0.84, + "grad_norm": 0.1605585515499115, + "learning_rate": 8.256976497949924e-05, + "loss": 0.0497, + "step": 472 + }, + { + "epoch": 0.84, + "grad_norm": 0.1617887318134308, + "learning_rate": 8.249897703458619e-05, + "loss": 0.0624, + "step": 473 + }, + { + "epoch": 0.84, + "grad_norm": 0.1274091750383377, + "learning_rate": 8.242807611801578e-05, + "loss": 0.0578, + "step": 474 + }, + { + "epoch": 0.84, + "grad_norm": 0.15953154861927032, + "learning_rate": 8.235706247625098e-05, + "loss": 0.042, + "step": 475 + }, + { + "epoch": 0.84, + "grad_norm": 0.27984094619750977, + "learning_rate": 8.228593635614659e-05, + "loss": 0.1037, + "step": 476 + }, + { + "epoch": 0.84, + "grad_norm": 0.1895013153553009, + "learning_rate": 8.22146980049484e-05, + "loss": 0.0728, + "step": 477 + }, + { + "epoch": 0.85, + "grad_norm": 0.1580246388912201, + "learning_rate": 8.214334767029239e-05, + "loss": 0.0398, + "step": 478 + }, + { + "epoch": 0.85, + "grad_norm": 0.2391231805086136, + "learning_rate": 8.207188560020373e-05, + "loss": 0.0707, + "step": 479 + }, + { + "epoch": 0.85, + "grad_norm": 0.25975751876831055, + "learning_rate": 8.200031204309603e-05, + "loss": 0.1369, + "step": 480 + }, + { + "epoch": 0.85, + "grad_norm": 0.32591861486434937, + "learning_rate": 8.192862724777051e-05, + "loss": 0.0878, + "step": 481 + }, + { + "epoch": 0.85, + "grad_norm": 0.32488566637039185, + "learning_rate": 8.185683146341496e-05, + "loss": 0.0692, + "step": 482 + }, + { + "epoch": 0.85, + "grad_norm": 0.1918002963066101, + "learning_rate": 8.178492493960309e-05, + "loss": 0.0942, + "step": 483 + }, + { + "epoch": 0.86, + "grad_norm": 0.43140751123428345, + "learning_rate": 8.171290792629347e-05, + "loss": 0.0979, + "step": 484 + }, + { + "epoch": 0.86, + "grad_norm": 0.1771157830953598, + "learning_rate": 8.164078067382882e-05, + "loss": 0.0894, + "step": 485 + }, + { + "epoch": 0.86, + "grad_norm": 0.25012728571891785, + "learning_rate": 8.1568543432935e-05, + "loss": 0.0734, + "step": 486 + }, + { + "epoch": 0.86, + "grad_norm": 0.19328337907791138, + "learning_rate": 8.149619645472031e-05, + "loss": 0.0869, + "step": 487 + }, + { + "epoch": 0.86, + "grad_norm": 0.18180640041828156, + "learning_rate": 8.142373999067439e-05, + "loss": 0.0897, + "step": 488 + }, + { + "epoch": 0.87, + "grad_norm": 0.25753355026245117, + "learning_rate": 8.135117429266757e-05, + "loss": 0.0883, + "step": 489 + }, + { + "epoch": 0.87, + "grad_norm": 0.23837833106517792, + "learning_rate": 8.127849961294984e-05, + "loss": 0.0549, + "step": 490 + }, + { + "epoch": 0.87, + "grad_norm": 0.25032365322113037, + "learning_rate": 8.120571620415006e-05, + "loss": 0.0976, + "step": 491 + }, + { + "epoch": 0.87, + "grad_norm": 0.30728307366371155, + "learning_rate": 8.113282431927502e-05, + "loss": 0.0709, + "step": 492 + }, + { + "epoch": 0.87, + "grad_norm": 0.1391928791999817, + "learning_rate": 8.10598242117086e-05, + "loss": 0.0378, + "step": 493 + }, + { + "epoch": 0.87, + "grad_norm": 0.1786775141954422, + "learning_rate": 8.098671613521089e-05, + "loss": 0.0671, + "step": 494 + }, + { + "epoch": 0.88, + "grad_norm": 0.21963584423065186, + "learning_rate": 8.091350034391732e-05, + "loss": 0.0936, + "step": 495 + }, + { + "epoch": 0.88, + "grad_norm": 0.13954299688339233, + "learning_rate": 8.084017709233767e-05, + "loss": 0.052, + "step": 496 + }, + { + "epoch": 0.88, + "grad_norm": 0.19656923413276672, + "learning_rate": 8.076674663535537e-05, + "loss": 0.0584, + "step": 497 + }, + { + "epoch": 0.88, + "grad_norm": 0.14330637454986572, + "learning_rate": 8.069320922822643e-05, + "loss": 0.0786, + "step": 498 + }, + { + "epoch": 0.88, + "grad_norm": 0.19019991159439087, + "learning_rate": 8.061956512657871e-05, + "loss": 0.0837, + "step": 499 + }, + { + "epoch": 0.88, + "grad_norm": 0.2079285979270935, + "learning_rate": 8.05458145864109e-05, + "loss": 0.0459, + "step": 500 + }, + { + "epoch": 0.89, + "grad_norm": 0.3516862392425537, + "learning_rate": 8.047195786409172e-05, + "loss": 0.191, + "step": 501 + }, + { + "epoch": 0.89, + "grad_norm": 0.192392036318779, + "learning_rate": 8.039799521635896e-05, + "loss": 0.1072, + "step": 502 + }, + { + "epoch": 0.89, + "grad_norm": 0.1787678301334381, + "learning_rate": 8.032392690031867e-05, + "loss": 0.0649, + "step": 503 + }, + { + "epoch": 0.89, + "grad_norm": 0.21046535670757294, + "learning_rate": 8.024975317344421e-05, + "loss": 0.1065, + "step": 504 + }, + { + "epoch": 0.89, + "grad_norm": 0.1215684562921524, + "learning_rate": 8.017547429357532e-05, + "loss": 0.0433, + "step": 505 + }, + { + "epoch": 0.9, + "grad_norm": 0.1342051476240158, + "learning_rate": 8.010109051891731e-05, + "loss": 0.0774, + "step": 506 + }, + { + "epoch": 0.9, + "grad_norm": 0.10215850174427032, + "learning_rate": 8.002660210804011e-05, + "loss": 0.0338, + "step": 507 + }, + { + "epoch": 0.9, + "grad_norm": 0.23539598286151886, + "learning_rate": 7.995200931987743e-05, + "loss": 0.0516, + "step": 508 + }, + { + "epoch": 0.9, + "grad_norm": 0.15601155161857605, + "learning_rate": 7.987731241372572e-05, + "loss": 0.0559, + "step": 509 + }, + { + "epoch": 0.9, + "grad_norm": 0.165851429104805, + "learning_rate": 7.98025116492434e-05, + "loss": 0.0372, + "step": 510 + }, + { + "epoch": 0.9, + "grad_norm": 0.21045421063899994, + "learning_rate": 7.972760728644996e-05, + "loss": 0.086, + "step": 511 + }, + { + "epoch": 0.91, + "grad_norm": 0.14814500510692596, + "learning_rate": 7.965259958572496e-05, + "loss": 0.0587, + "step": 512 + }, + { + "epoch": 0.91, + "grad_norm": 0.15543898940086365, + "learning_rate": 7.95774888078072e-05, + "loss": 0.0682, + "step": 513 + }, + { + "epoch": 0.91, + "grad_norm": 0.13527697324752808, + "learning_rate": 7.950227521379382e-05, + "loss": 0.0468, + "step": 514 + }, + { + "epoch": 0.91, + "grad_norm": 0.35773295164108276, + "learning_rate": 7.94269590651393e-05, + "loss": 0.1273, + "step": 515 + }, + { + "epoch": 0.91, + "grad_norm": 0.22433511912822723, + "learning_rate": 7.935154062365467e-05, + "loss": 0.0438, + "step": 516 + }, + { + "epoch": 0.91, + "grad_norm": 0.1453983038663864, + "learning_rate": 7.927602015150655e-05, + "loss": 0.0367, + "step": 517 + }, + { + "epoch": 0.92, + "grad_norm": 0.2343645691871643, + "learning_rate": 7.920039791121617e-05, + "loss": 0.128, + "step": 518 + }, + { + "epoch": 0.92, + "grad_norm": 0.25178173184394836, + "learning_rate": 7.912467416565861e-05, + "loss": 0.1094, + "step": 519 + }, + { + "epoch": 0.92, + "grad_norm": 0.3170076012611389, + "learning_rate": 7.904884917806174e-05, + "loss": 0.1323, + "step": 520 + }, + { + "epoch": 0.92, + "grad_norm": 0.15917453169822693, + "learning_rate": 7.897292321200538e-05, + "loss": 0.036, + "step": 521 + }, + { + "epoch": 0.92, + "grad_norm": 0.31578320264816284, + "learning_rate": 7.889689653142036e-05, + "loss": 0.0909, + "step": 522 + }, + { + "epoch": 0.93, + "grad_norm": 0.16602741181850433, + "learning_rate": 7.882076940058764e-05, + "loss": 0.0371, + "step": 523 + }, + { + "epoch": 0.93, + "grad_norm": 0.3235325813293457, + "learning_rate": 7.874454208413731e-05, + "loss": 0.1561, + "step": 524 + }, + { + "epoch": 0.93, + "grad_norm": 0.1256486028432846, + "learning_rate": 7.866821484704776e-05, + "loss": 0.0364, + "step": 525 + }, + { + "epoch": 0.93, + "grad_norm": 0.2234162986278534, + "learning_rate": 7.859178795464472e-05, + "loss": 0.0883, + "step": 526 + }, + { + "epoch": 0.93, + "grad_norm": 0.1564294993877411, + "learning_rate": 7.851526167260034e-05, + "loss": 0.0679, + "step": 527 + }, + { + "epoch": 0.93, + "grad_norm": 0.16309525072574615, + "learning_rate": 7.84386362669322e-05, + "loss": 0.0912, + "step": 528 + }, + { + "epoch": 0.94, + "grad_norm": 0.21584004163742065, + "learning_rate": 7.836191200400255e-05, + "loss": 0.0695, + "step": 529 + }, + { + "epoch": 0.94, + "grad_norm": 0.15948422253131866, + "learning_rate": 7.828508915051724e-05, + "loss": 0.0459, + "step": 530 + }, + { + "epoch": 0.94, + "grad_norm": 0.24016940593719482, + "learning_rate": 7.82081679735248e-05, + "loss": 0.1127, + "step": 531 + }, + { + "epoch": 0.94, + "grad_norm": 0.2894397974014282, + "learning_rate": 7.813114874041557e-05, + "loss": 0.0584, + "step": 532 + }, + { + "epoch": 0.94, + "grad_norm": 0.20707662403583527, + "learning_rate": 7.805403171892079e-05, + "loss": 0.1045, + "step": 533 + }, + { + "epoch": 0.94, + "grad_norm": 0.23427248001098633, + "learning_rate": 7.797681717711161e-05, + "loss": 0.1345, + "step": 534 + }, + { + "epoch": 0.95, + "grad_norm": 0.13141866028308868, + "learning_rate": 7.789950538339812e-05, + "loss": 0.052, + "step": 535 + }, + { + "epoch": 0.95, + "grad_norm": 0.21118536591529846, + "learning_rate": 7.782209660652855e-05, + "loss": 0.1272, + "step": 536 + }, + { + "epoch": 0.95, + "grad_norm": 0.15485352277755737, + "learning_rate": 7.77445911155882e-05, + "loss": 0.0686, + "step": 537 + }, + { + "epoch": 0.95, + "grad_norm": 0.11380946636199951, + "learning_rate": 7.766698917999861e-05, + "loss": 0.0735, + "step": 538 + }, + { + "epoch": 0.95, + "grad_norm": 0.26798170804977417, + "learning_rate": 7.758929106951656e-05, + "loss": 0.0934, + "step": 539 + }, + { + "epoch": 0.96, + "grad_norm": 0.23003587126731873, + "learning_rate": 7.751149705423312e-05, + "loss": 0.0816, + "step": 540 + }, + { + "epoch": 0.96, + "grad_norm": 0.2122953236103058, + "learning_rate": 7.743360740457278e-05, + "loss": 0.0827, + "step": 541 + }, + { + "epoch": 0.96, + "grad_norm": 0.22673499584197998, + "learning_rate": 7.735562239129247e-05, + "loss": 0.1232, + "step": 542 + }, + { + "epoch": 0.96, + "grad_norm": 0.24960415065288544, + "learning_rate": 7.727754228548058e-05, + "loss": 0.1124, + "step": 543 + }, + { + "epoch": 0.96, + "grad_norm": 0.10405872017145157, + "learning_rate": 7.719936735855611e-05, + "loss": 0.0687, + "step": 544 + }, + { + "epoch": 0.96, + "grad_norm": 0.16980154812335968, + "learning_rate": 7.712109788226762e-05, + "loss": 0.0874, + "step": 545 + }, + { + "epoch": 0.97, + "grad_norm": 0.1486412137746811, + "learning_rate": 7.704273412869238e-05, + "loss": 0.0815, + "step": 546 + }, + { + "epoch": 0.97, + "grad_norm": 0.14432762563228607, + "learning_rate": 7.696427637023538e-05, + "loss": 0.0752, + "step": 547 + }, + { + "epoch": 0.97, + "grad_norm": 0.2627028822898865, + "learning_rate": 7.688572487962835e-05, + "loss": 0.0982, + "step": 548 + }, + { + "epoch": 0.97, + "grad_norm": 0.16832011938095093, + "learning_rate": 7.680707992992888e-05, + "loss": 0.0895, + "step": 549 + }, + { + "epoch": 0.97, + "grad_norm": 0.14999301731586456, + "learning_rate": 7.672834179451942e-05, + "loss": 0.0544, + "step": 550 + }, + { + "epoch": 0.97, + "grad_norm": 0.14237482845783234, + "learning_rate": 7.664951074710638e-05, + "loss": 0.0623, + "step": 551 + }, + { + "epoch": 0.98, + "grad_norm": 0.1694159060716629, + "learning_rate": 7.657058706171911e-05, + "loss": 0.0784, + "step": 552 + }, + { + "epoch": 0.98, + "grad_norm": 0.1470886617898941, + "learning_rate": 7.649157101270902e-05, + "loss": 0.0635, + "step": 553 + }, + { + "epoch": 0.98, + "grad_norm": 0.16492018103599548, + "learning_rate": 7.641246287474855e-05, + "loss": 0.0669, + "step": 554 + }, + { + "epoch": 0.98, + "grad_norm": 0.195392444729805, + "learning_rate": 7.633326292283028e-05, + "loss": 0.0387, + "step": 555 + }, + { + "epoch": 0.98, + "grad_norm": 0.17653177678585052, + "learning_rate": 7.625397143226596e-05, + "loss": 0.0592, + "step": 556 + }, + { + "epoch": 0.99, + "grad_norm": 0.23455718159675598, + "learning_rate": 7.617458867868553e-05, + "loss": 0.0882, + "step": 557 + }, + { + "epoch": 0.99, + "grad_norm": 0.3588998317718506, + "learning_rate": 7.609511493803616e-05, + "loss": 0.107, + "step": 558 + }, + { + "epoch": 0.99, + "grad_norm": 0.2767946720123291, + "learning_rate": 7.601555048658134e-05, + "loss": 0.1609, + "step": 559 + }, + { + "epoch": 0.99, + "grad_norm": 0.22181196510791779, + "learning_rate": 7.593589560089985e-05, + "loss": 0.0598, + "step": 560 + }, + { + "epoch": 0.99, + "grad_norm": 0.30335313081741333, + "learning_rate": 7.585615055788484e-05, + "loss": 0.0825, + "step": 561 + }, + { + "epoch": 0.99, + "grad_norm": 0.19477833807468414, + "learning_rate": 7.577631563474291e-05, + "loss": 0.0446, + "step": 562 + }, + { + "epoch": 1.0, + "grad_norm": 0.11036123335361481, + "learning_rate": 7.569639110899303e-05, + "loss": 0.025, + "step": 563 + }, + { + "epoch": 1.0, + "grad_norm": 0.19955220818519592, + "learning_rate": 7.561637725846568e-05, + "loss": 0.0484, + "step": 564 + }, + { + "epoch": 1.0, + "grad_norm": 0.20293684303760529, + "learning_rate": 7.553627436130183e-05, + "loss": 0.0689, + "step": 565 + }, + { + "epoch": 1.0, + "grad_norm": 0.18100765347480774, + "learning_rate": 7.545608269595202e-05, + "loss": 0.0371, + "step": 566 + }, + { + "epoch": 1.0, + "grad_norm": 0.43053922057151794, + "learning_rate": 7.537580254117531e-05, + "loss": 0.0901, + "step": 567 + }, + { + "epoch": 1.0, + "grad_norm": 0.3413926959037781, + "learning_rate": 7.529543417603844e-05, + "loss": 0.1088, + "step": 568 + }, + { + "epoch": 1.0, + "eval_loss": 0.09266742318868637, + "eval_runtime": 14.642, + "eval_samples_per_second": 32.578, + "eval_steps_per_second": 8.196, + "step": 568 + }, + { + "epoch": 1.01, + "grad_norm": 0.1796027272939682, + "learning_rate": 7.521497787991471e-05, + "loss": 0.0244, + "step": 569 + }, + { + "epoch": 1.01, + "grad_norm": 0.30515041947364807, + "learning_rate": 7.513443393248312e-05, + "loss": 0.0682, + "step": 570 + }, + { + "epoch": 1.01, + "grad_norm": 0.3712550103664398, + "learning_rate": 7.505380261372734e-05, + "loss": 0.0921, + "step": 571 + }, + { + "epoch": 1.01, + "grad_norm": 0.4219339191913605, + "learning_rate": 7.497308420393477e-05, + "loss": 0.0785, + "step": 572 + }, + { + "epoch": 1.01, + "grad_norm": 0.24129725992679596, + "learning_rate": 7.489227898369559e-05, + "loss": 0.0851, + "step": 573 + }, + { + "epoch": 1.02, + "grad_norm": 0.24595998227596283, + "learning_rate": 7.481138723390164e-05, + "loss": 0.1143, + "step": 574 + }, + { + "epoch": 1.02, + "grad_norm": 0.13906948268413544, + "learning_rate": 7.473040923574567e-05, + "loss": 0.0402, + "step": 575 + }, + { + "epoch": 1.02, + "grad_norm": 0.1885530650615692, + "learning_rate": 7.464934527072016e-05, + "loss": 0.0384, + "step": 576 + }, + { + "epoch": 1.02, + "grad_norm": 0.13116823136806488, + "learning_rate": 7.456819562061649e-05, + "loss": 0.0447, + "step": 577 + }, + { + "epoch": 1.02, + "grad_norm": 0.23953841626644135, + "learning_rate": 7.448696056752383e-05, + "loss": 0.0602, + "step": 578 + }, + { + "epoch": 1.02, + "grad_norm": 0.17374739050865173, + "learning_rate": 7.440564039382827e-05, + "loss": 0.0657, + "step": 579 + }, + { + "epoch": 1.03, + "grad_norm": 0.20921552181243896, + "learning_rate": 7.432423538221178e-05, + "loss": 0.0757, + "step": 580 + }, + { + "epoch": 1.03, + "grad_norm": 0.10258325189352036, + "learning_rate": 7.424274581565123e-05, + "loss": 0.0237, + "step": 581 + }, + { + "epoch": 1.03, + "grad_norm": 0.31752172112464905, + "learning_rate": 7.416117197741742e-05, + "loss": 0.0625, + "step": 582 + }, + { + "epoch": 1.03, + "grad_norm": 0.229179248213768, + "learning_rate": 7.407951415107413e-05, + "loss": 0.0792, + "step": 583 + }, + { + "epoch": 1.03, + "grad_norm": 0.16059361398220062, + "learning_rate": 7.3997772620477e-05, + "loss": 0.0718, + "step": 584 + }, + { + "epoch": 1.03, + "grad_norm": 0.1626499593257904, + "learning_rate": 7.391594766977277e-05, + "loss": 0.0457, + "step": 585 + }, + { + "epoch": 1.04, + "grad_norm": 0.1549261212348938, + "learning_rate": 7.383403958339807e-05, + "loss": 0.0544, + "step": 586 + }, + { + "epoch": 1.04, + "grad_norm": 0.1588374525308609, + "learning_rate": 7.375204864607852e-05, + "loss": 0.0342, + "step": 587 + }, + { + "epoch": 1.04, + "grad_norm": 0.09883646667003632, + "learning_rate": 7.366997514282782e-05, + "loss": 0.0292, + "step": 588 + }, + { + "epoch": 1.04, + "grad_norm": 0.3421178460121155, + "learning_rate": 7.358781935894659e-05, + "loss": 0.0999, + "step": 589 + }, + { + "epoch": 1.04, + "grad_norm": 0.12105683982372284, + "learning_rate": 7.350558158002154e-05, + "loss": 0.023, + "step": 590 + }, + { + "epoch": 1.05, + "grad_norm": 0.15255074203014374, + "learning_rate": 7.342326209192435e-05, + "loss": 0.0423, + "step": 591 + }, + { + "epoch": 1.05, + "grad_norm": 0.18337713181972504, + "learning_rate": 7.33408611808108e-05, + "loss": 0.0305, + "step": 592 + }, + { + "epoch": 1.05, + "grad_norm": 0.12969495356082916, + "learning_rate": 7.325837913311966e-05, + "loss": 0.0175, + "step": 593 + }, + { + "epoch": 1.05, + "grad_norm": 0.2849477231502533, + "learning_rate": 7.317581623557177e-05, + "loss": 0.0878, + "step": 594 + }, + { + "epoch": 1.05, + "grad_norm": 0.24307942390441895, + "learning_rate": 7.3093172775169e-05, + "loss": 0.037, + "step": 595 + }, + { + "epoch": 1.05, + "grad_norm": 0.2612784206867218, + "learning_rate": 7.301044903919325e-05, + "loss": 0.097, + "step": 596 + }, + { + "epoch": 1.06, + "grad_norm": 0.2548207640647888, + "learning_rate": 7.292764531520553e-05, + "loss": 0.0928, + "step": 597 + }, + { + "epoch": 1.06, + "grad_norm": 0.3221377432346344, + "learning_rate": 7.284476189104485e-05, + "loss": 0.0874, + "step": 598 + }, + { + "epoch": 1.06, + "grad_norm": 0.20441681146621704, + "learning_rate": 7.27617990548273e-05, + "loss": 0.035, + "step": 599 + }, + { + "epoch": 1.06, + "grad_norm": 0.26378926634788513, + "learning_rate": 7.267875709494499e-05, + "loss": 0.0494, + "step": 600 + }, + { + "epoch": 1.06, + "grad_norm": 0.5505862832069397, + "learning_rate": 7.259563630006512e-05, + "loss": 0.1241, + "step": 601 + }, + { + "epoch": 1.07, + "grad_norm": 0.13375498354434967, + "learning_rate": 7.251243695912886e-05, + "loss": 0.0241, + "step": 602 + }, + { + "epoch": 1.07, + "grad_norm": 0.19569019973278046, + "learning_rate": 7.242915936135051e-05, + "loss": 0.0698, + "step": 603 + }, + { + "epoch": 1.07, + "grad_norm": 0.28432735800743103, + "learning_rate": 7.234580379621637e-05, + "loss": 0.0641, + "step": 604 + }, + { + "epoch": 1.07, + "grad_norm": 0.13998962938785553, + "learning_rate": 7.22623705534837e-05, + "loss": 0.0365, + "step": 605 + }, + { + "epoch": 1.07, + "grad_norm": 0.1772097498178482, + "learning_rate": 7.217885992317985e-05, + "loss": 0.081, + "step": 606 + }, + { + "epoch": 1.07, + "grad_norm": 0.4318295121192932, + "learning_rate": 7.209527219560119e-05, + "loss": 0.0532, + "step": 607 + }, + { + "epoch": 1.08, + "grad_norm": 0.2505156695842743, + "learning_rate": 7.201160766131207e-05, + "loss": 0.0667, + "step": 608 + }, + { + "epoch": 1.08, + "grad_norm": 0.13024090230464935, + "learning_rate": 7.192786661114384e-05, + "loss": 0.0234, + "step": 609 + }, + { + "epoch": 1.08, + "grad_norm": 0.2824789583683014, + "learning_rate": 7.184404933619377e-05, + "loss": 0.095, + "step": 610 + }, + { + "epoch": 1.08, + "grad_norm": 0.18059489130973816, + "learning_rate": 7.17601561278242e-05, + "loss": 0.0471, + "step": 611 + }, + { + "epoch": 1.08, + "grad_norm": 0.2839769124984741, + "learning_rate": 7.167618727766138e-05, + "loss": 0.0783, + "step": 612 + }, + { + "epoch": 1.08, + "grad_norm": 0.1342955082654953, + "learning_rate": 7.159214307759448e-05, + "loss": 0.0453, + "step": 613 + }, + { + "epoch": 1.09, + "grad_norm": 0.1336507499217987, + "learning_rate": 7.150802381977464e-05, + "loss": 0.0431, + "step": 614 + }, + { + "epoch": 1.09, + "grad_norm": 0.2953212559223175, + "learning_rate": 7.142382979661386e-05, + "loss": 0.0705, + "step": 615 + }, + { + "epoch": 1.09, + "grad_norm": 0.17532870173454285, + "learning_rate": 7.133956130078412e-05, + "loss": 0.0666, + "step": 616 + }, + { + "epoch": 1.09, + "grad_norm": 0.17404836416244507, + "learning_rate": 7.12552186252162e-05, + "loss": 0.0522, + "step": 617 + }, + { + "epoch": 1.09, + "grad_norm": 0.25231000781059265, + "learning_rate": 7.117080206309878e-05, + "loss": 0.0854, + "step": 618 + }, + { + "epoch": 1.1, + "grad_norm": 0.2264215499162674, + "learning_rate": 7.108631190787735e-05, + "loss": 0.0692, + "step": 619 + }, + { + "epoch": 1.1, + "grad_norm": 0.3555202782154083, + "learning_rate": 7.100174845325327e-05, + "loss": 0.074, + "step": 620 + }, + { + "epoch": 1.1, + "grad_norm": 0.34550729393959045, + "learning_rate": 7.091711199318264e-05, + "loss": 0.0831, + "step": 621 + }, + { + "epoch": 1.1, + "grad_norm": 0.14560338854789734, + "learning_rate": 7.083240282187543e-05, + "loss": 0.0404, + "step": 622 + }, + { + "epoch": 1.1, + "grad_norm": 0.23464788496494293, + "learning_rate": 7.074762123379423e-05, + "loss": 0.0699, + "step": 623 + }, + { + "epoch": 1.1, + "grad_norm": 0.22587832808494568, + "learning_rate": 7.066276752365352e-05, + "loss": 0.0887, + "step": 624 + }, + { + "epoch": 1.11, + "grad_norm": 0.17183855175971985, + "learning_rate": 7.057784198641834e-05, + "loss": 0.0373, + "step": 625 + }, + { + "epoch": 1.11, + "grad_norm": 0.19148162007331848, + "learning_rate": 7.049284491730354e-05, + "loss": 0.0289, + "step": 626 + }, + { + "epoch": 1.11, + "grad_norm": 0.26134082674980164, + "learning_rate": 7.040777661177251e-05, + "loss": 0.0367, + "step": 627 + }, + { + "epoch": 1.11, + "grad_norm": 0.5379131436347961, + "learning_rate": 7.032263736553635e-05, + "loss": 0.1049, + "step": 628 + }, + { + "epoch": 1.11, + "grad_norm": 0.13634662330150604, + "learning_rate": 7.023742747455276e-05, + "loss": 0.018, + "step": 629 + }, + { + "epoch": 1.11, + "grad_norm": 0.28767991065979004, + "learning_rate": 7.015214723502496e-05, + "loss": 0.06, + "step": 630 + }, + { + "epoch": 1.12, + "grad_norm": 0.2551933825016022, + "learning_rate": 7.006679694340073e-05, + "loss": 0.0407, + "step": 631 + }, + { + "epoch": 1.12, + "grad_norm": 0.41325151920318604, + "learning_rate": 6.998137689637142e-05, + "loss": 0.046, + "step": 632 + }, + { + "epoch": 1.12, + "grad_norm": 0.25655174255371094, + "learning_rate": 6.989588739087078e-05, + "loss": 0.0398, + "step": 633 + }, + { + "epoch": 1.12, + "grad_norm": 0.3917771279811859, + "learning_rate": 6.981032872407405e-05, + "loss": 0.1072, + "step": 634 + }, + { + "epoch": 1.12, + "grad_norm": 0.20738206803798676, + "learning_rate": 6.972470119339691e-05, + "loss": 0.0457, + "step": 635 + }, + { + "epoch": 1.13, + "grad_norm": 0.1865154653787613, + "learning_rate": 6.963900509649434e-05, + "loss": 0.0258, + "step": 636 + }, + { + "epoch": 1.13, + "grad_norm": 0.282071590423584, + "learning_rate": 6.955324073125979e-05, + "loss": 0.07, + "step": 637 + }, + { + "epoch": 1.13, + "grad_norm": 0.27442115545272827, + "learning_rate": 6.946740839582388e-05, + "loss": 0.0875, + "step": 638 + }, + { + "epoch": 1.13, + "grad_norm": 0.2635151445865631, + "learning_rate": 6.938150838855359e-05, + "loss": 0.0332, + "step": 639 + }, + { + "epoch": 1.13, + "grad_norm": 0.16783182322978973, + "learning_rate": 6.929554100805118e-05, + "loss": 0.0405, + "step": 640 + }, + { + "epoch": 1.13, + "grad_norm": 0.3328685760498047, + "learning_rate": 6.920950655315297e-05, + "loss": 0.1076, + "step": 641 + }, + { + "epoch": 1.14, + "grad_norm": 0.20146729052066803, + "learning_rate": 6.91234053229286e-05, + "loss": 0.0481, + "step": 642 + }, + { + "epoch": 1.14, + "grad_norm": 0.21599121391773224, + "learning_rate": 6.903723761667973e-05, + "loss": 0.0502, + "step": 643 + }, + { + "epoch": 1.14, + "grad_norm": 0.16269706189632416, + "learning_rate": 6.895100373393913e-05, + "loss": 0.0652, + "step": 644 + }, + { + "epoch": 1.14, + "grad_norm": 0.3716180622577667, + "learning_rate": 6.886470397446958e-05, + "loss": 0.0914, + "step": 645 + }, + { + "epoch": 1.14, + "grad_norm": 0.18003414571285248, + "learning_rate": 6.877833863826295e-05, + "loss": 0.0484, + "step": 646 + }, + { + "epoch": 1.14, + "grad_norm": 0.2745915949344635, + "learning_rate": 6.869190802553894e-05, + "loss": 0.1057, + "step": 647 + }, + { + "epoch": 1.15, + "grad_norm": 0.2507147490978241, + "learning_rate": 6.860541243674426e-05, + "loss": 0.0587, + "step": 648 + }, + { + "epoch": 1.15, + "grad_norm": 0.19874247908592224, + "learning_rate": 6.851885217255145e-05, + "loss": 0.0452, + "step": 649 + }, + { + "epoch": 1.15, + "grad_norm": 0.21256215870380402, + "learning_rate": 6.843222753385786e-05, + "loss": 0.0434, + "step": 650 + }, + { + "epoch": 1.15, + "grad_norm": 0.13569054007530212, + "learning_rate": 6.834553882178463e-05, + "loss": 0.0275, + "step": 651 + }, + { + "epoch": 1.15, + "grad_norm": 0.26869267225265503, + "learning_rate": 6.825878633767563e-05, + "loss": 0.1006, + "step": 652 + }, + { + "epoch": 1.16, + "grad_norm": 0.21900776028633118, + "learning_rate": 6.817197038309644e-05, + "loss": 0.0564, + "step": 653 + }, + { + "epoch": 1.16, + "grad_norm": 0.13306765258312225, + "learning_rate": 6.80850912598332e-05, + "loss": 0.0304, + "step": 654 + }, + { + "epoch": 1.16, + "grad_norm": 0.09863998740911484, + "learning_rate": 6.79981492698917e-05, + "loss": 0.0178, + "step": 655 + }, + { + "epoch": 1.16, + "grad_norm": 0.2170545756816864, + "learning_rate": 6.791114471549627e-05, + "loss": 0.0754, + "step": 656 + }, + { + "epoch": 1.16, + "grad_norm": 0.26661446690559387, + "learning_rate": 6.782407789908863e-05, + "loss": 0.1083, + "step": 657 + }, + { + "epoch": 1.16, + "grad_norm": 0.2747049629688263, + "learning_rate": 6.773694912332707e-05, + "loss": 0.0758, + "step": 658 + }, + { + "epoch": 1.17, + "grad_norm": 0.252560019493103, + "learning_rate": 6.764975869108514e-05, + "loss": 0.0681, + "step": 659 + }, + { + "epoch": 1.17, + "grad_norm": 0.2867240905761719, + "learning_rate": 6.756250690545079e-05, + "loss": 0.095, + "step": 660 + }, + { + "epoch": 1.17, + "grad_norm": 0.09760677814483643, + "learning_rate": 6.747519406972524e-05, + "loss": 0.0123, + "step": 661 + }, + { + "epoch": 1.17, + "grad_norm": 0.17589041590690613, + "learning_rate": 6.738782048742187e-05, + "loss": 0.0437, + "step": 662 + }, + { + "epoch": 1.17, + "grad_norm": 0.29266613721847534, + "learning_rate": 6.730038646226532e-05, + "loss": 0.0706, + "step": 663 + }, + { + "epoch": 1.17, + "grad_norm": 0.1055804044008255, + "learning_rate": 6.721289229819024e-05, + "loss": 0.0343, + "step": 664 + }, + { + "epoch": 1.18, + "grad_norm": 0.2909635305404663, + "learning_rate": 6.712533829934042e-05, + "loss": 0.0817, + "step": 665 + }, + { + "epoch": 1.18, + "grad_norm": 0.2604895532131195, + "learning_rate": 6.703772477006757e-05, + "loss": 0.0452, + "step": 666 + }, + { + "epoch": 1.18, + "grad_norm": 0.10520771890878677, + "learning_rate": 6.695005201493038e-05, + "loss": 0.0215, + "step": 667 + }, + { + "epoch": 1.18, + "grad_norm": 0.10080817341804504, + "learning_rate": 6.686232033869344e-05, + "loss": 0.0188, + "step": 668 + }, + { + "epoch": 1.18, + "grad_norm": 0.3340647220611572, + "learning_rate": 6.677453004632608e-05, + "loss": 0.0612, + "step": 669 + }, + { + "epoch": 1.19, + "grad_norm": 0.29719796776771545, + "learning_rate": 6.668668144300149e-05, + "loss": 0.1014, + "step": 670 + }, + { + "epoch": 1.19, + "grad_norm": 0.2131602168083191, + "learning_rate": 6.659877483409545e-05, + "loss": 0.0621, + "step": 671 + }, + { + "epoch": 1.19, + "grad_norm": 0.1867963820695877, + "learning_rate": 6.65108105251855e-05, + "loss": 0.0312, + "step": 672 + }, + { + "epoch": 1.19, + "grad_norm": 0.4250008463859558, + "learning_rate": 6.642278882204963e-05, + "loss": 0.0684, + "step": 673 + }, + { + "epoch": 1.19, + "grad_norm": 0.20828047394752502, + "learning_rate": 6.633471003066543e-05, + "loss": 0.0421, + "step": 674 + }, + { + "epoch": 1.19, + "grad_norm": 0.23356445133686066, + "learning_rate": 6.62465744572089e-05, + "loss": 0.0277, + "step": 675 + }, + { + "epoch": 1.2, + "grad_norm": 0.42427390813827515, + "learning_rate": 6.615838240805344e-05, + "loss": 0.0745, + "step": 676 + }, + { + "epoch": 1.2, + "grad_norm": 0.23298533260822296, + "learning_rate": 6.607013418976874e-05, + "loss": 0.047, + "step": 677 + }, + { + "epoch": 1.2, + "grad_norm": 0.5681192278862, + "learning_rate": 6.598183010911978e-05, + "loss": 0.1032, + "step": 678 + }, + { + "epoch": 1.2, + "grad_norm": 0.15370431542396545, + "learning_rate": 6.589347047306571e-05, + "loss": 0.0224, + "step": 679 + }, + { + "epoch": 1.2, + "grad_norm": 0.2974132001399994, + "learning_rate": 6.580505558875877e-05, + "loss": 0.0908, + "step": 680 + }, + { + "epoch": 1.2, + "grad_norm": 0.12158460170030594, + "learning_rate": 6.571658576354333e-05, + "loss": 0.0212, + "step": 681 + }, + { + "epoch": 1.21, + "grad_norm": 0.32594335079193115, + "learning_rate": 6.562806130495467e-05, + "loss": 0.1016, + "step": 682 + }, + { + "epoch": 1.21, + "grad_norm": 0.3316996097564697, + "learning_rate": 6.5539482520718e-05, + "loss": 0.0639, + "step": 683 + }, + { + "epoch": 1.21, + "grad_norm": 0.21660655736923218, + "learning_rate": 6.545084971874738e-05, + "loss": 0.043, + "step": 684 + }, + { + "epoch": 1.21, + "grad_norm": 0.350033164024353, + "learning_rate": 6.536216320714466e-05, + "loss": 0.0752, + "step": 685 + }, + { + "epoch": 1.21, + "grad_norm": 0.30745336413383484, + "learning_rate": 6.527342329419837e-05, + "loss": 0.0927, + "step": 686 + }, + { + "epoch": 1.22, + "grad_norm": 0.24984771013259888, + "learning_rate": 6.51846302883827e-05, + "loss": 0.0685, + "step": 687 + }, + { + "epoch": 1.22, + "grad_norm": 0.07773179560899734, + "learning_rate": 6.509578449835636e-05, + "loss": 0.0152, + "step": 688 + }, + { + "epoch": 1.22, + "grad_norm": 0.1620987057685852, + "learning_rate": 6.500688623296159e-05, + "loss": 0.0514, + "step": 689 + }, + { + "epoch": 1.22, + "grad_norm": 0.1917831003665924, + "learning_rate": 6.491793580122301e-05, + "loss": 0.066, + "step": 690 + }, + { + "epoch": 1.22, + "grad_norm": 0.21920029819011688, + "learning_rate": 6.482893351234658e-05, + "loss": 0.0547, + "step": 691 + }, + { + "epoch": 1.22, + "grad_norm": 0.29076483845710754, + "learning_rate": 6.473987967571856e-05, + "loss": 0.079, + "step": 692 + }, + { + "epoch": 1.23, + "grad_norm": 0.30292215943336487, + "learning_rate": 6.46507746009043e-05, + "loss": 0.0957, + "step": 693 + }, + { + "epoch": 1.23, + "grad_norm": 0.14139439165592194, + "learning_rate": 6.456161859764744e-05, + "loss": 0.0346, + "step": 694 + }, + { + "epoch": 1.23, + "grad_norm": 0.22850438952445984, + "learning_rate": 6.447241197586847e-05, + "loss": 0.0744, + "step": 695 + }, + { + "epoch": 1.23, + "grad_norm": 0.48915836215019226, + "learning_rate": 6.438315504566397e-05, + "loss": 0.0953, + "step": 696 + }, + { + "epoch": 1.23, + "grad_norm": 0.17644958198070526, + "learning_rate": 6.429384811730528e-05, + "loss": 0.046, + "step": 697 + }, + { + "epoch": 1.23, + "grad_norm": 0.2039819210767746, + "learning_rate": 6.420449150123767e-05, + "loss": 0.1052, + "step": 698 + }, + { + "epoch": 1.24, + "grad_norm": 0.17715586721897125, + "learning_rate": 6.411508550807906e-05, + "loss": 0.0447, + "step": 699 + }, + { + "epoch": 1.24, + "grad_norm": 0.16100600361824036, + "learning_rate": 6.4025630448619e-05, + "loss": 0.0344, + "step": 700 + }, + { + "epoch": 1.24, + "grad_norm": 0.22480256855487823, + "learning_rate": 6.393612663381763e-05, + "loss": 0.0495, + "step": 701 + }, + { + "epoch": 1.24, + "grad_norm": 0.12992677092552185, + "learning_rate": 6.384657437480458e-05, + "loss": 0.0409, + "step": 702 + }, + { + "epoch": 1.24, + "grad_norm": 0.1325366348028183, + "learning_rate": 6.375697398287787e-05, + "loss": 0.0257, + "step": 703 + }, + { + "epoch": 1.25, + "grad_norm": 0.16241514682769775, + "learning_rate": 6.366732576950284e-05, + "loss": 0.0427, + "step": 704 + }, + { + "epoch": 1.25, + "grad_norm": 0.21476183831691742, + "learning_rate": 6.357763004631104e-05, + "loss": 0.0451, + "step": 705 + }, + { + "epoch": 1.25, + "grad_norm": 0.32039332389831543, + "learning_rate": 6.34878871250992e-05, + "loss": 0.0545, + "step": 706 + }, + { + "epoch": 1.25, + "grad_norm": 0.3203076124191284, + "learning_rate": 6.33980973178281e-05, + "loss": 0.0917, + "step": 707 + }, + { + "epoch": 1.25, + "grad_norm": 0.25006967782974243, + "learning_rate": 6.330826093662156e-05, + "loss": 0.1028, + "step": 708 + }, + { + "epoch": 1.25, + "grad_norm": 0.52630215883255, + "learning_rate": 6.32183782937652e-05, + "loss": 0.0889, + "step": 709 + }, + { + "epoch": 1.26, + "grad_norm": 0.33741331100463867, + "learning_rate": 6.31284497017055e-05, + "loss": 0.0725, + "step": 710 + }, + { + "epoch": 1.26, + "eval_loss": 0.07285241782665253, + "eval_runtime": 14.6756, + "eval_samples_per_second": 32.503, + "eval_steps_per_second": 8.177, + "step": 710 + }, + { + "epoch": 1.26, + "grad_norm": 0.40746867656707764, + "learning_rate": 6.303847547304873e-05, + "loss": 0.0945, + "step": 711 + }, + { + "epoch": 1.26, + "grad_norm": 0.22757941484451294, + "learning_rate": 6.294845592055967e-05, + "loss": 0.0532, + "step": 712 + }, + { + "epoch": 1.26, + "grad_norm": 0.19006334245204926, + "learning_rate": 6.285839135716079e-05, + "loss": 0.0484, + "step": 713 + }, + { + "epoch": 1.26, + "grad_norm": 0.48126357793807983, + "learning_rate": 6.27682820959309e-05, + "loss": 0.0967, + "step": 714 + }, + { + "epoch": 1.26, + "grad_norm": 0.23766569793224335, + "learning_rate": 6.26781284501043e-05, + "loss": 0.106, + "step": 715 + }, + { + "epoch": 1.27, + "grad_norm": 0.16818860173225403, + "learning_rate": 6.258793073306949e-05, + "loss": 0.0494, + "step": 716 + }, + { + "epoch": 1.27, + "grad_norm": 0.28579115867614746, + "learning_rate": 6.249768925836822e-05, + "loss": 0.0937, + "step": 717 + }, + { + "epoch": 1.27, + "grad_norm": 0.16623319685459137, + "learning_rate": 6.240740433969432e-05, + "loss": 0.0301, + "step": 718 + }, + { + "epoch": 1.27, + "grad_norm": 0.1560198813676834, + "learning_rate": 6.231707629089262e-05, + "loss": 0.0384, + "step": 719 + }, + { + "epoch": 1.27, + "grad_norm": 0.2002251148223877, + "learning_rate": 6.2226705425958e-05, + "loss": 0.0545, + "step": 720 + }, + { + "epoch": 1.28, + "grad_norm": 0.16213096678256989, + "learning_rate": 6.2136292059034e-05, + "loss": 0.0433, + "step": 721 + }, + { + "epoch": 1.28, + "grad_norm": 0.27064821124076843, + "learning_rate": 6.204583650441201e-05, + "loss": 0.0796, + "step": 722 + }, + { + "epoch": 1.28, + "grad_norm": 0.11131159216165543, + "learning_rate": 6.195533907653004e-05, + "loss": 0.0229, + "step": 723 + }, + { + "epoch": 1.28, + "grad_norm": 0.22354401648044586, + "learning_rate": 6.18648000899717e-05, + "loss": 0.0475, + "step": 724 + }, + { + "epoch": 1.28, + "grad_norm": 0.19944117963314056, + "learning_rate": 6.177421985946499e-05, + "loss": 0.0413, + "step": 725 + }, + { + "epoch": 1.28, + "grad_norm": 0.32458746433258057, + "learning_rate": 6.168359869988134e-05, + "loss": 0.1205, + "step": 726 + }, + { + "epoch": 1.29, + "grad_norm": 0.19088833034038544, + "learning_rate": 6.159293692623443e-05, + "loss": 0.0626, + "step": 727 + }, + { + "epoch": 1.29, + "grad_norm": 0.2114744633436203, + "learning_rate": 6.150223485367914e-05, + "loss": 0.048, + "step": 728 + }, + { + "epoch": 1.29, + "grad_norm": 0.11308068782091141, + "learning_rate": 6.141149279751043e-05, + "loss": 0.0286, + "step": 729 + }, + { + "epoch": 1.29, + "grad_norm": 0.22453975677490234, + "learning_rate": 6.13207110731622e-05, + "loss": 0.0279, + "step": 730 + }, + { + "epoch": 1.29, + "grad_norm": 0.274513840675354, + "learning_rate": 6.122988999620634e-05, + "loss": 0.0553, + "step": 731 + }, + { + "epoch": 1.3, + "grad_norm": 0.2700372636318207, + "learning_rate": 6.113902988235145e-05, + "loss": 0.0973, + "step": 732 + }, + { + "epoch": 1.3, + "grad_norm": 0.3287579119205475, + "learning_rate": 6.104813104744188e-05, + "loss": 0.0853, + "step": 733 + }, + { + "epoch": 1.3, + "grad_norm": 0.37582048773765564, + "learning_rate": 6.095719380745654e-05, + "loss": 0.088, + "step": 734 + }, + { + "epoch": 1.3, + "grad_norm": 0.2009502649307251, + "learning_rate": 6.086621847850788e-05, + "loss": 0.0525, + "step": 735 + }, + { + "epoch": 1.3, + "grad_norm": 0.1417909413576126, + "learning_rate": 6.077520537684072e-05, + "loss": 0.0311, + "step": 736 + }, + { + "epoch": 1.3, + "grad_norm": 0.18328174948692322, + "learning_rate": 6.068415481883122e-05, + "loss": 0.0379, + "step": 737 + }, + { + "epoch": 1.31, + "grad_norm": 0.1513252556324005, + "learning_rate": 6.059306712098571e-05, + "loss": 0.0319, + "step": 738 + }, + { + "epoch": 1.31, + "grad_norm": 0.2544059753417969, + "learning_rate": 6.0501942599939666e-05, + "loss": 0.0593, + "step": 739 + }, + { + "epoch": 1.31, + "grad_norm": 0.4964008331298828, + "learning_rate": 6.0410781572456486e-05, + "loss": 0.0367, + "step": 740 + }, + { + "epoch": 1.31, + "grad_norm": 0.14776591956615448, + "learning_rate": 6.031958435542659e-05, + "loss": 0.0284, + "step": 741 + }, + { + "epoch": 1.31, + "grad_norm": 0.20590472221374512, + "learning_rate": 6.022835126586609e-05, + "loss": 0.0359, + "step": 742 + }, + { + "epoch": 1.31, + "grad_norm": 0.2493211179971695, + "learning_rate": 6.0137082620915863e-05, + "loss": 0.0424, + "step": 743 + }, + { + "epoch": 1.32, + "grad_norm": 0.31557443737983704, + "learning_rate": 6.0045778737840344e-05, + "loss": 0.0563, + "step": 744 + }, + { + "epoch": 1.32, + "grad_norm": 0.14257828891277313, + "learning_rate": 5.995443993402647e-05, + "loss": 0.024, + "step": 745 + }, + { + "epoch": 1.32, + "grad_norm": 0.21385452151298523, + "learning_rate": 5.9863066526982605e-05, + "loss": 0.0721, + "step": 746 + }, + { + "epoch": 1.32, + "grad_norm": 0.17539048194885254, + "learning_rate": 5.977165883433734e-05, + "loss": 0.025, + "step": 747 + }, + { + "epoch": 1.32, + "grad_norm": 0.28508231043815613, + "learning_rate": 5.9680217173838494e-05, + "loss": 0.0595, + "step": 748 + }, + { + "epoch": 1.33, + "grad_norm": 0.30929744243621826, + "learning_rate": 5.9588741863351924e-05, + "loss": 0.112, + "step": 749 + }, + { + "epoch": 1.33, + "grad_norm": 0.439656525850296, + "learning_rate": 5.949723322086053e-05, + "loss": 0.0427, + "step": 750 + }, + { + "epoch": 1.33, + "grad_norm": 0.2300054430961609, + "learning_rate": 5.940569156446298e-05, + "loss": 0.0437, + "step": 751 + }, + { + "epoch": 1.33, + "grad_norm": 0.4155109226703644, + "learning_rate": 5.931411721237279e-05, + "loss": 0.0569, + "step": 752 + }, + { + "epoch": 1.33, + "grad_norm": 0.25196224451065063, + "learning_rate": 5.922251048291707e-05, + "loss": 0.0413, + "step": 753 + }, + { + "epoch": 1.33, + "grad_norm": 0.5078486204147339, + "learning_rate": 5.913087169453554e-05, + "loss": 0.0988, + "step": 754 + }, + { + "epoch": 1.34, + "grad_norm": 0.26931652426719666, + "learning_rate": 5.9039201165779315e-05, + "loss": 0.0578, + "step": 755 + }, + { + "epoch": 1.34, + "grad_norm": 0.2641213834285736, + "learning_rate": 5.8947499215309834e-05, + "loss": 0.0362, + "step": 756 + }, + { + "epoch": 1.34, + "grad_norm": 0.23865339159965515, + "learning_rate": 5.8855766161897805e-05, + "loss": 0.0375, + "step": 757 + }, + { + "epoch": 1.34, + "grad_norm": 0.2594137191772461, + "learning_rate": 5.876400232442205e-05, + "loss": 0.0489, + "step": 758 + }, + { + "epoch": 1.34, + "grad_norm": 0.2721590995788574, + "learning_rate": 5.867220802186837e-05, + "loss": 0.0407, + "step": 759 + }, + { + "epoch": 1.34, + "grad_norm": 0.3681499660015106, + "learning_rate": 5.85803835733285e-05, + "loss": 0.0554, + "step": 760 + }, + { + "epoch": 1.35, + "grad_norm": 0.3132595121860504, + "learning_rate": 5.848852929799894e-05, + "loss": 0.0486, + "step": 761 + }, + { + "epoch": 1.35, + "grad_norm": 0.16972127556800842, + "learning_rate": 5.8396645515179884e-05, + "loss": 0.0473, + "step": 762 + }, + { + "epoch": 1.35, + "grad_norm": 0.30628886818885803, + "learning_rate": 5.83047325442741e-05, + "loss": 0.0664, + "step": 763 + }, + { + "epoch": 1.35, + "grad_norm": 0.3327179551124573, + "learning_rate": 5.8212790704785824e-05, + "loss": 0.0605, + "step": 764 + }, + { + "epoch": 1.35, + "grad_norm": 0.3301398754119873, + "learning_rate": 5.812082031631966e-05, + "loss": 0.0477, + "step": 765 + }, + { + "epoch": 1.36, + "grad_norm": 0.23960134387016296, + "learning_rate": 5.8028821698579385e-05, + "loss": 0.0376, + "step": 766 + }, + { + "epoch": 1.36, + "grad_norm": 0.2526357173919678, + "learning_rate": 5.7936795171367e-05, + "loss": 0.0712, + "step": 767 + }, + { + "epoch": 1.36, + "grad_norm": 0.32746273279190063, + "learning_rate": 5.784474105458143e-05, + "loss": 0.0542, + "step": 768 + }, + { + "epoch": 1.36, + "grad_norm": 0.10859230905771255, + "learning_rate": 5.77526596682176e-05, + "loss": 0.019, + "step": 769 + }, + { + "epoch": 1.36, + "grad_norm": 0.2908915877342224, + "learning_rate": 5.766055133236513e-05, + "loss": 0.142, + "step": 770 + }, + { + "epoch": 1.36, + "grad_norm": 0.26869770884513855, + "learning_rate": 5.7568416367207404e-05, + "loss": 0.0774, + "step": 771 + }, + { + "epoch": 1.37, + "grad_norm": 0.39681994915008545, + "learning_rate": 5.7476255093020326e-05, + "loss": 0.0632, + "step": 772 + }, + { + "epoch": 1.37, + "grad_norm": 0.14335761964321136, + "learning_rate": 5.7384067830171274e-05, + "loss": 0.03, + "step": 773 + }, + { + "epoch": 1.37, + "grad_norm": 0.1377771943807602, + "learning_rate": 5.729185489911797e-05, + "loss": 0.0263, + "step": 774 + }, + { + "epoch": 1.37, + "grad_norm": 0.19834232330322266, + "learning_rate": 5.719961662040733e-05, + "loss": 0.0506, + "step": 775 + }, + { + "epoch": 1.37, + "grad_norm": 0.14378659427165985, + "learning_rate": 5.710735331467444e-05, + "loss": 0.0285, + "step": 776 + }, + { + "epoch": 1.37, + "grad_norm": 0.25368401408195496, + "learning_rate": 5.701506530264132e-05, + "loss": 0.0584, + "step": 777 + }, + { + "epoch": 1.38, + "grad_norm": 0.12339203804731369, + "learning_rate": 5.692275290511592e-05, + "loss": 0.0282, + "step": 778 + }, + { + "epoch": 1.38, + "grad_norm": 0.203715518116951, + "learning_rate": 5.683041644299093e-05, + "loss": 0.0849, + "step": 779 + }, + { + "epoch": 1.38, + "grad_norm": 0.1526814103126526, + "learning_rate": 5.673805623724272e-05, + "loss": 0.0256, + "step": 780 + }, + { + "epoch": 1.38, + "grad_norm": 0.18840323388576508, + "learning_rate": 5.664567260893019e-05, + "loss": 0.048, + "step": 781 + }, + { + "epoch": 1.38, + "grad_norm": 0.15979206562042236, + "learning_rate": 5.6553265879193606e-05, + "loss": 0.0237, + "step": 782 + }, + { + "epoch": 1.39, + "grad_norm": 0.1128401905298233, + "learning_rate": 5.6460836369253624e-05, + "loss": 0.0213, + "step": 783 + }, + { + "epoch": 1.39, + "grad_norm": 0.1648949831724167, + "learning_rate": 5.6368384400410035e-05, + "loss": 0.0348, + "step": 784 + }, + { + "epoch": 1.39, + "grad_norm": 0.31091129779815674, + "learning_rate": 5.627591029404071e-05, + "loss": 0.0685, + "step": 785 + }, + { + "epoch": 1.39, + "grad_norm": 0.2921251654624939, + "learning_rate": 5.6183414371600496e-05, + "loss": 0.045, + "step": 786 + }, + { + "epoch": 1.39, + "grad_norm": 0.3398689925670624, + "learning_rate": 5.609089695462002e-05, + "loss": 0.0546, + "step": 787 + }, + { + "epoch": 1.39, + "grad_norm": 0.21610289812088013, + "learning_rate": 5.599835836470469e-05, + "loss": 0.0322, + "step": 788 + }, + { + "epoch": 1.4, + "grad_norm": 0.3218781054019928, + "learning_rate": 5.5905798923533484e-05, + "loss": 0.0331, + "step": 789 + }, + { + "epoch": 1.4, + "grad_norm": 0.5338783860206604, + "learning_rate": 5.581321895285787e-05, + "loss": 0.0764, + "step": 790 + }, + { + "epoch": 1.4, + "grad_norm": 0.2539553940296173, + "learning_rate": 5.5720618774500675e-05, + "loss": 0.0553, + "step": 791 + }, + { + "epoch": 1.4, + "grad_norm": 0.22199298441410065, + "learning_rate": 5.5627998710354957e-05, + "loss": 0.0304, + "step": 792 + }, + { + "epoch": 1.4, + "grad_norm": 0.13213643431663513, + "learning_rate": 5.5535359082382944e-05, + "loss": 0.0115, + "step": 793 + }, + { + "epoch": 1.4, + "grad_norm": 0.3686007857322693, + "learning_rate": 5.544270021261483e-05, + "loss": 0.0371, + "step": 794 + }, + { + "epoch": 1.41, + "grad_norm": 0.08815140277147293, + "learning_rate": 5.535002242314772e-05, + "loss": 0.0089, + "step": 795 + }, + { + "epoch": 1.41, + "grad_norm": 0.40321916341781616, + "learning_rate": 5.525732603614444e-05, + "loss": 0.0653, + "step": 796 + }, + { + "epoch": 1.41, + "grad_norm": 0.44097116589546204, + "learning_rate": 5.5164611373832544e-05, + "loss": 0.0555, + "step": 797 + }, + { + "epoch": 1.41, + "grad_norm": 0.5565125942230225, + "learning_rate": 5.5071878758503046e-05, + "loss": 0.0646, + "step": 798 + }, + { + "epoch": 1.41, + "grad_norm": 0.8579866290092468, + "learning_rate": 5.49791285125094e-05, + "loss": 0.1532, + "step": 799 + }, + { + "epoch": 1.42, + "grad_norm": 0.550639271736145, + "learning_rate": 5.488636095826636e-05, + "loss": 0.0574, + "step": 800 + }, + { + "epoch": 1.42, + "grad_norm": 0.07725897431373596, + "learning_rate": 5.479357641824877e-05, + "loss": 0.0087, + "step": 801 + }, + { + "epoch": 1.42, + "grad_norm": 0.25981655716896057, + "learning_rate": 5.470077521499063e-05, + "loss": 0.0328, + "step": 802 + }, + { + "epoch": 1.42, + "grad_norm": 0.07465404272079468, + "learning_rate": 5.4607957671083786e-05, + "loss": 0.0117, + "step": 803 + }, + { + "epoch": 1.42, + "grad_norm": 0.22613628208637238, + "learning_rate": 5.4515124109176904e-05, + "loss": 0.0596, + "step": 804 + }, + { + "epoch": 1.42, + "grad_norm": 0.20493067800998688, + "learning_rate": 5.442227485197435e-05, + "loss": 0.0394, + "step": 805 + }, + { + "epoch": 1.43, + "grad_norm": 0.2182394117116928, + "learning_rate": 5.4329410222235034e-05, + "loss": 0.0491, + "step": 806 + }, + { + "epoch": 1.43, + "grad_norm": 0.16270771622657776, + "learning_rate": 5.42365305427713e-05, + "loss": 0.0333, + "step": 807 + }, + { + "epoch": 1.43, + "grad_norm": 0.3527982234954834, + "learning_rate": 5.414363613644782e-05, + "loss": 0.1369, + "step": 808 + }, + { + "epoch": 1.43, + "grad_norm": 0.38832610845565796, + "learning_rate": 5.405072732618043e-05, + "loss": 0.0719, + "step": 809 + }, + { + "epoch": 1.43, + "grad_norm": 0.18581318855285645, + "learning_rate": 5.395780443493508e-05, + "loss": 0.0305, + "step": 810 + }, + { + "epoch": 1.43, + "grad_norm": 0.298115611076355, + "learning_rate": 5.386486778572665e-05, + "loss": 0.0676, + "step": 811 + }, + { + "epoch": 1.44, + "grad_norm": 0.15877433121204376, + "learning_rate": 5.3771917701617827e-05, + "loss": 0.0343, + "step": 812 + }, + { + "epoch": 1.44, + "grad_norm": 0.14393776655197144, + "learning_rate": 5.367895450571801e-05, + "loss": 0.0395, + "step": 813 + }, + { + "epoch": 1.44, + "grad_norm": 0.25177934765815735, + "learning_rate": 5.358597852118219e-05, + "loss": 0.0757, + "step": 814 + }, + { + "epoch": 1.44, + "grad_norm": 0.16125288605690002, + "learning_rate": 5.3492990071209806e-05, + "loss": 0.0432, + "step": 815 + }, + { + "epoch": 1.44, + "grad_norm": 0.21766537427902222, + "learning_rate": 5.3399989479043624e-05, + "loss": 0.087, + "step": 816 + }, + { + "epoch": 1.45, + "grad_norm": 0.18221743404865265, + "learning_rate": 5.3306977067968614e-05, + "loss": 0.0403, + "step": 817 + }, + { + "epoch": 1.45, + "grad_norm": 0.2517869472503662, + "learning_rate": 5.3213953161310825e-05, + "loss": 0.0666, + "step": 818 + }, + { + "epoch": 1.45, + "grad_norm": 0.20983122289180756, + "learning_rate": 5.3120918082436314e-05, + "loss": 0.0664, + "step": 819 + }, + { + "epoch": 1.45, + "grad_norm": 0.17914025485515594, + "learning_rate": 5.3027872154749915e-05, + "loss": 0.0382, + "step": 820 + }, + { + "epoch": 1.45, + "grad_norm": 0.12290598452091217, + "learning_rate": 5.2934815701694204e-05, + "loss": 0.0282, + "step": 821 + }, + { + "epoch": 1.45, + "grad_norm": 0.17699532210826874, + "learning_rate": 5.2841749046748345e-05, + "loss": 0.0413, + "step": 822 + }, + { + "epoch": 1.46, + "grad_norm": 0.3889511227607727, + "learning_rate": 5.274867251342694e-05, + "loss": 0.0758, + "step": 823 + }, + { + "epoch": 1.46, + "grad_norm": 0.24286973476409912, + "learning_rate": 5.2655586425278966e-05, + "loss": 0.0532, + "step": 824 + }, + { + "epoch": 1.46, + "grad_norm": 0.14911137521266937, + "learning_rate": 5.256249110588659e-05, + "loss": 0.0277, + "step": 825 + }, + { + "epoch": 1.46, + "grad_norm": 0.31271466612815857, + "learning_rate": 5.246938687886409e-05, + "loss": 0.0726, + "step": 826 + }, + { + "epoch": 1.46, + "grad_norm": 0.2684333920478821, + "learning_rate": 5.237627406785667e-05, + "loss": 0.0993, + "step": 827 + }, + { + "epoch": 1.46, + "grad_norm": 0.14797139167785645, + "learning_rate": 5.228315299653942e-05, + "loss": 0.0198, + "step": 828 + }, + { + "epoch": 1.47, + "grad_norm": 0.29548555612564087, + "learning_rate": 5.2190023988616113e-05, + "loss": 0.0562, + "step": 829 + }, + { + "epoch": 1.47, + "grad_norm": 0.29321712255477905, + "learning_rate": 5.2096887367818105e-05, + "loss": 0.1208, + "step": 830 + }, + { + "epoch": 1.47, + "grad_norm": 0.18331380188465118, + "learning_rate": 5.2003743457903256e-05, + "loss": 0.0256, + "step": 831 + }, + { + "epoch": 1.47, + "grad_norm": 0.21740898489952087, + "learning_rate": 5.1910592582654715e-05, + "loss": 0.057, + "step": 832 + }, + { + "epoch": 1.47, + "grad_norm": 0.2625051736831665, + "learning_rate": 5.181743506587989e-05, + "loss": 0.0667, + "step": 833 + }, + { + "epoch": 1.48, + "grad_norm": 0.2670525908470154, + "learning_rate": 5.172427123140923e-05, + "loss": 0.0883, + "step": 834 + }, + { + "epoch": 1.48, + "grad_norm": 0.33282265067100525, + "learning_rate": 5.1631101403095184e-05, + "loss": 0.0424, + "step": 835 + }, + { + "epoch": 1.48, + "grad_norm": 0.21608753502368927, + "learning_rate": 5.1537925904811004e-05, + "loss": 0.049, + "step": 836 + }, + { + "epoch": 1.48, + "grad_norm": 0.10450909286737442, + "learning_rate": 5.144474506044968e-05, + "loss": 0.0158, + "step": 837 + }, + { + "epoch": 1.48, + "grad_norm": 0.3188491761684418, + "learning_rate": 5.135155919392279e-05, + "loss": 0.0547, + "step": 838 + }, + { + "epoch": 1.48, + "grad_norm": 0.24398969113826752, + "learning_rate": 5.125836862915934e-05, + "loss": 0.053, + "step": 839 + }, + { + "epoch": 1.49, + "grad_norm": 0.1743936687707901, + "learning_rate": 5.116517369010466e-05, + "loss": 0.0239, + "step": 840 + }, + { + "epoch": 1.49, + "grad_norm": 0.180791437625885, + "learning_rate": 5.1071974700719326e-05, + "loss": 0.0864, + "step": 841 + }, + { + "epoch": 1.49, + "grad_norm": 0.19678902626037598, + "learning_rate": 5.0978771984978003e-05, + "loss": 0.0376, + "step": 842 + }, + { + "epoch": 1.49, + "grad_norm": 0.230797678232193, + "learning_rate": 5.0885565866868227e-05, + "loss": 0.0597, + "step": 843 + }, + { + "epoch": 1.49, + "grad_norm": 0.4890972971916199, + "learning_rate": 5.079235667038944e-05, + "loss": 0.0832, + "step": 844 + }, + { + "epoch": 1.49, + "grad_norm": 0.20508797466754913, + "learning_rate": 5.069914471955178e-05, + "loss": 0.0349, + "step": 845 + }, + { + "epoch": 1.5, + "grad_norm": 0.21593628823757172, + "learning_rate": 5.060593033837493e-05, + "loss": 0.0354, + "step": 846 + }, + { + "epoch": 1.5, + "grad_norm": 0.2712628245353699, + "learning_rate": 5.051271385088702e-05, + "loss": 0.0311, + "step": 847 + }, + { + "epoch": 1.5, + "grad_norm": 0.11844774335622787, + "learning_rate": 5.041949558112351e-05, + "loss": 0.0109, + "step": 848 + }, + { + "epoch": 1.5, + "grad_norm": 0.1798882633447647, + "learning_rate": 5.032627585312608e-05, + "loss": 0.0196, + "step": 849 + }, + { + "epoch": 1.5, + "grad_norm": 0.290019690990448, + "learning_rate": 5.023305499094144e-05, + "loss": 0.0667, + "step": 850 + }, + { + "epoch": 1.51, + "grad_norm": 0.24924272298812866, + "learning_rate": 5.013983331862027e-05, + "loss": 0.0556, + "step": 851 + }, + { + "epoch": 1.51, + "grad_norm": 0.22597135603427887, + "learning_rate": 5.004661116021605e-05, + "loss": 0.0495, + "step": 852 + }, + { + "epoch": 1.51, + "eval_loss": 0.07545028626918793, + "eval_runtime": 14.6561, + "eval_samples_per_second": 32.546, + "eval_steps_per_second": 8.188, + "step": 852 + }, + { + "epoch": 1.51, + "grad_norm": 0.15348747372627258, + "learning_rate": 4.9953388839783954e-05, + "loss": 0.0204, + "step": 853 + }, + { + "epoch": 1.51, + "grad_norm": 0.6507572531700134, + "learning_rate": 4.9860166681379745e-05, + "loss": 0.076, + "step": 854 + }, + { + "epoch": 1.51, + "grad_norm": 0.13102935254573822, + "learning_rate": 4.976694500905857e-05, + "loss": 0.0143, + "step": 855 + }, + { + "epoch": 1.51, + "grad_norm": 0.43004414439201355, + "learning_rate": 4.967372414687393e-05, + "loss": 0.0675, + "step": 856 + }, + { + "epoch": 1.52, + "grad_norm": 0.38339918851852417, + "learning_rate": 4.95805044188765e-05, + "loss": 0.0747, + "step": 857 + }, + { + "epoch": 1.52, + "grad_norm": 0.4646240472793579, + "learning_rate": 4.9487286149112986e-05, + "loss": 0.0883, + "step": 858 + }, + { + "epoch": 1.52, + "grad_norm": 0.2721651792526245, + "learning_rate": 4.9394069661625076e-05, + "loss": 0.062, + "step": 859 + }, + { + "epoch": 1.52, + "grad_norm": 0.356275349855423, + "learning_rate": 4.930085528044823e-05, + "loss": 0.0321, + "step": 860 + }, + { + "epoch": 1.52, + "grad_norm": 0.3066048324108124, + "learning_rate": 4.9207643329610556e-05, + "loss": 0.0525, + "step": 861 + }, + { + "epoch": 1.52, + "grad_norm": 0.12254035472869873, + "learning_rate": 4.911443413313179e-05, + "loss": 0.0106, + "step": 862 + }, + { + "epoch": 1.53, + "grad_norm": 0.19596797227859497, + "learning_rate": 4.9021228015022015e-05, + "loss": 0.0242, + "step": 863 + }, + { + "epoch": 1.53, + "grad_norm": 0.39066678285598755, + "learning_rate": 4.892802529928067e-05, + "loss": 0.0558, + "step": 864 + }, + { + "epoch": 1.53, + "grad_norm": 0.17401085793972015, + "learning_rate": 4.883482630989535e-05, + "loss": 0.0203, + "step": 865 + }, + { + "epoch": 1.53, + "grad_norm": 0.28903472423553467, + "learning_rate": 4.874163137084068e-05, + "loss": 0.0819, + "step": 866 + }, + { + "epoch": 1.53, + "grad_norm": 0.2746363580226898, + "learning_rate": 4.8648440806077226e-05, + "loss": 0.0696, + "step": 867 + }, + { + "epoch": 1.54, + "grad_norm": 0.40534642338752747, + "learning_rate": 4.8555254939550324e-05, + "loss": 0.111, + "step": 868 + }, + { + "epoch": 1.54, + "grad_norm": 0.41272208094596863, + "learning_rate": 4.8462074095188994e-05, + "loss": 0.1089, + "step": 869 + }, + { + "epoch": 1.54, + "grad_norm": 0.40718454122543335, + "learning_rate": 4.8368898596904834e-05, + "loss": 0.1339, + "step": 870 + }, + { + "epoch": 1.54, + "grad_norm": 0.14493143558502197, + "learning_rate": 4.827572876859078e-05, + "loss": 0.0227, + "step": 871 + }, + { + "epoch": 1.54, + "grad_norm": 0.20390640199184418, + "learning_rate": 4.8182564934120115e-05, + "loss": 0.0464, + "step": 872 + }, + { + "epoch": 1.54, + "grad_norm": 0.12927311658859253, + "learning_rate": 4.80894074173453e-05, + "loss": 0.0253, + "step": 873 + }, + { + "epoch": 1.55, + "grad_norm": 0.17653903365135193, + "learning_rate": 4.799625654209675e-05, + "loss": 0.0509, + "step": 874 + }, + { + "epoch": 1.55, + "grad_norm": 0.34687289595603943, + "learning_rate": 4.790311263218191e-05, + "loss": 0.0916, + "step": 875 + }, + { + "epoch": 1.55, + "grad_norm": 0.22851605713367462, + "learning_rate": 4.7809976011383905e-05, + "loss": 0.0857, + "step": 876 + }, + { + "epoch": 1.55, + "grad_norm": 0.27485382556915283, + "learning_rate": 4.771684700346059e-05, + "loss": 0.0623, + "step": 877 + }, + { + "epoch": 1.55, + "grad_norm": 0.21887461841106415, + "learning_rate": 4.762372593214335e-05, + "loss": 0.0573, + "step": 878 + }, + { + "epoch": 1.56, + "grad_norm": 0.2466115951538086, + "learning_rate": 4.753061312113592e-05, + "loss": 0.1039, + "step": 879 + }, + { + "epoch": 1.56, + "grad_norm": 0.344625860452652, + "learning_rate": 4.743750889411342e-05, + "loss": 0.0637, + "step": 880 + }, + { + "epoch": 1.56, + "grad_norm": 0.1676146686077118, + "learning_rate": 4.7344413574721046e-05, + "loss": 0.0372, + "step": 881 + }, + { + "epoch": 1.56, + "grad_norm": 0.14225785434246063, + "learning_rate": 4.725132748657307e-05, + "loss": 0.0506, + "step": 882 + }, + { + "epoch": 1.56, + "grad_norm": 0.19915729761123657, + "learning_rate": 4.715825095325168e-05, + "loss": 0.0459, + "step": 883 + }, + { + "epoch": 1.56, + "grad_norm": 0.20955249667167664, + "learning_rate": 4.70651842983058e-05, + "loss": 0.0539, + "step": 884 + }, + { + "epoch": 1.57, + "grad_norm": 0.171535924077034, + "learning_rate": 4.697212784525008e-05, + "loss": 0.0346, + "step": 885 + }, + { + "epoch": 1.57, + "grad_norm": 0.11981090158224106, + "learning_rate": 4.687908191756369e-05, + "loss": 0.0378, + "step": 886 + }, + { + "epoch": 1.57, + "grad_norm": 0.18210795521736145, + "learning_rate": 4.678604683868918e-05, + "loss": 0.0563, + "step": 887 + }, + { + "epoch": 1.57, + "grad_norm": 0.18884742259979248, + "learning_rate": 4.669302293203142e-05, + "loss": 0.0393, + "step": 888 + }, + { + "epoch": 1.57, + "grad_norm": 0.21338443458080292, + "learning_rate": 4.660001052095639e-05, + "loss": 0.054, + "step": 889 + }, + { + "epoch": 1.57, + "grad_norm": 0.16022799909114838, + "learning_rate": 4.65070099287902e-05, + "loss": 0.0497, + "step": 890 + }, + { + "epoch": 1.58, + "grad_norm": 0.30642077326774597, + "learning_rate": 4.641402147881782e-05, + "loss": 0.0702, + "step": 891 + }, + { + "epoch": 1.58, + "grad_norm": 0.24659690260887146, + "learning_rate": 4.6321045494282e-05, + "loss": 0.0986, + "step": 892 + }, + { + "epoch": 1.58, + "grad_norm": 0.4151371419429779, + "learning_rate": 4.62280822983822e-05, + "loss": 0.1064, + "step": 893 + }, + { + "epoch": 1.58, + "grad_norm": 0.19555744528770447, + "learning_rate": 4.613513221427337e-05, + "loss": 0.034, + "step": 894 + }, + { + "epoch": 1.58, + "grad_norm": 0.3575385510921478, + "learning_rate": 4.604219556506492e-05, + "loss": 0.0563, + "step": 895 + }, + { + "epoch": 1.59, + "grad_norm": 0.33982524275779724, + "learning_rate": 4.594927267381958e-05, + "loss": 0.1152, + "step": 896 + }, + { + "epoch": 1.59, + "grad_norm": 0.40054503083229065, + "learning_rate": 4.58563638635522e-05, + "loss": 0.0684, + "step": 897 + }, + { + "epoch": 1.59, + "grad_norm": 0.16741478443145752, + "learning_rate": 4.5763469457228695e-05, + "loss": 0.0221, + "step": 898 + }, + { + "epoch": 1.59, + "grad_norm": 0.30603042244911194, + "learning_rate": 4.5670589777764984e-05, + "loss": 0.0725, + "step": 899 + }, + { + "epoch": 1.59, + "grad_norm": 0.345217227935791, + "learning_rate": 4.5577725148025646e-05, + "loss": 0.062, + "step": 900 + }, + { + "epoch": 1.59, + "grad_norm": 0.4248473048210144, + "learning_rate": 4.54848758908231e-05, + "loss": 0.1482, + "step": 901 + }, + { + "epoch": 1.6, + "grad_norm": 0.25596097111701965, + "learning_rate": 4.5392042328916226e-05, + "loss": 0.0417, + "step": 902 + }, + { + "epoch": 1.6, + "grad_norm": 0.14021873474121094, + "learning_rate": 4.5299224785009374e-05, + "loss": 0.0242, + "step": 903 + }, + { + "epoch": 1.6, + "grad_norm": 0.16665437817573547, + "learning_rate": 4.5206423581751245e-05, + "loss": 0.0569, + "step": 904 + }, + { + "epoch": 1.6, + "grad_norm": 0.29362550377845764, + "learning_rate": 4.511363904173366e-05, + "loss": 0.068, + "step": 905 + }, + { + "epoch": 1.6, + "grad_norm": 0.44577184319496155, + "learning_rate": 4.5020871487490604e-05, + "loss": 0.0787, + "step": 906 + }, + { + "epoch": 1.6, + "grad_norm": 0.20594125986099243, + "learning_rate": 4.492812124149696e-05, + "loss": 0.0868, + "step": 907 + }, + { + "epoch": 1.61, + "grad_norm": 0.2906559109687805, + "learning_rate": 4.483538862616747e-05, + "loss": 0.0592, + "step": 908 + }, + { + "epoch": 1.61, + "grad_norm": 0.17545486986637115, + "learning_rate": 4.4742673963855576e-05, + "loss": 0.0225, + "step": 909 + }, + { + "epoch": 1.61, + "grad_norm": 0.18305286765098572, + "learning_rate": 4.46499775768523e-05, + "loss": 0.0483, + "step": 910 + }, + { + "epoch": 1.61, + "grad_norm": 0.2249644249677658, + "learning_rate": 4.455729978738517e-05, + "loss": 0.0383, + "step": 911 + }, + { + "epoch": 1.61, + "grad_norm": 0.3094448149204254, + "learning_rate": 4.446464091761706e-05, + "loss": 0.0533, + "step": 912 + }, + { + "epoch": 1.62, + "grad_norm": 0.22453933954238892, + "learning_rate": 4.437200128964504e-05, + "loss": 0.0435, + "step": 913 + }, + { + "epoch": 1.62, + "grad_norm": 0.1814616322517395, + "learning_rate": 4.4279381225499344e-05, + "loss": 0.0245, + "step": 914 + }, + { + "epoch": 1.62, + "grad_norm": 0.20599542558193207, + "learning_rate": 4.418678104714214e-05, + "loss": 0.0321, + "step": 915 + }, + { + "epoch": 1.62, + "grad_norm": 0.27197298407554626, + "learning_rate": 4.409420107646652e-05, + "loss": 0.0512, + "step": 916 + }, + { + "epoch": 1.62, + "grad_norm": 0.35009968280792236, + "learning_rate": 4.400164163529532e-05, + "loss": 0.0717, + "step": 917 + }, + { + "epoch": 1.62, + "grad_norm": 0.17196977138519287, + "learning_rate": 4.390910304537999e-05, + "loss": 0.033, + "step": 918 + }, + { + "epoch": 1.63, + "grad_norm": 0.1884760707616806, + "learning_rate": 4.381658562839953e-05, + "loss": 0.0526, + "step": 919 + }, + { + "epoch": 1.63, + "grad_norm": 0.4165942966938019, + "learning_rate": 4.3724089705959305e-05, + "loss": 0.0824, + "step": 920 + }, + { + "epoch": 1.63, + "grad_norm": 0.36213231086730957, + "learning_rate": 4.363161559958996e-05, + "loss": 0.0524, + "step": 921 + }, + { + "epoch": 1.63, + "grad_norm": 0.22675907611846924, + "learning_rate": 4.353916363074638e-05, + "loss": 0.0367, + "step": 922 + }, + { + "epoch": 1.63, + "grad_norm": 0.29561713337898254, + "learning_rate": 4.34467341208064e-05, + "loss": 0.0364, + "step": 923 + }, + { + "epoch": 1.63, + "grad_norm": 0.22054970264434814, + "learning_rate": 4.3354327391069826e-05, + "loss": 0.025, + "step": 924 + }, + { + "epoch": 1.64, + "grad_norm": 0.06693907827138901, + "learning_rate": 4.3261943762757287e-05, + "loss": 0.0104, + "step": 925 + }, + { + "epoch": 1.64, + "grad_norm": 0.339631050825119, + "learning_rate": 4.3169583557009064e-05, + "loss": 0.0732, + "step": 926 + }, + { + "epoch": 1.64, + "grad_norm": 0.2175193578004837, + "learning_rate": 4.307724709488409e-05, + "loss": 0.0464, + "step": 927 + }, + { + "epoch": 1.64, + "grad_norm": 0.23093104362487793, + "learning_rate": 4.298493469735869e-05, + "loss": 0.0335, + "step": 928 + }, + { + "epoch": 1.64, + "grad_norm": 0.38875579833984375, + "learning_rate": 4.289264668532557e-05, + "loss": 0.0327, + "step": 929 + }, + { + "epoch": 1.65, + "grad_norm": 0.05294647812843323, + "learning_rate": 4.280038337959268e-05, + "loss": 0.007, + "step": 930 + }, + { + "epoch": 1.65, + "grad_norm": 0.36184802651405334, + "learning_rate": 4.270814510088203e-05, + "loss": 0.0688, + "step": 931 + }, + { + "epoch": 1.65, + "grad_norm": 0.531517744064331, + "learning_rate": 4.2615932169828744e-05, + "loss": 0.1305, + "step": 932 + }, + { + "epoch": 1.65, + "grad_norm": 0.3471108376979828, + "learning_rate": 4.2523744906979686e-05, + "loss": 0.0236, + "step": 933 + }, + { + "epoch": 1.65, + "grad_norm": 0.2624709904193878, + "learning_rate": 4.24315836327926e-05, + "loss": 0.0272, + "step": 934 + }, + { + "epoch": 1.65, + "grad_norm": 0.3648707866668701, + "learning_rate": 4.233944866763489e-05, + "loss": 0.0384, + "step": 935 + }, + { + "epoch": 1.66, + "grad_norm": 0.2199166864156723, + "learning_rate": 4.224734033178241e-05, + "loss": 0.0347, + "step": 936 + }, + { + "epoch": 1.66, + "grad_norm": 0.44493308663368225, + "learning_rate": 4.2155258945418566e-05, + "loss": 0.0405, + "step": 937 + }, + { + "epoch": 1.66, + "grad_norm": 0.4102453291416168, + "learning_rate": 4.206320482863301e-05, + "loss": 0.0849, + "step": 938 + }, + { + "epoch": 1.66, + "grad_norm": 0.33510318398475647, + "learning_rate": 4.1971178301420613e-05, + "loss": 0.052, + "step": 939 + }, + { + "epoch": 1.66, + "grad_norm": 0.40965744853019714, + "learning_rate": 4.187917968368036e-05, + "loss": 0.0848, + "step": 940 + }, + { + "epoch": 1.66, + "grad_norm": 0.2755095064640045, + "learning_rate": 4.178720929521418e-05, + "loss": 0.0391, + "step": 941 + }, + { + "epoch": 1.67, + "grad_norm": 0.32818931341171265, + "learning_rate": 4.16952674557259e-05, + "loss": 0.0327, + "step": 942 + }, + { + "epoch": 1.67, + "grad_norm": 0.39538365602493286, + "learning_rate": 4.1603354484820134e-05, + "loss": 0.043, + "step": 943 + }, + { + "epoch": 1.67, + "grad_norm": 0.2221785932779312, + "learning_rate": 4.1511470702001074e-05, + "loss": 0.0288, + "step": 944 + }, + { + "epoch": 1.67, + "grad_norm": 0.2112448811531067, + "learning_rate": 4.141961642667152e-05, + "loss": 0.025, + "step": 945 + }, + { + "epoch": 1.67, + "grad_norm": 0.15548011660575867, + "learning_rate": 4.132779197813164e-05, + "loss": 0.0694, + "step": 946 + }, + { + "epoch": 1.68, + "grad_norm": 0.10790842771530151, + "learning_rate": 4.1235997675577956e-05, + "loss": 0.0124, + "step": 947 + }, + { + "epoch": 1.68, + "grad_norm": 0.42269936203956604, + "learning_rate": 4.11442338381022e-05, + "loss": 0.0937, + "step": 948 + }, + { + "epoch": 1.68, + "grad_norm": 0.36002832651138306, + "learning_rate": 4.105250078469018e-05, + "loss": 0.1242, + "step": 949 + }, + { + "epoch": 1.68, + "grad_norm": 0.22437407076358795, + "learning_rate": 4.0960798834220704e-05, + "loss": 0.0638, + "step": 950 + }, + { + "epoch": 1.68, + "grad_norm": 0.3967968225479126, + "learning_rate": 4.086912830546448e-05, + "loss": 0.0439, + "step": 951 + }, + { + "epoch": 1.68, + "grad_norm": 0.20550177991390228, + "learning_rate": 4.077748951708292e-05, + "loss": 0.0347, + "step": 952 + }, + { + "epoch": 1.69, + "grad_norm": 0.2020653337240219, + "learning_rate": 4.068588278762723e-05, + "loss": 0.0376, + "step": 953 + }, + { + "epoch": 1.69, + "grad_norm": 0.19614431262016296, + "learning_rate": 4.0594308435537024e-05, + "loss": 0.032, + "step": 954 + }, + { + "epoch": 1.69, + "grad_norm": 0.08721073716878891, + "learning_rate": 4.0502766779139484e-05, + "loss": 0.012, + "step": 955 + }, + { + "epoch": 1.69, + "grad_norm": 0.2303171306848526, + "learning_rate": 4.041125813664808e-05, + "loss": 0.0268, + "step": 956 + }, + { + "epoch": 1.69, + "grad_norm": 0.2725672721862793, + "learning_rate": 4.031978282616151e-05, + "loss": 0.0413, + "step": 957 + }, + { + "epoch": 1.69, + "grad_norm": 0.28184202313423157, + "learning_rate": 4.0228341165662685e-05, + "loss": 0.0383, + "step": 958 + }, + { + "epoch": 1.7, + "grad_norm": 0.28514358401298523, + "learning_rate": 4.0136933473017407e-05, + "loss": 0.044, + "step": 959 + }, + { + "epoch": 1.7, + "grad_norm": 0.374714732170105, + "learning_rate": 4.004556006597353e-05, + "loss": 0.044, + "step": 960 + }, + { + "epoch": 1.7, + "grad_norm": 0.19936969876289368, + "learning_rate": 3.9954221262159674e-05, + "loss": 0.0334, + "step": 961 + }, + { + "epoch": 1.7, + "grad_norm": 0.3212338984012604, + "learning_rate": 3.986291737908414e-05, + "loss": 0.0473, + "step": 962 + }, + { + "epoch": 1.7, + "grad_norm": 0.28045183420181274, + "learning_rate": 3.9771648734133906e-05, + "loss": 0.0321, + "step": 963 + }, + { + "epoch": 1.71, + "grad_norm": 0.34167036414146423, + "learning_rate": 3.968041564457342e-05, + "loss": 0.0696, + "step": 964 + }, + { + "epoch": 1.71, + "grad_norm": 0.5529135465621948, + "learning_rate": 3.958921842754351e-05, + "loss": 0.131, + "step": 965 + }, + { + "epoch": 1.71, + "grad_norm": 0.275803804397583, + "learning_rate": 3.949805740006036e-05, + "loss": 0.0436, + "step": 966 + }, + { + "epoch": 1.71, + "grad_norm": 0.3003288209438324, + "learning_rate": 3.94069328790143e-05, + "loss": 0.073, + "step": 967 + }, + { + "epoch": 1.71, + "grad_norm": 0.20078504085540771, + "learning_rate": 3.9315845181168784e-05, + "loss": 0.0425, + "step": 968 + }, + { + "epoch": 1.71, + "grad_norm": 0.3904169797897339, + "learning_rate": 3.9224794623159294e-05, + "loss": 0.0668, + "step": 969 + }, + { + "epoch": 1.72, + "grad_norm": 0.212997168302536, + "learning_rate": 3.913378152149214e-05, + "loss": 0.0436, + "step": 970 + }, + { + "epoch": 1.72, + "grad_norm": 0.04039880260825157, + "learning_rate": 3.904280619254348e-05, + "loss": 0.0077, + "step": 971 + }, + { + "epoch": 1.72, + "grad_norm": 0.21076536178588867, + "learning_rate": 3.895186895255814e-05, + "loss": 0.0677, + "step": 972 + }, + { + "epoch": 1.72, + "grad_norm": 0.37436169385910034, + "learning_rate": 3.886097011764856e-05, + "loss": 0.0294, + "step": 973 + }, + { + "epoch": 1.72, + "grad_norm": 0.26611942052841187, + "learning_rate": 3.877011000379367e-05, + "loss": 0.057, + "step": 974 + }, + { + "epoch": 1.72, + "grad_norm": 0.32198566198349, + "learning_rate": 3.8679288926837804e-05, + "loss": 0.0583, + "step": 975 + }, + { + "epoch": 1.73, + "grad_norm": 0.2785477340221405, + "learning_rate": 3.8588507202489586e-05, + "loss": 0.0913, + "step": 976 + }, + { + "epoch": 1.73, + "grad_norm": 0.20920749008655548, + "learning_rate": 3.8497765146320876e-05, + "loss": 0.0454, + "step": 977 + }, + { + "epoch": 1.73, + "grad_norm": 0.31738653779029846, + "learning_rate": 3.840706307376557e-05, + "loss": 0.0464, + "step": 978 + }, + { + "epoch": 1.73, + "grad_norm": 0.1887190192937851, + "learning_rate": 3.8316401300118675e-05, + "loss": 0.026, + "step": 979 + }, + { + "epoch": 1.73, + "grad_norm": 0.22016988694667816, + "learning_rate": 3.8225780140535025e-05, + "loss": 0.0375, + "step": 980 + }, + { + "epoch": 1.74, + "grad_norm": 0.2261650264263153, + "learning_rate": 3.813519991002831e-05, + "loss": 0.0368, + "step": 981 + }, + { + "epoch": 1.74, + "grad_norm": 0.3108493983745575, + "learning_rate": 3.804466092346997e-05, + "loss": 0.0539, + "step": 982 + }, + { + "epoch": 1.74, + "grad_norm": 0.23392857611179352, + "learning_rate": 3.7954163495587995e-05, + "loss": 0.0363, + "step": 983 + }, + { + "epoch": 1.74, + "grad_norm": 0.23699642717838287, + "learning_rate": 3.786370794096603e-05, + "loss": 0.0362, + "step": 984 + }, + { + "epoch": 1.74, + "grad_norm": 0.29672032594680786, + "learning_rate": 3.777329457404202e-05, + "loss": 0.0388, + "step": 985 + }, + { + "epoch": 1.74, + "grad_norm": 0.25258907675743103, + "learning_rate": 3.768292370910737e-05, + "loss": 0.0278, + "step": 986 + }, + { + "epoch": 1.75, + "grad_norm": 0.40179169178009033, + "learning_rate": 3.759259566030571e-05, + "loss": 0.1118, + "step": 987 + }, + { + "epoch": 1.75, + "grad_norm": 0.3777885437011719, + "learning_rate": 3.750231074163179e-05, + "loss": 0.073, + "step": 988 + }, + { + "epoch": 1.75, + "grad_norm": 0.39896661043167114, + "learning_rate": 3.7412069266930516e-05, + "loss": 0.0428, + "step": 989 + }, + { + "epoch": 1.75, + "grad_norm": 0.26577284932136536, + "learning_rate": 3.7321871549895714e-05, + "loss": 0.0335, + "step": 990 + }, + { + "epoch": 1.75, + "grad_norm": 0.5360684990882874, + "learning_rate": 3.7231717904069094e-05, + "loss": 0.0979, + "step": 991 + }, + { + "epoch": 1.75, + "grad_norm": 0.21000511944293976, + "learning_rate": 3.714160864283923e-05, + "loss": 0.0156, + "step": 992 + }, + { + "epoch": 1.76, + "grad_norm": 0.11425631493330002, + "learning_rate": 3.7051544079440336e-05, + "loss": 0.0143, + "step": 993 + }, + { + "epoch": 1.76, + "grad_norm": 0.20283763110637665, + "learning_rate": 3.696152452695128e-05, + "loss": 0.0627, + "step": 994 + }, + { + "epoch": 1.76, + "eval_loss": 0.07780980318784714, + "eval_runtime": 14.6775, + "eval_samples_per_second": 32.499, + "eval_steps_per_second": 8.176, + "step": 994 + }, + { + "epoch": 1.76, + "grad_norm": 0.13721764087677002, + "learning_rate": 3.68715502982945e-05, + "loss": 0.0154, + "step": 995 + }, + { + "epoch": 1.76, + "grad_norm": 0.397158145904541, + "learning_rate": 3.678162170623481e-05, + "loss": 0.0494, + "step": 996 + }, + { + "epoch": 1.76, + "grad_norm": 0.47730910778045654, + "learning_rate": 3.669173906337846e-05, + "loss": 0.0897, + "step": 997 + }, + { + "epoch": 1.77, + "grad_norm": 0.19200514256954193, + "learning_rate": 3.6601902682171894e-05, + "loss": 0.0145, + "step": 998 + }, + { + "epoch": 1.77, + "grad_norm": 0.39415818452835083, + "learning_rate": 3.65121128749008e-05, + "loss": 0.0778, + "step": 999 + }, + { + "epoch": 1.77, + "grad_norm": 0.17673304677009583, + "learning_rate": 3.642236995368897e-05, + "loss": 0.0211, + "step": 1000 + }, + { + "epoch": 1.77, + "grad_norm": 0.3190731108188629, + "learning_rate": 3.633267423049717e-05, + "loss": 0.0856, + "step": 1001 + }, + { + "epoch": 1.77, + "grad_norm": 0.3833164572715759, + "learning_rate": 3.624302601712213e-05, + "loss": 0.0687, + "step": 1002 + }, + { + "epoch": 1.77, + "grad_norm": 0.2938999831676483, + "learning_rate": 3.6153425625195425e-05, + "loss": 0.0717, + "step": 1003 + }, + { + "epoch": 1.78, + "grad_norm": 0.2645472586154938, + "learning_rate": 3.606387336618237e-05, + "loss": 0.0341, + "step": 1004 + }, + { + "epoch": 1.78, + "grad_norm": 0.1330021619796753, + "learning_rate": 3.597436955138102e-05, + "loss": 0.0244, + "step": 1005 + }, + { + "epoch": 1.78, + "grad_norm": 0.34308159351348877, + "learning_rate": 3.588491449192096e-05, + "loss": 0.091, + "step": 1006 + }, + { + "epoch": 1.78, + "grad_norm": 0.2861696481704712, + "learning_rate": 3.579550849876233e-05, + "loss": 0.0601, + "step": 1007 + }, + { + "epoch": 1.78, + "grad_norm": 0.32016104459762573, + "learning_rate": 3.570615188269473e-05, + "loss": 0.0699, + "step": 1008 + }, + { + "epoch": 1.79, + "grad_norm": 0.2865599989891052, + "learning_rate": 3.561684495433605e-05, + "loss": 0.0742, + "step": 1009 + }, + { + "epoch": 1.79, + "grad_norm": 0.2045123279094696, + "learning_rate": 3.5527588024131544e-05, + "loss": 0.0323, + "step": 1010 + }, + { + "epoch": 1.79, + "grad_norm": 0.13562075793743134, + "learning_rate": 3.5438381402352574e-05, + "loss": 0.0167, + "step": 1011 + }, + { + "epoch": 1.79, + "grad_norm": 0.2536921799182892, + "learning_rate": 3.534922539909569e-05, + "loss": 0.047, + "step": 1012 + }, + { + "epoch": 1.79, + "grad_norm": 0.193417489528656, + "learning_rate": 3.5260120324281474e-05, + "loss": 0.0349, + "step": 1013 + }, + { + "epoch": 1.79, + "grad_norm": 0.18863309919834137, + "learning_rate": 3.517106648765343e-05, + "loss": 0.0261, + "step": 1014 + }, + { + "epoch": 1.8, + "grad_norm": 0.5035936832427979, + "learning_rate": 3.5082064198777e-05, + "loss": 0.0963, + "step": 1015 + }, + { + "epoch": 1.8, + "grad_norm": 0.33512285351753235, + "learning_rate": 3.499311376703842e-05, + "loss": 0.0534, + "step": 1016 + }, + { + "epoch": 1.8, + "grad_norm": 0.14217574894428253, + "learning_rate": 3.4904215501643646e-05, + "loss": 0.0246, + "step": 1017 + }, + { + "epoch": 1.8, + "grad_norm": 0.3124421536922455, + "learning_rate": 3.4815369711617316e-05, + "loss": 0.0498, + "step": 1018 + }, + { + "epoch": 1.8, + "grad_norm": 0.22936655580997467, + "learning_rate": 3.4726576705801636e-05, + "loss": 0.0249, + "step": 1019 + }, + { + "epoch": 1.8, + "grad_norm": 0.5534436106681824, + "learning_rate": 3.463783679285535e-05, + "loss": 0.1696, + "step": 1020 + }, + { + "epoch": 1.81, + "grad_norm": 0.3127197027206421, + "learning_rate": 3.4549150281252636e-05, + "loss": 0.045, + "step": 1021 + }, + { + "epoch": 1.81, + "grad_norm": 0.13819609582424164, + "learning_rate": 3.446051747928202e-05, + "loss": 0.0203, + "step": 1022 + }, + { + "epoch": 1.81, + "grad_norm": 0.35015806555747986, + "learning_rate": 3.4371938695045346e-05, + "loss": 0.0608, + "step": 1023 + }, + { + "epoch": 1.81, + "grad_norm": 0.3767643868923187, + "learning_rate": 3.428341423645668e-05, + "loss": 0.0686, + "step": 1024 + }, + { + "epoch": 1.81, + "grad_norm": 0.3282710611820221, + "learning_rate": 3.419494441124121e-05, + "loss": 0.0697, + "step": 1025 + }, + { + "epoch": 1.82, + "grad_norm": 0.24474768340587616, + "learning_rate": 3.4106529526934306e-05, + "loss": 0.0583, + "step": 1026 + }, + { + "epoch": 1.82, + "grad_norm": 0.24781620502471924, + "learning_rate": 3.4018169890880225e-05, + "loss": 0.0327, + "step": 1027 + }, + { + "epoch": 1.82, + "grad_norm": 0.2474932074546814, + "learning_rate": 3.392986581023126e-05, + "loss": 0.0679, + "step": 1028 + }, + { + "epoch": 1.82, + "grad_norm": 0.39474180340766907, + "learning_rate": 3.384161759194658e-05, + "loss": 0.0713, + "step": 1029 + }, + { + "epoch": 1.82, + "grad_norm": 0.13963012397289276, + "learning_rate": 3.375342554279111e-05, + "loss": 0.0179, + "step": 1030 + }, + { + "epoch": 1.82, + "grad_norm": 0.32144859433174133, + "learning_rate": 3.3665289969334585e-05, + "loss": 0.0447, + "step": 1031 + }, + { + "epoch": 1.83, + "grad_norm": 0.14767055213451385, + "learning_rate": 3.3577211177950385e-05, + "loss": 0.017, + "step": 1032 + }, + { + "epoch": 1.83, + "grad_norm": 0.3088414967060089, + "learning_rate": 3.348918947481452e-05, + "loss": 0.0483, + "step": 1033 + }, + { + "epoch": 1.83, + "grad_norm": 0.33189231157302856, + "learning_rate": 3.340122516590456e-05, + "loss": 0.0382, + "step": 1034 + }, + { + "epoch": 1.83, + "grad_norm": 0.4593893885612488, + "learning_rate": 3.3313318556998526e-05, + "loss": 0.0523, + "step": 1035 + }, + { + "epoch": 1.83, + "grad_norm": 0.253412127494812, + "learning_rate": 3.322546995367394e-05, + "loss": 0.0212, + "step": 1036 + }, + { + "epoch": 1.83, + "grad_norm": 0.42397648096084595, + "learning_rate": 3.3137679661306576e-05, + "loss": 0.087, + "step": 1037 + }, + { + "epoch": 1.84, + "grad_norm": 0.40718910098075867, + "learning_rate": 3.3049947985069616e-05, + "loss": 0.0965, + "step": 1038 + }, + { + "epoch": 1.84, + "grad_norm": 0.3604757487773895, + "learning_rate": 3.2962275229932446e-05, + "loss": 0.0973, + "step": 1039 + }, + { + "epoch": 1.84, + "grad_norm": 0.2129022628068924, + "learning_rate": 3.287466170065959e-05, + "loss": 0.0384, + "step": 1040 + }, + { + "epoch": 1.84, + "grad_norm": 0.20552971959114075, + "learning_rate": 3.2787107701809754e-05, + "loss": 0.0305, + "step": 1041 + }, + { + "epoch": 1.84, + "grad_norm": 0.41909754276275635, + "learning_rate": 3.269961353773469e-05, + "loss": 0.0725, + "step": 1042 + }, + { + "epoch": 1.85, + "grad_norm": 0.2766873240470886, + "learning_rate": 3.261217951257813e-05, + "loss": 0.0649, + "step": 1043 + }, + { + "epoch": 1.85, + "grad_norm": 0.1571783572435379, + "learning_rate": 3.252480593027478e-05, + "loss": 0.0238, + "step": 1044 + }, + { + "epoch": 1.85, + "grad_norm": 0.2809221148490906, + "learning_rate": 3.243749309454922e-05, + "loss": 0.0613, + "step": 1045 + }, + { + "epoch": 1.85, + "grad_norm": 0.4999224841594696, + "learning_rate": 3.235024130891487e-05, + "loss": 0.0806, + "step": 1046 + }, + { + "epoch": 1.85, + "grad_norm": 0.1895889937877655, + "learning_rate": 3.226305087667295e-05, + "loss": 0.026, + "step": 1047 + }, + { + "epoch": 1.85, + "grad_norm": 0.220509871840477, + "learning_rate": 3.217592210091137e-05, + "loss": 0.0681, + "step": 1048 + }, + { + "epoch": 1.86, + "grad_norm": 0.19226831197738647, + "learning_rate": 3.208885528450376e-05, + "loss": 0.0232, + "step": 1049 + }, + { + "epoch": 1.86, + "grad_norm": 0.30779534578323364, + "learning_rate": 3.200185073010831e-05, + "loss": 0.0547, + "step": 1050 + }, + { + "epoch": 1.86, + "grad_norm": 0.16252338886260986, + "learning_rate": 3.1914908740166795e-05, + "loss": 0.0237, + "step": 1051 + }, + { + "epoch": 1.86, + "grad_norm": 0.4130539000034332, + "learning_rate": 3.182802961690357e-05, + "loss": 0.0437, + "step": 1052 + }, + { + "epoch": 1.86, + "grad_norm": 0.20889542996883392, + "learning_rate": 3.1741213662324365e-05, + "loss": 0.0493, + "step": 1053 + }, + { + "epoch": 1.86, + "grad_norm": 0.27447709441185, + "learning_rate": 3.165446117821538e-05, + "loss": 0.0859, + "step": 1054 + }, + { + "epoch": 1.87, + "grad_norm": 0.07740370184183121, + "learning_rate": 3.1567772466142156e-05, + "loss": 0.011, + "step": 1055 + }, + { + "epoch": 1.87, + "grad_norm": 0.1462429016828537, + "learning_rate": 3.148114782744855e-05, + "loss": 0.0228, + "step": 1056 + }, + { + "epoch": 1.87, + "grad_norm": 0.34325939416885376, + "learning_rate": 3.139458756325576e-05, + "loss": 0.0928, + "step": 1057 + }, + { + "epoch": 1.87, + "grad_norm": 0.2712673246860504, + "learning_rate": 3.130809197446106e-05, + "loss": 0.0408, + "step": 1058 + }, + { + "epoch": 1.87, + "grad_norm": 0.24180567264556885, + "learning_rate": 3.122166136173706e-05, + "loss": 0.0309, + "step": 1059 + }, + { + "epoch": 1.88, + "grad_norm": 0.2045336663722992, + "learning_rate": 3.113529602553042e-05, + "loss": 0.0391, + "step": 1060 + }, + { + "epoch": 1.88, + "grad_norm": 0.09178590029478073, + "learning_rate": 3.104899626606088e-05, + "loss": 0.0132, + "step": 1061 + }, + { + "epoch": 1.88, + "grad_norm": 0.300592303276062, + "learning_rate": 3.0962762383320285e-05, + "loss": 0.0787, + "step": 1062 + }, + { + "epoch": 1.88, + "grad_norm": 0.31034108996391296, + "learning_rate": 3.08765946770714e-05, + "loss": 0.0307, + "step": 1063 + }, + { + "epoch": 1.88, + "grad_norm": 0.1603342443704605, + "learning_rate": 3.0790493446847024e-05, + "loss": 0.0209, + "step": 1064 + }, + { + "epoch": 1.88, + "grad_norm": 0.23990066349506378, + "learning_rate": 3.070445899194885e-05, + "loss": 0.0361, + "step": 1065 + }, + { + "epoch": 1.89, + "grad_norm": 0.15933218598365784, + "learning_rate": 3.061849161144641e-05, + "loss": 0.0137, + "step": 1066 + }, + { + "epoch": 1.89, + "grad_norm": 0.4810096025466919, + "learning_rate": 3.053259160417613e-05, + "loss": 0.1255, + "step": 1067 + }, + { + "epoch": 1.89, + "grad_norm": 0.4099353849887848, + "learning_rate": 3.0446759268740233e-05, + "loss": 0.0718, + "step": 1068 + }, + { + "epoch": 1.89, + "grad_norm": 0.39221569895744324, + "learning_rate": 3.0360994903505653e-05, + "loss": 0.0721, + "step": 1069 + }, + { + "epoch": 1.89, + "grad_norm": 0.48040878772735596, + "learning_rate": 3.02752988066031e-05, + "loss": 0.1077, + "step": 1070 + }, + { + "epoch": 1.89, + "grad_norm": 0.1548435539007187, + "learning_rate": 3.018967127592595e-05, + "loss": 0.0273, + "step": 1071 + }, + { + "epoch": 1.9, + "grad_norm": 0.37816908955574036, + "learning_rate": 3.010411260912922e-05, + "loss": 0.0421, + "step": 1072 + }, + { + "epoch": 1.9, + "grad_norm": 0.2902359366416931, + "learning_rate": 3.0018623103628596e-05, + "loss": 0.0645, + "step": 1073 + }, + { + "epoch": 1.9, + "grad_norm": 0.32010090351104736, + "learning_rate": 2.9933203056599275e-05, + "loss": 0.0444, + "step": 1074 + }, + { + "epoch": 1.9, + "grad_norm": 0.47379517555236816, + "learning_rate": 2.984785276497507e-05, + "loss": 0.0423, + "step": 1075 + }, + { + "epoch": 1.9, + "grad_norm": 0.3198046088218689, + "learning_rate": 2.9762572525447262e-05, + "loss": 0.0678, + "step": 1076 + }, + { + "epoch": 1.91, + "grad_norm": 0.33741793036460876, + "learning_rate": 2.9677362634463647e-05, + "loss": 0.0643, + "step": 1077 + }, + { + "epoch": 1.91, + "grad_norm": 0.2264060080051422, + "learning_rate": 2.9592223388227503e-05, + "loss": 0.0275, + "step": 1078 + }, + { + "epoch": 1.91, + "grad_norm": 0.3069595694541931, + "learning_rate": 2.9507155082696482e-05, + "loss": 0.0481, + "step": 1079 + }, + { + "epoch": 1.91, + "grad_norm": 0.26178085803985596, + "learning_rate": 2.9422158013581658e-05, + "loss": 0.0298, + "step": 1080 + }, + { + "epoch": 1.91, + "grad_norm": 0.4191998243331909, + "learning_rate": 2.93372324763465e-05, + "loss": 0.1156, + "step": 1081 + }, + { + "epoch": 1.91, + "grad_norm": 0.21308496594429016, + "learning_rate": 2.9252378766205758e-05, + "loss": 0.0478, + "step": 1082 + }, + { + "epoch": 1.92, + "grad_norm": 0.20033082365989685, + "learning_rate": 2.9167597178124585e-05, + "loss": 0.0262, + "step": 1083 + }, + { + "epoch": 1.92, + "grad_norm": 0.21688255667686462, + "learning_rate": 2.9082888006817365e-05, + "loss": 0.0767, + "step": 1084 + }, + { + "epoch": 1.92, + "grad_norm": 0.300791472196579, + "learning_rate": 2.899825154674674e-05, + "loss": 0.0464, + "step": 1085 + }, + { + "epoch": 1.92, + "grad_norm": 0.2416476011276245, + "learning_rate": 2.8913688092122664e-05, + "loss": 0.0605, + "step": 1086 + }, + { + "epoch": 1.92, + "grad_norm": 0.2521096169948578, + "learning_rate": 2.8829197936901232e-05, + "loss": 0.0293, + "step": 1087 + }, + { + "epoch": 1.92, + "grad_norm": 0.3229115605354309, + "learning_rate": 2.8744781374783813e-05, + "loss": 0.0435, + "step": 1088 + }, + { + "epoch": 1.93, + "grad_norm": 0.0780615508556366, + "learning_rate": 2.8660438699215898e-05, + "loss": 0.0109, + "step": 1089 + }, + { + "epoch": 1.93, + "grad_norm": 0.2879962623119354, + "learning_rate": 2.8576170203386143e-05, + "loss": 0.0665, + "step": 1090 + }, + { + "epoch": 1.93, + "grad_norm": 0.19985683262348175, + "learning_rate": 2.8491976180225388e-05, + "loss": 0.0378, + "step": 1091 + }, + { + "epoch": 1.93, + "grad_norm": 0.15360887348651886, + "learning_rate": 2.840785692240553e-05, + "loss": 0.0257, + "step": 1092 + }, + { + "epoch": 1.93, + "grad_norm": 0.19224387407302856, + "learning_rate": 2.832381272233864e-05, + "loss": 0.0529, + "step": 1093 + }, + { + "epoch": 1.94, + "grad_norm": 0.2037738561630249, + "learning_rate": 2.8239843872175814e-05, + "loss": 0.0228, + "step": 1094 + }, + { + "epoch": 1.94, + "grad_norm": 0.380874902009964, + "learning_rate": 2.8155950663806235e-05, + "loss": 0.0525, + "step": 1095 + }, + { + "epoch": 1.94, + "grad_norm": 0.28522253036499023, + "learning_rate": 2.8072133388856192e-05, + "loss": 0.0615, + "step": 1096 + }, + { + "epoch": 1.94, + "grad_norm": 0.1098146066069603, + "learning_rate": 2.7988392338687926e-05, + "loss": 0.0159, + "step": 1097 + }, + { + "epoch": 1.94, + "grad_norm": 0.2615334987640381, + "learning_rate": 2.7904727804398812e-05, + "loss": 0.0353, + "step": 1098 + }, + { + "epoch": 1.94, + "grad_norm": 0.2041955441236496, + "learning_rate": 2.7821140076820162e-05, + "loss": 0.0187, + "step": 1099 + }, + { + "epoch": 1.95, + "grad_norm": 0.1846192330121994, + "learning_rate": 2.773762944651632e-05, + "loss": 0.0554, + "step": 1100 + }, + { + "epoch": 1.95, + "grad_norm": 0.17711102962493896, + "learning_rate": 2.765419620378366e-05, + "loss": 0.0342, + "step": 1101 + }, + { + "epoch": 1.95, + "grad_norm": 0.3703756332397461, + "learning_rate": 2.7570840638649486e-05, + "loss": 0.0378, + "step": 1102 + }, + { + "epoch": 1.95, + "grad_norm": 0.4282096326351166, + "learning_rate": 2.7487563040871145e-05, + "loss": 0.0789, + "step": 1103 + }, + { + "epoch": 1.95, + "grad_norm": 0.32506605982780457, + "learning_rate": 2.740436369993491e-05, + "loss": 0.0337, + "step": 1104 + }, + { + "epoch": 1.95, + "grad_norm": 0.250688374042511, + "learning_rate": 2.7321242905055013e-05, + "loss": 0.0554, + "step": 1105 + }, + { + "epoch": 1.96, + "grad_norm": 0.3557257354259491, + "learning_rate": 2.7238200945172698e-05, + "loss": 0.0356, + "step": 1106 + }, + { + "epoch": 1.96, + "grad_norm": 0.3472774028778076, + "learning_rate": 2.715523810895515e-05, + "loss": 0.0348, + "step": 1107 + }, + { + "epoch": 1.96, + "grad_norm": 0.07373315095901489, + "learning_rate": 2.707235468479449e-05, + "loss": 0.0088, + "step": 1108 + }, + { + "epoch": 1.96, + "grad_norm": 0.21439437568187714, + "learning_rate": 2.6989550960806768e-05, + "loss": 0.0222, + "step": 1109 + }, + { + "epoch": 1.96, + "grad_norm": 0.2730681002140045, + "learning_rate": 2.690682722483102e-05, + "loss": 0.068, + "step": 1110 + }, + { + "epoch": 1.97, + "grad_norm": 0.41124334931373596, + "learning_rate": 2.6824183764428224e-05, + "loss": 0.086, + "step": 1111 + }, + { + "epoch": 1.97, + "grad_norm": 0.6637737154960632, + "learning_rate": 2.6741620866880335e-05, + "loss": 0.0365, + "step": 1112 + }, + { + "epoch": 1.97, + "grad_norm": 0.425441712141037, + "learning_rate": 2.665913881918921e-05, + "loss": 0.095, + "step": 1113 + }, + { + "epoch": 1.97, + "grad_norm": 0.5520187020301819, + "learning_rate": 2.6576737908075668e-05, + "loss": 0.0514, + "step": 1114 + }, + { + "epoch": 1.97, + "grad_norm": 0.5284621119499207, + "learning_rate": 2.6494418419978482e-05, + "loss": 0.0593, + "step": 1115 + }, + { + "epoch": 1.97, + "grad_norm": 0.08148845285177231, + "learning_rate": 2.641218064105341e-05, + "loss": 0.0084, + "step": 1116 + }, + { + "epoch": 1.98, + "grad_norm": 0.8642109036445618, + "learning_rate": 2.6330024857172192e-05, + "loss": 0.0766, + "step": 1117 + }, + { + "epoch": 1.98, + "grad_norm": 0.40509146451950073, + "learning_rate": 2.6247951353921485e-05, + "loss": 0.1148, + "step": 1118 + }, + { + "epoch": 1.98, + "grad_norm": 0.2981242537498474, + "learning_rate": 2.616596041660194e-05, + "loss": 0.0666, + "step": 1119 + }, + { + "epoch": 1.98, + "grad_norm": 0.21514151990413666, + "learning_rate": 2.6084052330227238e-05, + "loss": 0.0363, + "step": 1120 + }, + { + "epoch": 1.98, + "grad_norm": 0.10281267762184143, + "learning_rate": 2.6002227379522992e-05, + "loss": 0.0169, + "step": 1121 + }, + { + "epoch": 1.98, + "grad_norm": 0.3236760199069977, + "learning_rate": 2.5920485848925913e-05, + "loss": 0.0296, + "step": 1122 + }, + { + "epoch": 1.99, + "grad_norm": 0.22741632163524628, + "learning_rate": 2.5838828022582594e-05, + "loss": 0.023, + "step": 1123 + }, + { + "epoch": 1.99, + "grad_norm": 0.3826078772544861, + "learning_rate": 2.5757254184348778e-05, + "loss": 0.0744, + "step": 1124 + }, + { + "epoch": 1.99, + "grad_norm": 0.226307213306427, + "learning_rate": 2.5675764617788234e-05, + "loss": 0.0297, + "step": 1125 + }, + { + "epoch": 1.99, + "grad_norm": 0.31913021206855774, + "learning_rate": 2.5594359606171724e-05, + "loss": 0.0793, + "step": 1126 + }, + { + "epoch": 1.99, + "grad_norm": 0.2947479486465454, + "learning_rate": 2.5513039432476193e-05, + "loss": 0.1363, + "step": 1127 + }, + { + "epoch": 2.0, + "grad_norm": 0.26046791672706604, + "learning_rate": 2.5431804379383523e-05, + "loss": 0.0727, + "step": 1128 + }, + { + "epoch": 2.0, + "grad_norm": 0.1183793917298317, + "learning_rate": 2.535065472927983e-05, + "loss": 0.0139, + "step": 1129 + }, + { + "epoch": 2.0, + "grad_norm": 0.23370495438575745, + "learning_rate": 2.526959076425434e-05, + "loss": 0.0503, + "step": 1130 + }, + { + "epoch": 2.0, + "grad_norm": 0.14862821996212006, + "learning_rate": 2.5188612766098374e-05, + "loss": 0.0344, + "step": 1131 + }, + { + "epoch": 2.0, + "grad_norm": 0.15622428059577942, + "learning_rate": 2.5107721016304426e-05, + "loss": 0.0673, + "step": 1132 + }, + { + "epoch": 2.0, + "grad_norm": 0.1241433173418045, + "learning_rate": 2.502691579606523e-05, + "loss": 0.0182, + "step": 1133 + }, + { + "epoch": 2.01, + "grad_norm": 0.0929323211312294, + "learning_rate": 2.4946197386272662e-05, + "loss": 0.0139, + "step": 1134 + }, + { + "epoch": 2.01, + "grad_norm": 0.19808948040008545, + "learning_rate": 2.4865566067516893e-05, + "loss": 0.0334, + "step": 1135 + }, + { + "epoch": 2.01, + "grad_norm": 0.10557245463132858, + "learning_rate": 2.4785022120085305e-05, + "loss": 0.0161, + "step": 1136 + }, + { + "epoch": 2.01, + "eval_loss": 0.0705561563372612, + "eval_runtime": 14.6502, + "eval_samples_per_second": 32.559, + "eval_steps_per_second": 8.191, + "step": 1136 + }, + { + "epoch": 2.01, + "grad_norm": 0.2397042065858841, + "learning_rate": 2.470456582396156e-05, + "loss": 0.0217, + "step": 1137 + }, + { + "epoch": 2.01, + "grad_norm": 0.10175264626741409, + "learning_rate": 2.4624197458824695e-05, + "loss": 0.0169, + "step": 1138 + }, + { + "epoch": 2.02, + "grad_norm": 0.23746702075004578, + "learning_rate": 2.4543917304047986e-05, + "loss": 0.0349, + "step": 1139 + }, + { + "epoch": 2.02, + "grad_norm": 0.1582840532064438, + "learning_rate": 2.4463725638698183e-05, + "loss": 0.0249, + "step": 1140 + }, + { + "epoch": 2.02, + "grad_norm": 0.25601646304130554, + "learning_rate": 2.4383622741534345e-05, + "loss": 0.0246, + "step": 1141 + }, + { + "epoch": 2.02, + "grad_norm": 0.174038365483284, + "learning_rate": 2.4303608891006985e-05, + "loss": 0.0233, + "step": 1142 + }, + { + "epoch": 2.02, + "grad_norm": 0.28504037857055664, + "learning_rate": 2.422368436525711e-05, + "loss": 0.0621, + "step": 1143 + }, + { + "epoch": 2.02, + "grad_norm": 0.062362801283597946, + "learning_rate": 2.4143849442115157e-05, + "loss": 0.0086, + "step": 1144 + }, + { + "epoch": 2.03, + "grad_norm": 0.2063005268573761, + "learning_rate": 2.406410439910017e-05, + "loss": 0.0244, + "step": 1145 + }, + { + "epoch": 2.03, + "grad_norm": 0.204932302236557, + "learning_rate": 2.3984449513418688e-05, + "loss": 0.0483, + "step": 1146 + }, + { + "epoch": 2.03, + "grad_norm": 0.2597205936908722, + "learning_rate": 2.3904885061963846e-05, + "loss": 0.0302, + "step": 1147 + }, + { + "epoch": 2.03, + "grad_norm": 0.10868209600448608, + "learning_rate": 2.382541132131449e-05, + "loss": 0.0147, + "step": 1148 + }, + { + "epoch": 2.03, + "grad_norm": 0.2556356191635132, + "learning_rate": 2.374602856773404e-05, + "loss": 0.0306, + "step": 1149 + }, + { + "epoch": 2.03, + "grad_norm": 0.10802178084850311, + "learning_rate": 2.3666737077169726e-05, + "loss": 0.0154, + "step": 1150 + }, + { + "epoch": 2.04, + "grad_norm": 0.1953357756137848, + "learning_rate": 2.358753712525147e-05, + "loss": 0.0491, + "step": 1151 + }, + { + "epoch": 2.04, + "grad_norm": 0.45233485102653503, + "learning_rate": 2.350842898729099e-05, + "loss": 0.0534, + "step": 1152 + }, + { + "epoch": 2.04, + "grad_norm": 0.20398005843162537, + "learning_rate": 2.34294129382809e-05, + "loss": 0.0192, + "step": 1153 + }, + { + "epoch": 2.04, + "grad_norm": 0.06485007703304291, + "learning_rate": 2.335048925289362e-05, + "loss": 0.0069, + "step": 1154 + }, + { + "epoch": 2.04, + "grad_norm": 0.06255008280277252, + "learning_rate": 2.327165820548059e-05, + "loss": 0.0056, + "step": 1155 + }, + { + "epoch": 2.05, + "grad_norm": 0.23063381016254425, + "learning_rate": 2.3192920070071144e-05, + "loss": 0.045, + "step": 1156 + }, + { + "epoch": 2.05, + "grad_norm": 0.058569248765707016, + "learning_rate": 2.311427512037166e-05, + "loss": 0.0073, + "step": 1157 + }, + { + "epoch": 2.05, + "grad_norm": 0.154420405626297, + "learning_rate": 2.3035723629764615e-05, + "loss": 0.0148, + "step": 1158 + }, + { + "epoch": 2.05, + "grad_norm": 0.07581331580877304, + "learning_rate": 2.295726587130761e-05, + "loss": 0.0095, + "step": 1159 + }, + { + "epoch": 2.05, + "grad_norm": 0.04827249422669411, + "learning_rate": 2.287890211773238e-05, + "loss": 0.0053, + "step": 1160 + }, + { + "epoch": 2.05, + "grad_norm": 0.24536241590976715, + "learning_rate": 2.2800632641443903e-05, + "loss": 0.022, + "step": 1161 + }, + { + "epoch": 2.06, + "grad_norm": 0.041704196482896805, + "learning_rate": 2.2722457714519417e-05, + "loss": 0.0055, + "step": 1162 + }, + { + "epoch": 2.06, + "grad_norm": 0.07395283132791519, + "learning_rate": 2.2644377608707522e-05, + "loss": 0.0057, + "step": 1163 + }, + { + "epoch": 2.06, + "grad_norm": 0.2874985337257385, + "learning_rate": 2.2566392595427217e-05, + "loss": 0.0424, + "step": 1164 + }, + { + "epoch": 2.06, + "grad_norm": 0.08861269801855087, + "learning_rate": 2.2488502945766894e-05, + "loss": 0.0068, + "step": 1165 + }, + { + "epoch": 2.06, + "grad_norm": 0.11570141464471817, + "learning_rate": 2.2410708930483464e-05, + "loss": 0.0126, + "step": 1166 + }, + { + "epoch": 2.06, + "grad_norm": 0.07251165062189102, + "learning_rate": 2.2333010820001394e-05, + "loss": 0.0073, + "step": 1167 + }, + { + "epoch": 2.07, + "grad_norm": 0.17919155955314636, + "learning_rate": 2.2255408884411795e-05, + "loss": 0.0125, + "step": 1168 + }, + { + "epoch": 2.07, + "grad_norm": 0.5718133449554443, + "learning_rate": 2.217790339347146e-05, + "loss": 0.0601, + "step": 1169 + }, + { + "epoch": 2.07, + "grad_norm": 0.34238407015800476, + "learning_rate": 2.2100494616601893e-05, + "loss": 0.0269, + "step": 1170 + }, + { + "epoch": 2.07, + "grad_norm": 0.059634964913129807, + "learning_rate": 2.20231828228884e-05, + "loss": 0.0048, + "step": 1171 + }, + { + "epoch": 2.07, + "grad_norm": 0.12864679098129272, + "learning_rate": 2.194596828107921e-05, + "loss": 0.0081, + "step": 1172 + }, + { + "epoch": 2.08, + "grad_norm": 0.3110673129558563, + "learning_rate": 2.1868851259584427e-05, + "loss": 0.0372, + "step": 1173 + }, + { + "epoch": 2.08, + "grad_norm": 0.41840794682502747, + "learning_rate": 2.179183202647524e-05, + "loss": 0.0178, + "step": 1174 + }, + { + "epoch": 2.08, + "grad_norm": 0.32906386256217957, + "learning_rate": 2.171491084948278e-05, + "loss": 0.0499, + "step": 1175 + }, + { + "epoch": 2.08, + "grad_norm": 0.1345369517803192, + "learning_rate": 2.163808799599744e-05, + "loss": 0.0233, + "step": 1176 + }, + { + "epoch": 2.08, + "grad_norm": 0.45846185088157654, + "learning_rate": 2.1561363733067797e-05, + "loss": 0.0415, + "step": 1177 + }, + { + "epoch": 2.08, + "grad_norm": 0.27286604046821594, + "learning_rate": 2.1484738327399683e-05, + "loss": 0.04, + "step": 1178 + }, + { + "epoch": 2.09, + "grad_norm": 0.046739380806684494, + "learning_rate": 2.1408212045355293e-05, + "loss": 0.0034, + "step": 1179 + }, + { + "epoch": 2.09, + "grad_norm": 0.33844631910324097, + "learning_rate": 2.1331785152952243e-05, + "loss": 0.0575, + "step": 1180 + }, + { + "epoch": 2.09, + "grad_norm": 0.09294988960027695, + "learning_rate": 2.125545791586269e-05, + "loss": 0.0079, + "step": 1181 + }, + { + "epoch": 2.09, + "grad_norm": 0.18204770982265472, + "learning_rate": 2.1179230599412376e-05, + "loss": 0.0107, + "step": 1182 + }, + { + "epoch": 2.09, + "grad_norm": 0.04706999287009239, + "learning_rate": 2.1103103468579656e-05, + "loss": 0.0042, + "step": 1183 + }, + { + "epoch": 2.09, + "grad_norm": 0.022818991914391518, + "learning_rate": 2.102707678799463e-05, + "loss": 0.0021, + "step": 1184 + }, + { + "epoch": 2.1, + "grad_norm": 0.20160110294818878, + "learning_rate": 2.0951150821938275e-05, + "loss": 0.0121, + "step": 1185 + }, + { + "epoch": 2.1, + "grad_norm": 0.11171355098485947, + "learning_rate": 2.087532583434139e-05, + "loss": 0.0058, + "step": 1186 + }, + { + "epoch": 2.1, + "grad_norm": 0.08429128676652908, + "learning_rate": 2.0799602088783837e-05, + "loss": 0.0045, + "step": 1187 + }, + { + "epoch": 2.1, + "grad_norm": 0.2696090638637543, + "learning_rate": 2.0723979848493475e-05, + "loss": 0.0559, + "step": 1188 + }, + { + "epoch": 2.1, + "grad_norm": 0.1012461930513382, + "learning_rate": 2.064845937634533e-05, + "loss": 0.0042, + "step": 1189 + }, + { + "epoch": 2.11, + "grad_norm": 0.5466766953468323, + "learning_rate": 2.0573040934860717e-05, + "loss": 0.0582, + "step": 1190 + }, + { + "epoch": 2.11, + "grad_norm": 0.21249692142009735, + "learning_rate": 2.0497724786206184e-05, + "loss": 0.0079, + "step": 1191 + }, + { + "epoch": 2.11, + "grad_norm": 0.11077786982059479, + "learning_rate": 2.04225111921928e-05, + "loss": 0.0047, + "step": 1192 + }, + { + "epoch": 2.11, + "grad_norm": 0.11214567720890045, + "learning_rate": 2.0347400414275057e-05, + "loss": 0.0087, + "step": 1193 + }, + { + "epoch": 2.11, + "grad_norm": 0.16024614870548248, + "learning_rate": 2.027239271355005e-05, + "loss": 0.0126, + "step": 1194 + }, + { + "epoch": 2.11, + "grad_norm": 0.06119425222277641, + "learning_rate": 2.0197488350756616e-05, + "loss": 0.0033, + "step": 1195 + }, + { + "epoch": 2.12, + "grad_norm": 0.40313616394996643, + "learning_rate": 2.0122687586274296e-05, + "loss": 0.0365, + "step": 1196 + }, + { + "epoch": 2.12, + "grad_norm": 0.5600349307060242, + "learning_rate": 2.0047990680122585e-05, + "loss": 0.0398, + "step": 1197 + }, + { + "epoch": 2.12, + "grad_norm": 0.32200881838798523, + "learning_rate": 1.9973397891959893e-05, + "loss": 0.0526, + "step": 1198 + }, + { + "epoch": 2.12, + "grad_norm": 0.5268571972846985, + "learning_rate": 1.98989094810827e-05, + "loss": 0.0485, + "step": 1199 + }, + { + "epoch": 2.12, + "grad_norm": 0.1170785129070282, + "learning_rate": 1.98245257064247e-05, + "loss": 0.0095, + "step": 1200 + }, + { + "epoch": 2.12, + "grad_norm": 0.045479219406843185, + "learning_rate": 1.9750246826555802e-05, + "loss": 0.004, + "step": 1201 + }, + { + "epoch": 2.13, + "grad_norm": 0.13502654433250427, + "learning_rate": 1.9676073099681337e-05, + "loss": 0.0055, + "step": 1202 + }, + { + "epoch": 2.13, + "grad_norm": 0.3733961880207062, + "learning_rate": 1.9602004783641053e-05, + "loss": 0.0447, + "step": 1203 + }, + { + "epoch": 2.13, + "grad_norm": 0.5178970694541931, + "learning_rate": 1.952804213590829e-05, + "loss": 0.1063, + "step": 1204 + }, + { + "epoch": 2.13, + "grad_norm": 0.1365629881620407, + "learning_rate": 1.945418541358911e-05, + "loss": 0.0098, + "step": 1205 + }, + { + "epoch": 2.13, + "grad_norm": 0.17394568026065826, + "learning_rate": 1.9380434873421294e-05, + "loss": 0.0074, + "step": 1206 + }, + { + "epoch": 2.14, + "grad_norm": 0.31278324127197266, + "learning_rate": 1.9306790771773574e-05, + "loss": 0.0437, + "step": 1207 + }, + { + "epoch": 2.14, + "grad_norm": 0.27542349696159363, + "learning_rate": 1.9233253364644654e-05, + "loss": 0.0186, + "step": 1208 + }, + { + "epoch": 2.14, + "grad_norm": 0.2890969514846802, + "learning_rate": 1.9159822907662335e-05, + "loss": 0.0106, + "step": 1209 + }, + { + "epoch": 2.14, + "grad_norm": 0.051906730979681015, + "learning_rate": 1.9086499656082686e-05, + "loss": 0.0036, + "step": 1210 + }, + { + "epoch": 2.14, + "grad_norm": 0.07343268394470215, + "learning_rate": 1.9013283864789106e-05, + "loss": 0.0052, + "step": 1211 + }, + { + "epoch": 2.14, + "grad_norm": 0.20002156496047974, + "learning_rate": 1.894017578829141e-05, + "loss": 0.018, + "step": 1212 + }, + { + "epoch": 2.15, + "grad_norm": 0.30538609623908997, + "learning_rate": 1.8867175680725004e-05, + "loss": 0.0472, + "step": 1213 + }, + { + "epoch": 2.15, + "grad_norm": 0.060167331248521805, + "learning_rate": 1.879428379584995e-05, + "loss": 0.0051, + "step": 1214 + }, + { + "epoch": 2.15, + "grad_norm": 0.044497113674879074, + "learning_rate": 1.872150038705015e-05, + "loss": 0.0038, + "step": 1215 + }, + { + "epoch": 2.15, + "grad_norm": 0.14290861785411835, + "learning_rate": 1.8648825707332435e-05, + "loss": 0.007, + "step": 1216 + }, + { + "epoch": 2.15, + "grad_norm": 0.04411248490214348, + "learning_rate": 1.857626000932562e-05, + "loss": 0.0029, + "step": 1217 + }, + { + "epoch": 2.15, + "grad_norm": 0.2475292682647705, + "learning_rate": 1.850380354527972e-05, + "loss": 0.0192, + "step": 1218 + }, + { + "epoch": 2.16, + "grad_norm": 0.1666267067193985, + "learning_rate": 1.8431456567065e-05, + "loss": 0.0076, + "step": 1219 + }, + { + "epoch": 2.16, + "grad_norm": 0.1360848993062973, + "learning_rate": 1.835921932617119e-05, + "loss": 0.0062, + "step": 1220 + }, + { + "epoch": 2.16, + "grad_norm": 0.24183595180511475, + "learning_rate": 1.828709207370656e-05, + "loss": 0.0141, + "step": 1221 + }, + { + "epoch": 2.16, + "grad_norm": 0.35370147228240967, + "learning_rate": 1.821507506039693e-05, + "loss": 0.0409, + "step": 1222 + }, + { + "epoch": 2.16, + "grad_norm": 0.047938745468854904, + "learning_rate": 1.814316853658503e-05, + "loss": 0.0044, + "step": 1223 + }, + { + "epoch": 2.17, + "grad_norm": 0.05063376575708389, + "learning_rate": 1.8071372752229497e-05, + "loss": 0.0039, + "step": 1224 + }, + { + "epoch": 2.17, + "grad_norm": 0.2387128621339798, + "learning_rate": 1.7999687956903953e-05, + "loss": 0.044, + "step": 1225 + }, + { + "epoch": 2.17, + "grad_norm": 0.04712311923503876, + "learning_rate": 1.79281143997963e-05, + "loss": 0.0039, + "step": 1226 + }, + { + "epoch": 2.17, + "grad_norm": 0.057547133415937424, + "learning_rate": 1.7856652329707622e-05, + "loss": 0.0053, + "step": 1227 + }, + { + "epoch": 2.17, + "grad_norm": 0.0631386935710907, + "learning_rate": 1.778530199505159e-05, + "loss": 0.0026, + "step": 1228 + }, + { + "epoch": 2.17, + "grad_norm": 0.43415573239326477, + "learning_rate": 1.7714063643853422e-05, + "loss": 0.0358, + "step": 1229 + }, + { + "epoch": 2.18, + "grad_norm": 0.22691461443901062, + "learning_rate": 1.7642937523749038e-05, + "loss": 0.0341, + "step": 1230 + }, + { + "epoch": 2.18, + "grad_norm": 0.13253571093082428, + "learning_rate": 1.7571923881984237e-05, + "loss": 0.0069, + "step": 1231 + }, + { + "epoch": 2.18, + "grad_norm": 0.3073631227016449, + "learning_rate": 1.7501022965413822e-05, + "loss": 0.0199, + "step": 1232 + }, + { + "epoch": 2.18, + "grad_norm": 0.341001957654953, + "learning_rate": 1.7430235020500756e-05, + "loss": 0.0597, + "step": 1233 + }, + { + "epoch": 2.18, + "grad_norm": 0.4905489385128021, + "learning_rate": 1.735956029331532e-05, + "loss": 0.0349, + "step": 1234 + }, + { + "epoch": 2.18, + "grad_norm": 0.16399207711219788, + "learning_rate": 1.7288999029534176e-05, + "loss": 0.0099, + "step": 1235 + }, + { + "epoch": 2.19, + "grad_norm": 0.033206094056367874, + "learning_rate": 1.7218551474439587e-05, + "loss": 0.0025, + "step": 1236 + }, + { + "epoch": 2.19, + "grad_norm": 0.08004510402679443, + "learning_rate": 1.7148217872918582e-05, + "loss": 0.0061, + "step": 1237 + }, + { + "epoch": 2.19, + "grad_norm": 0.3009154498577118, + "learning_rate": 1.707799846946201e-05, + "loss": 0.0448, + "step": 1238 + }, + { + "epoch": 2.19, + "grad_norm": 0.44372421503067017, + "learning_rate": 1.7007893508163818e-05, + "loss": 0.0759, + "step": 1239 + }, + { + "epoch": 2.19, + "grad_norm": 0.08692881464958191, + "learning_rate": 1.6937903232720077e-05, + "loss": 0.0053, + "step": 1240 + }, + { + "epoch": 2.2, + "grad_norm": 0.505023717880249, + "learning_rate": 1.6868027886428194e-05, + "loss": 0.0654, + "step": 1241 + }, + { + "epoch": 2.2, + "grad_norm": 0.05224711447954178, + "learning_rate": 1.6798267712186123e-05, + "loss": 0.0035, + "step": 1242 + }, + { + "epoch": 2.2, + "grad_norm": 0.27778536081314087, + "learning_rate": 1.672862295249138e-05, + "loss": 0.0119, + "step": 1243 + }, + { + "epoch": 2.2, + "grad_norm": 0.19643987715244293, + "learning_rate": 1.6659093849440356e-05, + "loss": 0.0096, + "step": 1244 + }, + { + "epoch": 2.2, + "grad_norm": 0.40564844012260437, + "learning_rate": 1.6589680644727344e-05, + "loss": 0.0435, + "step": 1245 + }, + { + "epoch": 2.2, + "grad_norm": 0.11844424903392792, + "learning_rate": 1.6520383579643767e-05, + "loss": 0.0087, + "step": 1246 + }, + { + "epoch": 2.21, + "grad_norm": 0.23236432671546936, + "learning_rate": 1.645120289507738e-05, + "loss": 0.0208, + "step": 1247 + }, + { + "epoch": 2.21, + "grad_norm": 0.15374314785003662, + "learning_rate": 1.638213883151129e-05, + "loss": 0.007, + "step": 1248 + }, + { + "epoch": 2.21, + "grad_norm": 0.13705483078956604, + "learning_rate": 1.631319162902331e-05, + "loss": 0.0111, + "step": 1249 + }, + { + "epoch": 2.21, + "grad_norm": 0.3826940953731537, + "learning_rate": 1.6244361527284952e-05, + "loss": 0.0319, + "step": 1250 + }, + { + "epoch": 2.21, + "grad_norm": 0.061355918645858765, + "learning_rate": 1.617564876556067e-05, + "loss": 0.0063, + "step": 1251 + }, + { + "epoch": 2.21, + "grad_norm": 0.08094111829996109, + "learning_rate": 1.6107053582707082e-05, + "loss": 0.0075, + "step": 1252 + }, + { + "epoch": 2.22, + "grad_norm": 0.3642162084579468, + "learning_rate": 1.6038576217172018e-05, + "loss": 0.0442, + "step": 1253 + }, + { + "epoch": 2.22, + "grad_norm": 0.03249219432473183, + "learning_rate": 1.597021690699382e-05, + "loss": 0.0033, + "step": 1254 + }, + { + "epoch": 2.22, + "grad_norm": 0.09141464531421661, + "learning_rate": 1.5901975889800386e-05, + "loss": 0.0089, + "step": 1255 + }, + { + "epoch": 2.22, + "grad_norm": 0.3110590875148773, + "learning_rate": 1.5833853402808435e-05, + "loss": 0.021, + "step": 1256 + }, + { + "epoch": 2.22, + "grad_norm": 0.1551726758480072, + "learning_rate": 1.5765849682822686e-05, + "loss": 0.0088, + "step": 1257 + }, + { + "epoch": 2.23, + "grad_norm": 0.018826603889465332, + "learning_rate": 1.5697964966234945e-05, + "loss": 0.0017, + "step": 1258 + }, + { + "epoch": 2.23, + "grad_norm": 0.3857581913471222, + "learning_rate": 1.563019948902342e-05, + "loss": 0.0241, + "step": 1259 + }, + { + "epoch": 2.23, + "grad_norm": 0.05305016040802002, + "learning_rate": 1.5562553486751743e-05, + "loss": 0.0041, + "step": 1260 + }, + { + "epoch": 2.23, + "grad_norm": 0.28876179456710815, + "learning_rate": 1.549502719456827e-05, + "loss": 0.0202, + "step": 1261 + }, + { + "epoch": 2.23, + "grad_norm": 0.17455187439918518, + "learning_rate": 1.5427620847205236e-05, + "loss": 0.0172, + "step": 1262 + }, + { + "epoch": 2.23, + "grad_norm": 0.21817126870155334, + "learning_rate": 1.5360334678977933e-05, + "loss": 0.0145, + "step": 1263 + }, + { + "epoch": 2.24, + "grad_norm": 0.0744200348854065, + "learning_rate": 1.5293168923783857e-05, + "loss": 0.0048, + "step": 1264 + }, + { + "epoch": 2.24, + "grad_norm": 0.07314429432153702, + "learning_rate": 1.5226123815101951e-05, + "loss": 0.004, + "step": 1265 + }, + { + "epoch": 2.24, + "grad_norm": 0.04507233574986458, + "learning_rate": 1.5159199585991746e-05, + "loss": 0.0033, + "step": 1266 + }, + { + "epoch": 2.24, + "grad_norm": 0.2984592616558075, + "learning_rate": 1.5092396469092618e-05, + "loss": 0.0261, + "step": 1267 + }, + { + "epoch": 2.24, + "grad_norm": 0.04742783308029175, + "learning_rate": 1.5025714696622934e-05, + "loss": 0.002, + "step": 1268 + }, + { + "epoch": 2.25, + "grad_norm": 1.086358904838562, + "learning_rate": 1.4959154500379213e-05, + "loss": 0.0503, + "step": 1269 + }, + { + "epoch": 2.25, + "grad_norm": 0.2948329448699951, + "learning_rate": 1.4892716111735378e-05, + "loss": 0.0511, + "step": 1270 + }, + { + "epoch": 2.25, + "grad_norm": 0.0721355527639389, + "learning_rate": 1.4826399761641907e-05, + "loss": 0.0044, + "step": 1271 + }, + { + "epoch": 2.25, + "grad_norm": 0.5475502014160156, + "learning_rate": 1.4760205680625083e-05, + "loss": 0.0624, + "step": 1272 + }, + { + "epoch": 2.25, + "grad_norm": 0.06849555671215057, + "learning_rate": 1.4694134098786183e-05, + "loss": 0.0043, + "step": 1273 + }, + { + "epoch": 2.25, + "grad_norm": 0.14424243569374084, + "learning_rate": 1.4628185245800568e-05, + "loss": 0.0912, + "step": 1274 + }, + { + "epoch": 2.26, + "grad_norm": 0.026207152754068375, + "learning_rate": 1.4562359350917054e-05, + "loss": 0.0018, + "step": 1275 + }, + { + "epoch": 2.26, + "grad_norm": 0.13350637257099152, + "learning_rate": 1.4496656642957024e-05, + "loss": 0.0084, + "step": 1276 + }, + { + "epoch": 2.26, + "grad_norm": 0.19830527901649475, + "learning_rate": 1.443107735031361e-05, + "loss": 0.0391, + "step": 1277 + }, + { + "epoch": 2.26, + "grad_norm": 0.05153921991586685, + "learning_rate": 1.4365621700950988e-05, + "loss": 0.0024, + "step": 1278 + }, + { + "epoch": 2.26, + "eval_loss": 0.09353172779083252, + "eval_runtime": 14.6748, + "eval_samples_per_second": 32.505, + "eval_steps_per_second": 8.177, + "step": 1278 + }, + { + "epoch": 2.26, + "grad_norm": 0.5911564230918884, + "learning_rate": 1.4300289922403442e-05, + "loss": 0.0312, + "step": 1279 + }, + { + "epoch": 2.26, + "grad_norm": 0.03525357320904732, + "learning_rate": 1.423508224177474e-05, + "loss": 0.0027, + "step": 1280 + }, + { + "epoch": 2.27, + "grad_norm": 0.41011494398117065, + "learning_rate": 1.4169998885737257e-05, + "loss": 0.0307, + "step": 1281 + }, + { + "epoch": 2.27, + "grad_norm": 0.28563350439071655, + "learning_rate": 1.4105040080531162e-05, + "loss": 0.0218, + "step": 1282 + }, + { + "epoch": 2.27, + "grad_norm": 0.17765937745571136, + "learning_rate": 1.4040206051963679e-05, + "loss": 0.0084, + "step": 1283 + }, + { + "epoch": 2.27, + "grad_norm": 0.38896211981773376, + "learning_rate": 1.3975497025408285e-05, + "loss": 0.0177, + "step": 1284 + }, + { + "epoch": 2.27, + "grad_norm": 0.13160689175128937, + "learning_rate": 1.3910913225803945e-05, + "loss": 0.0062, + "step": 1285 + }, + { + "epoch": 2.28, + "grad_norm": 0.03359553962945938, + "learning_rate": 1.384645487765432e-05, + "loss": 0.002, + "step": 1286 + }, + { + "epoch": 2.28, + "grad_norm": 0.12169768661260605, + "learning_rate": 1.3782122205026959e-05, + "loss": 0.0064, + "step": 1287 + }, + { + "epoch": 2.28, + "grad_norm": 0.13100889325141907, + "learning_rate": 1.371791543155253e-05, + "loss": 0.0066, + "step": 1288 + }, + { + "epoch": 2.28, + "grad_norm": 0.4983580410480499, + "learning_rate": 1.3653834780424114e-05, + "loss": 0.017, + "step": 1289 + }, + { + "epoch": 2.28, + "grad_norm": 0.052320413291454315, + "learning_rate": 1.35898804743963e-05, + "loss": 0.0038, + "step": 1290 + }, + { + "epoch": 2.28, + "grad_norm": 0.25454479455947876, + "learning_rate": 1.3526052735784539e-05, + "loss": 0.0195, + "step": 1291 + }, + { + "epoch": 2.29, + "grad_norm": 0.29201680421829224, + "learning_rate": 1.3462351786464284e-05, + "loss": 0.0573, + "step": 1292 + }, + { + "epoch": 2.29, + "grad_norm": 0.5512139797210693, + "learning_rate": 1.3398777847870236e-05, + "loss": 0.0327, + "step": 1293 + }, + { + "epoch": 2.29, + "grad_norm": 0.025316596031188965, + "learning_rate": 1.3335331140995628e-05, + "loss": 0.002, + "step": 1294 + }, + { + "epoch": 2.29, + "grad_norm": 0.4447712004184723, + "learning_rate": 1.3272011886391366e-05, + "loss": 0.0725, + "step": 1295 + }, + { + "epoch": 2.29, + "grad_norm": 0.20218294858932495, + "learning_rate": 1.3208820304165365e-05, + "loss": 0.0086, + "step": 1296 + }, + { + "epoch": 2.29, + "grad_norm": 0.24967634677886963, + "learning_rate": 1.314575661398168e-05, + "loss": 0.0172, + "step": 1297 + }, + { + "epoch": 2.3, + "grad_norm": 0.5479951500892639, + "learning_rate": 1.3082821035059812e-05, + "loss": 0.0272, + "step": 1298 + }, + { + "epoch": 2.3, + "grad_norm": 0.04600968584418297, + "learning_rate": 1.3020013786173946e-05, + "loss": 0.0037, + "step": 1299 + }, + { + "epoch": 2.3, + "grad_norm": 0.5146220922470093, + "learning_rate": 1.2957335085652129e-05, + "loss": 0.017, + "step": 1300 + }, + { + "epoch": 2.3, + "grad_norm": 0.19116492569446564, + "learning_rate": 1.289478515137561e-05, + "loss": 0.0113, + "step": 1301 + }, + { + "epoch": 2.3, + "grad_norm": 0.04863365367054939, + "learning_rate": 1.283236420077798e-05, + "loss": 0.0035, + "step": 1302 + }, + { + "epoch": 2.31, + "grad_norm": 0.3398780822753906, + "learning_rate": 1.2770072450844462e-05, + "loss": 0.0237, + "step": 1303 + }, + { + "epoch": 2.31, + "grad_norm": 0.04249873757362366, + "learning_rate": 1.2707910118111194e-05, + "loss": 0.0025, + "step": 1304 + }, + { + "epoch": 2.31, + "grad_norm": 0.2666451632976532, + "learning_rate": 1.2645877418664392e-05, + "loss": 0.0115, + "step": 1305 + }, + { + "epoch": 2.31, + "grad_norm": 0.016322264447808266, + "learning_rate": 1.25839745681397e-05, + "loss": 0.001, + "step": 1306 + }, + { + "epoch": 2.31, + "grad_norm": 0.4568755328655243, + "learning_rate": 1.2522201781721338e-05, + "loss": 0.0169, + "step": 1307 + }, + { + "epoch": 2.31, + "grad_norm": 0.39883822202682495, + "learning_rate": 1.2460559274141409e-05, + "loss": 0.0191, + "step": 1308 + }, + { + "epoch": 2.32, + "grad_norm": 0.1871948093175888, + "learning_rate": 1.2399047259679181e-05, + "loss": 0.014, + "step": 1309 + }, + { + "epoch": 2.32, + "grad_norm": 0.2893843352794647, + "learning_rate": 1.2337665952160266e-05, + "loss": 0.0124, + "step": 1310 + }, + { + "epoch": 2.32, + "grad_norm": 0.08608860522508621, + "learning_rate": 1.227641556495595e-05, + "loss": 0.0059, + "step": 1311 + }, + { + "epoch": 2.32, + "grad_norm": 0.06062796339392662, + "learning_rate": 1.2215296310982404e-05, + "loss": 0.0043, + "step": 1312 + }, + { + "epoch": 2.32, + "grad_norm": 0.022510148584842682, + "learning_rate": 1.2154308402699933e-05, + "loss": 0.0017, + "step": 1313 + }, + { + "epoch": 2.32, + "grad_norm": 0.0417412631213665, + "learning_rate": 1.2093452052112309e-05, + "loss": 0.003, + "step": 1314 + }, + { + "epoch": 2.33, + "grad_norm": 0.015822090208530426, + "learning_rate": 1.203272747076598e-05, + "loss": 0.0012, + "step": 1315 + }, + { + "epoch": 2.33, + "grad_norm": 0.01757388934493065, + "learning_rate": 1.1972134869749324e-05, + "loss": 0.0011, + "step": 1316 + }, + { + "epoch": 2.33, + "grad_norm": 0.057761795818805695, + "learning_rate": 1.191167445969193e-05, + "loss": 0.0037, + "step": 1317 + }, + { + "epoch": 2.33, + "grad_norm": 0.04784402251243591, + "learning_rate": 1.1851346450763879e-05, + "loss": 0.0037, + "step": 1318 + }, + { + "epoch": 2.33, + "grad_norm": 0.01795756258070469, + "learning_rate": 1.179115105267502e-05, + "loss": 0.0014, + "step": 1319 + }, + { + "epoch": 2.34, + "grad_norm": 0.26585298776626587, + "learning_rate": 1.1731088474674234e-05, + "loss": 0.0103, + "step": 1320 + }, + { + "epoch": 2.34, + "grad_norm": 0.02198723517358303, + "learning_rate": 1.1671158925548625e-05, + "loss": 0.0013, + "step": 1321 + }, + { + "epoch": 2.34, + "grad_norm": 0.25198501348495483, + "learning_rate": 1.161136261362296e-05, + "loss": 0.0118, + "step": 1322 + }, + { + "epoch": 2.34, + "grad_norm": 0.09213818609714508, + "learning_rate": 1.1551699746758787e-05, + "loss": 0.0045, + "step": 1323 + }, + { + "epoch": 2.34, + "grad_norm": 0.6889921426773071, + "learning_rate": 1.1492170532353813e-05, + "loss": 0.0803, + "step": 1324 + }, + { + "epoch": 2.34, + "grad_norm": 0.27474120259284973, + "learning_rate": 1.1432775177341165e-05, + "loss": 0.0163, + "step": 1325 + }, + { + "epoch": 2.35, + "grad_norm": 0.03471897542476654, + "learning_rate": 1.1373513888188563e-05, + "loss": 0.0025, + "step": 1326 + }, + { + "epoch": 2.35, + "grad_norm": 0.188897967338562, + "learning_rate": 1.1314386870897792e-05, + "loss": 0.0059, + "step": 1327 + }, + { + "epoch": 2.35, + "grad_norm": 0.2922353446483612, + "learning_rate": 1.1255394331003855e-05, + "loss": 0.0118, + "step": 1328 + }, + { + "epoch": 2.35, + "grad_norm": 0.06120475009083748, + "learning_rate": 1.1196536473574276e-05, + "loss": 0.0038, + "step": 1329 + }, + { + "epoch": 2.35, + "grad_norm": 0.06912042200565338, + "learning_rate": 1.11378135032084e-05, + "loss": 0.0022, + "step": 1330 + }, + { + "epoch": 2.35, + "grad_norm": 0.12412311881780624, + "learning_rate": 1.1079225624036688e-05, + "loss": 0.0074, + "step": 1331 + }, + { + "epoch": 2.36, + "grad_norm": 0.16317041218280792, + "learning_rate": 1.1020773039720017e-05, + "loss": 0.0061, + "step": 1332 + }, + { + "epoch": 2.36, + "grad_norm": 0.025967106223106384, + "learning_rate": 1.0962455953448953e-05, + "loss": 0.0018, + "step": 1333 + }, + { + "epoch": 2.36, + "grad_norm": 0.22429053485393524, + "learning_rate": 1.090427456794304e-05, + "loss": 0.0641, + "step": 1334 + }, + { + "epoch": 2.36, + "grad_norm": 0.3978908658027649, + "learning_rate": 1.0846229085450099e-05, + "loss": 0.0607, + "step": 1335 + }, + { + "epoch": 2.36, + "grad_norm": 0.08564822375774384, + "learning_rate": 1.0788319707745526e-05, + "loss": 0.0052, + "step": 1336 + }, + { + "epoch": 2.37, + "grad_norm": 0.05615312233567238, + "learning_rate": 1.0730546636131622e-05, + "loss": 0.0031, + "step": 1337 + }, + { + "epoch": 2.37, + "grad_norm": 0.5224884152412415, + "learning_rate": 1.0672910071436865e-05, + "loss": 0.0297, + "step": 1338 + }, + { + "epoch": 2.37, + "grad_norm": 0.034977857023477554, + "learning_rate": 1.0615410214015187e-05, + "loss": 0.002, + "step": 1339 + }, + { + "epoch": 2.37, + "grad_norm": 0.09802459180355072, + "learning_rate": 1.0558047263745297e-05, + "loss": 0.0038, + "step": 1340 + }, + { + "epoch": 2.37, + "grad_norm": 0.18704549968242645, + "learning_rate": 1.0500821420030049e-05, + "loss": 0.0055, + "step": 1341 + }, + { + "epoch": 2.37, + "grad_norm": 0.2677512764930725, + "learning_rate": 1.0443732881795615e-05, + "loss": 0.0079, + "step": 1342 + }, + { + "epoch": 2.38, + "grad_norm": 0.6220883727073669, + "learning_rate": 1.0386781847490951e-05, + "loss": 0.0287, + "step": 1343 + }, + { + "epoch": 2.38, + "grad_norm": 0.0934520959854126, + "learning_rate": 1.0329968515086968e-05, + "loss": 0.0031, + "step": 1344 + }, + { + "epoch": 2.38, + "grad_norm": 0.2891664505004883, + "learning_rate": 1.0273293082075913e-05, + "loss": 0.0116, + "step": 1345 + }, + { + "epoch": 2.38, + "grad_norm": 0.24885062873363495, + "learning_rate": 1.0216755745470702e-05, + "loss": 0.0203, + "step": 1346 + }, + { + "epoch": 2.38, + "grad_norm": 0.08252017945051193, + "learning_rate": 1.016035670180417e-05, + "loss": 0.0051, + "step": 1347 + }, + { + "epoch": 2.38, + "grad_norm": 0.07532214373350143, + "learning_rate": 1.0104096147128455e-05, + "loss": 0.0048, + "step": 1348 + }, + { + "epoch": 2.39, + "grad_norm": 0.6588844060897827, + "learning_rate": 1.0047974277014267e-05, + "loss": 0.0393, + "step": 1349 + }, + { + "epoch": 2.39, + "grad_norm": 0.6048510074615479, + "learning_rate": 9.991991286550207e-06, + "loss": 0.0769, + "step": 1350 + }, + { + "epoch": 2.39, + "grad_norm": 0.5653522610664368, + "learning_rate": 9.936147370342164e-06, + "loss": 0.0397, + "step": 1351 + }, + { + "epoch": 2.39, + "grad_norm": 0.3729051649570465, + "learning_rate": 9.880442722512517e-06, + "loss": 0.0588, + "step": 1352 + }, + { + "epoch": 2.39, + "grad_norm": 0.17098484933376312, + "learning_rate": 9.824877536699584e-06, + "loss": 0.0085, + "step": 1353 + }, + { + "epoch": 2.4, + "grad_norm": 0.40318816900253296, + "learning_rate": 9.769452006056855e-06, + "loss": 0.0921, + "step": 1354 + }, + { + "epoch": 2.4, + "grad_norm": 0.25994887948036194, + "learning_rate": 9.71416632325235e-06, + "loss": 0.024, + "step": 1355 + }, + { + "epoch": 2.4, + "grad_norm": 0.10480910539627075, + "learning_rate": 9.659020680468e-06, + "loss": 0.0034, + "step": 1356 + }, + { + "epoch": 2.4, + "grad_norm": 0.3533475399017334, + "learning_rate": 9.604015269398876e-06, + "loss": 0.0273, + "step": 1357 + }, + { + "epoch": 2.4, + "grad_norm": 0.3021422326564789, + "learning_rate": 9.549150281252633e-06, + "loss": 0.0189, + "step": 1358 + }, + { + "epoch": 2.4, + "grad_norm": 0.06169616058468819, + "learning_rate": 9.49442590674876e-06, + "loss": 0.0038, + "step": 1359 + }, + { + "epoch": 2.41, + "grad_norm": 0.30749934911727905, + "learning_rate": 9.439842336117954e-06, + "loss": 0.0244, + "step": 1360 + }, + { + "epoch": 2.41, + "grad_norm": 0.30821648240089417, + "learning_rate": 9.385399759101482e-06, + "loss": 0.0239, + "step": 1361 + }, + { + "epoch": 2.41, + "grad_norm": 0.8102036714553833, + "learning_rate": 9.331098364950453e-06, + "loss": 0.0128, + "step": 1362 + }, + { + "epoch": 2.41, + "grad_norm": 0.09860553592443466, + "learning_rate": 9.276938342425246e-06, + "loss": 0.0063, + "step": 1363 + }, + { + "epoch": 2.41, + "grad_norm": 0.2352733463048935, + "learning_rate": 9.222919879794772e-06, + "loss": 0.0073, + "step": 1364 + }, + { + "epoch": 2.41, + "grad_norm": 0.3346589505672455, + "learning_rate": 9.169043164835867e-06, + "loss": 0.009, + "step": 1365 + }, + { + "epoch": 2.42, + "grad_norm": 0.06581712514162064, + "learning_rate": 9.115308384832637e-06, + "loss": 0.003, + "step": 1366 + }, + { + "epoch": 2.42, + "grad_norm": 0.22018809616565704, + "learning_rate": 9.061715726575825e-06, + "loss": 0.0117, + "step": 1367 + }, + { + "epoch": 2.42, + "grad_norm": 0.04437447339296341, + "learning_rate": 9.008265376362079e-06, + "loss": 0.0036, + "step": 1368 + }, + { + "epoch": 2.42, + "grad_norm": 0.03423381224274635, + "learning_rate": 8.954957519993401e-06, + "loss": 0.0016, + "step": 1369 + }, + { + "epoch": 2.42, + "grad_norm": 0.09860479086637497, + "learning_rate": 8.901792342776437e-06, + "loss": 0.0063, + "step": 1370 + }, + { + "epoch": 2.43, + "grad_norm": 0.5376794934272766, + "learning_rate": 8.848770029521875e-06, + "loss": 0.0466, + "step": 1371 + }, + { + "epoch": 2.43, + "grad_norm": 0.06586974114179611, + "learning_rate": 8.795890764543818e-06, + "loss": 0.0036, + "step": 1372 + }, + { + "epoch": 2.43, + "grad_norm": 0.6043423414230347, + "learning_rate": 8.74315473165902e-06, + "loss": 0.041, + "step": 1373 + }, + { + "epoch": 2.43, + "grad_norm": 0.3147992789745331, + "learning_rate": 8.690562114186423e-06, + "loss": 0.0417, + "step": 1374 + }, + { + "epoch": 2.43, + "grad_norm": 0.02375558577477932, + "learning_rate": 8.638113094946381e-06, + "loss": 0.0018, + "step": 1375 + }, + { + "epoch": 2.43, + "grad_norm": 0.34309032559394836, + "learning_rate": 8.585807856260108e-06, + "loss": 0.0535, + "step": 1376 + }, + { + "epoch": 2.44, + "grad_norm": 0.37610551714897156, + "learning_rate": 8.533646579949034e-06, + "loss": 0.0802, + "step": 1377 + }, + { + "epoch": 2.44, + "grad_norm": 0.2780384421348572, + "learning_rate": 8.481629447334066e-06, + "loss": 0.0135, + "step": 1378 + }, + { + "epoch": 2.44, + "grad_norm": 0.5748811364173889, + "learning_rate": 8.429756639235136e-06, + "loss": 0.0205, + "step": 1379 + }, + { + "epoch": 2.44, + "grad_norm": 0.18549934029579163, + "learning_rate": 8.37802833597045e-06, + "loss": 0.0081, + "step": 1380 + }, + { + "epoch": 2.44, + "grad_norm": 0.5937396287918091, + "learning_rate": 8.326444717355875e-06, + "loss": 0.0478, + "step": 1381 + }, + { + "epoch": 2.44, + "grad_norm": 0.02896355651319027, + "learning_rate": 8.275005962704346e-06, + "loss": 0.0022, + "step": 1382 + }, + { + "epoch": 2.45, + "grad_norm": 0.5288581252098083, + "learning_rate": 8.223712250825216e-06, + "loss": 0.0228, + "step": 1383 + }, + { + "epoch": 2.45, + "grad_norm": 0.05525179207324982, + "learning_rate": 8.172563760023665e-06, + "loss": 0.0039, + "step": 1384 + }, + { + "epoch": 2.45, + "grad_norm": 0.31956082582473755, + "learning_rate": 8.121560668100064e-06, + "loss": 0.0467, + "step": 1385 + }, + { + "epoch": 2.45, + "grad_norm": 0.3030397593975067, + "learning_rate": 8.070703152349335e-06, + "loss": 0.0457, + "step": 1386 + }, + { + "epoch": 2.45, + "grad_norm": 0.46824321150779724, + "learning_rate": 8.019991389560349e-06, + "loss": 0.0565, + "step": 1387 + }, + { + "epoch": 2.46, + "grad_norm": 0.3213633894920349, + "learning_rate": 7.969425556015326e-06, + "loss": 0.0139, + "step": 1388 + }, + { + "epoch": 2.46, + "grad_norm": 0.06180056184530258, + "learning_rate": 7.91900582748923e-06, + "loss": 0.0033, + "step": 1389 + }, + { + "epoch": 2.46, + "grad_norm": 0.4502154588699341, + "learning_rate": 7.868732379249121e-06, + "loss": 0.0215, + "step": 1390 + }, + { + "epoch": 2.46, + "grad_norm": 0.35737964510917664, + "learning_rate": 7.818605386053574e-06, + "loss": 0.0273, + "step": 1391 + }, + { + "epoch": 2.46, + "grad_norm": 0.2905728220939636, + "learning_rate": 7.768625022152038e-06, + "loss": 0.0083, + "step": 1392 + }, + { + "epoch": 2.46, + "grad_norm": 0.4154905378818512, + "learning_rate": 7.718791461284302e-06, + "loss": 0.0379, + "step": 1393 + }, + { + "epoch": 2.47, + "grad_norm": 0.25050264596939087, + "learning_rate": 7.669104876679795e-06, + "loss": 0.0142, + "step": 1394 + }, + { + "epoch": 2.47, + "grad_norm": 0.05501753091812134, + "learning_rate": 7.619565441057075e-06, + "loss": 0.0046, + "step": 1395 + }, + { + "epoch": 2.47, + "grad_norm": 0.14321210980415344, + "learning_rate": 7.570173326623154e-06, + "loss": 0.0058, + "step": 1396 + }, + { + "epoch": 2.47, + "grad_norm": 0.22147847712039948, + "learning_rate": 7.5209287050729385e-06, + "loss": 0.0244, + "step": 1397 + }, + { + "epoch": 2.47, + "grad_norm": 0.05153148993849754, + "learning_rate": 7.4718317475886534e-06, + "loss": 0.0055, + "step": 1398 + }, + { + "epoch": 2.48, + "grad_norm": 0.3048984706401825, + "learning_rate": 7.422882624839178e-06, + "loss": 0.0162, + "step": 1399 + }, + { + "epoch": 2.48, + "grad_norm": 0.17858561873435974, + "learning_rate": 7.37408150697953e-06, + "loss": 0.0102, + "step": 1400 + }, + { + "epoch": 2.48, + "grad_norm": 0.267673134803772, + "learning_rate": 7.325428563650222e-06, + "loss": 0.0265, + "step": 1401 + }, + { + "epoch": 2.48, + "grad_norm": 0.05871307849884033, + "learning_rate": 7.2769239639766675e-06, + "loss": 0.0044, + "step": 1402 + }, + { + "epoch": 2.48, + "grad_norm": 0.3837791085243225, + "learning_rate": 7.2285678765686645e-06, + "loss": 0.0513, + "step": 1403 + }, + { + "epoch": 2.48, + "grad_norm": 0.13015437126159668, + "learning_rate": 7.180360469519714e-06, + "loss": 0.0079, + "step": 1404 + }, + { + "epoch": 2.49, + "grad_norm": 0.3498382568359375, + "learning_rate": 7.132301910406503e-06, + "loss": 0.0323, + "step": 1405 + }, + { + "epoch": 2.49, + "grad_norm": 0.1156793013215065, + "learning_rate": 7.084392366288295e-06, + "loss": 0.0075, + "step": 1406 + }, + { + "epoch": 2.49, + "grad_norm": 0.3820931613445282, + "learning_rate": 7.036632003706328e-06, + "loss": 0.0135, + "step": 1407 + }, + { + "epoch": 2.49, + "grad_norm": 0.1618475466966629, + "learning_rate": 6.989020988683315e-06, + "loss": 0.0088, + "step": 1408 + }, + { + "epoch": 2.49, + "grad_norm": 0.06308943033218384, + "learning_rate": 6.941559486722748e-06, + "loss": 0.0045, + "step": 1409 + }, + { + "epoch": 2.49, + "grad_norm": 0.018122445791959763, + "learning_rate": 6.894247662808456e-06, + "loss": 0.0014, + "step": 1410 + }, + { + "epoch": 2.5, + "grad_norm": 0.20721711218357086, + "learning_rate": 6.8470856814039145e-06, + "loss": 0.012, + "step": 1411 + }, + { + "epoch": 2.5, + "grad_norm": 0.046544626355171204, + "learning_rate": 6.800073706451721e-06, + "loss": 0.0041, + "step": 1412 + }, + { + "epoch": 2.5, + "grad_norm": 0.041799936443567276, + "learning_rate": 6.753211901373064e-06, + "loss": 0.0028, + "step": 1413 + }, + { + "epoch": 2.5, + "grad_norm": 0.29025766253471375, + "learning_rate": 6.7065004290670755e-06, + "loss": 0.0294, + "step": 1414 + }, + { + "epoch": 2.5, + "grad_norm": 0.3820174038410187, + "learning_rate": 6.659939451910341e-06, + "loss": 0.0126, + "step": 1415 + }, + { + "epoch": 2.51, + "grad_norm": 0.0341438353061676, + "learning_rate": 6.613529131756285e-06, + "loss": 0.0029, + "step": 1416 + }, + { + "epoch": 2.51, + "grad_norm": 0.07663692533969879, + "learning_rate": 6.567269629934614e-06, + "loss": 0.0069, + "step": 1417 + }, + { + "epoch": 2.51, + "grad_norm": 0.5244008898735046, + "learning_rate": 6.5211611072507785e-06, + "loss": 0.0467, + "step": 1418 + }, + { + "epoch": 2.51, + "grad_norm": 0.3728178143501282, + "learning_rate": 6.475203723985418e-06, + "loss": 0.0164, + "step": 1419 + }, + { + "epoch": 2.51, + "grad_norm": 0.06414547562599182, + "learning_rate": 6.429397639893758e-06, + "loss": 0.005, + "step": 1420 + }, + { + "epoch": 2.51, + "eval_loss": 0.09358127415180206, + "eval_runtime": 14.6568, + "eval_samples_per_second": 32.545, + "eval_steps_per_second": 8.187, + "step": 1420 + }, + { + "epoch": 2.51, + "grad_norm": 0.04787755757570267, + "learning_rate": 6.38374301420509e-06, + "loss": 0.0032, + "step": 1421 + }, + { + "epoch": 2.52, + "grad_norm": 0.03525804355740547, + "learning_rate": 6.338240005622209e-06, + "loss": 0.0019, + "step": 1422 + }, + { + "epoch": 2.52, + "grad_norm": 0.04471929371356964, + "learning_rate": 6.292888772320882e-06, + "loss": 0.0026, + "step": 1423 + }, + { + "epoch": 2.52, + "grad_norm": 0.048265084624290466, + "learning_rate": 6.247689471949292e-06, + "loss": 0.0037, + "step": 1424 + }, + { + "epoch": 2.52, + "grad_norm": 0.3734210729598999, + "learning_rate": 6.202642261627411e-06, + "loss": 0.0361, + "step": 1425 + }, + { + "epoch": 2.52, + "grad_norm": 0.028593264520168304, + "learning_rate": 6.157747297946609e-06, + "loss": 0.0018, + "step": 1426 + }, + { + "epoch": 2.52, + "grad_norm": 0.1298915147781372, + "learning_rate": 6.113004736968952e-06, + "loss": 0.0046, + "step": 1427 + }, + { + "epoch": 2.53, + "grad_norm": 0.019907642155885696, + "learning_rate": 6.068414734226774e-06, + "loss": 0.0016, + "step": 1428 + }, + { + "epoch": 2.53, + "grad_norm": 0.030533993616700172, + "learning_rate": 6.023977444722096e-06, + "loss": 0.0021, + "step": 1429 + }, + { + "epoch": 2.53, + "grad_norm": 0.3833365738391876, + "learning_rate": 5.979693022926025e-06, + "loss": 0.0242, + "step": 1430 + }, + { + "epoch": 2.53, + "grad_norm": 0.39871448278427124, + "learning_rate": 5.935561622778335e-06, + "loss": 0.0412, + "step": 1431 + }, + { + "epoch": 2.53, + "grad_norm": 0.31123584508895874, + "learning_rate": 5.891583397686862e-06, + "loss": 0.0145, + "step": 1432 + }, + { + "epoch": 2.54, + "grad_norm": 0.4028448760509491, + "learning_rate": 5.847758500526956e-06, + "loss": 0.0127, + "step": 1433 + }, + { + "epoch": 2.54, + "grad_norm": 0.028875384479761124, + "learning_rate": 5.804087083641002e-06, + "loss": 0.0021, + "step": 1434 + }, + { + "epoch": 2.54, + "grad_norm": 0.46391549706459045, + "learning_rate": 5.7605692988378255e-06, + "loss": 0.0627, + "step": 1435 + }, + { + "epoch": 2.54, + "grad_norm": 0.5691616535186768, + "learning_rate": 5.7172052973922475e-06, + "loss": 0.0561, + "step": 1436 + }, + { + "epoch": 2.54, + "grad_norm": 0.5357887148857117, + "learning_rate": 5.673995230044499e-06, + "loss": 0.0281, + "step": 1437 + }, + { + "epoch": 2.54, + "grad_norm": 0.02342977188527584, + "learning_rate": 5.6309392469996946e-06, + "loss": 0.0021, + "step": 1438 + }, + { + "epoch": 2.55, + "grad_norm": 0.8080065250396729, + "learning_rate": 5.58803749792734e-06, + "loss": 0.055, + "step": 1439 + }, + { + "epoch": 2.55, + "grad_norm": 0.04309003800153732, + "learning_rate": 5.5452901319607894e-06, + "loss": 0.0027, + "step": 1440 + }, + { + "epoch": 2.55, + "grad_norm": 0.07680585235357285, + "learning_rate": 5.502697297696746e-06, + "loss": 0.0036, + "step": 1441 + }, + { + "epoch": 2.55, + "grad_norm": 0.14345233142375946, + "learning_rate": 5.460259143194751e-06, + "loss": 0.006, + "step": 1442 + }, + { + "epoch": 2.55, + "grad_norm": 0.032224975526332855, + "learning_rate": 5.417975815976628e-06, + "loss": 0.0022, + "step": 1443 + }, + { + "epoch": 2.55, + "grad_norm": 0.20706388354301453, + "learning_rate": 5.3758474630259905e-06, + "loss": 0.0102, + "step": 1444 + }, + { + "epoch": 2.56, + "grad_norm": 0.0683453157544136, + "learning_rate": 5.333874230787772e-06, + "loss": 0.0023, + "step": 1445 + }, + { + "epoch": 2.56, + "grad_norm": 0.54808109998703, + "learning_rate": 5.2920562651676445e-06, + "loss": 0.0293, + "step": 1446 + }, + { + "epoch": 2.56, + "grad_norm": 0.3387494385242462, + "learning_rate": 5.250393711531581e-06, + "loss": 0.0095, + "step": 1447 + }, + { + "epoch": 2.56, + "grad_norm": 0.31395724415779114, + "learning_rate": 5.208886714705291e-06, + "loss": 0.0257, + "step": 1448 + }, + { + "epoch": 2.56, + "grad_norm": 0.23459285497665405, + "learning_rate": 5.16753541897374e-06, + "loss": 0.009, + "step": 1449 + }, + { + "epoch": 2.57, + "grad_norm": 0.36793455481529236, + "learning_rate": 5.126339968080696e-06, + "loss": 0.0114, + "step": 1450 + }, + { + "epoch": 2.57, + "grad_norm": 0.5326592326164246, + "learning_rate": 5.0853005052281245e-06, + "loss": 0.0173, + "step": 1451 + }, + { + "epoch": 2.57, + "grad_norm": 0.18404988944530487, + "learning_rate": 5.044417173075805e-06, + "loss": 0.0079, + "step": 1452 + }, + { + "epoch": 2.57, + "grad_norm": 0.20175425708293915, + "learning_rate": 5.00369011374075e-06, + "loss": 0.0072, + "step": 1453 + }, + { + "epoch": 2.57, + "grad_norm": 0.16514746844768524, + "learning_rate": 4.963119468796739e-06, + "loss": 0.0086, + "step": 1454 + }, + { + "epoch": 2.57, + "grad_norm": 0.008991259150207043, + "learning_rate": 4.922705379273862e-06, + "loss": 0.0009, + "step": 1455 + }, + { + "epoch": 2.58, + "grad_norm": 0.12480378895998001, + "learning_rate": 4.8824479856579565e-06, + "loss": 0.0061, + "step": 1456 + }, + { + "epoch": 2.58, + "grad_norm": 0.2886410355567932, + "learning_rate": 4.842347427890198e-06, + "loss": 0.0116, + "step": 1457 + }, + { + "epoch": 2.58, + "grad_norm": 0.19005171954631805, + "learning_rate": 4.802403845366555e-06, + "loss": 0.0077, + "step": 1458 + }, + { + "epoch": 2.58, + "grad_norm": 0.8882156014442444, + "learning_rate": 4.762617376937312e-06, + "loss": 0.0341, + "step": 1459 + }, + { + "epoch": 2.58, + "grad_norm": 0.25773587822914124, + "learning_rate": 4.722988160906638e-06, + "loss": 0.0121, + "step": 1460 + }, + { + "epoch": 2.58, + "grad_norm": 0.0117575041949749, + "learning_rate": 4.683516335032018e-06, + "loss": 0.0007, + "step": 1461 + }, + { + "epoch": 2.59, + "grad_norm": 0.39547544717788696, + "learning_rate": 4.644202036523881e-06, + "loss": 0.0391, + "step": 1462 + }, + { + "epoch": 2.59, + "grad_norm": 0.35935065150260925, + "learning_rate": 4.605045402045022e-06, + "loss": 0.0184, + "step": 1463 + }, + { + "epoch": 2.59, + "grad_norm": 0.021274056285619736, + "learning_rate": 4.566046567710169e-06, + "loss": 0.0021, + "step": 1464 + }, + { + "epoch": 2.59, + "grad_norm": 0.18877848982810974, + "learning_rate": 4.527205669085549e-06, + "loss": 0.0094, + "step": 1465 + }, + { + "epoch": 2.59, + "grad_norm": 0.05266476795077324, + "learning_rate": 4.4885228411883366e-06, + "loss": 0.0031, + "step": 1466 + }, + { + "epoch": 2.6, + "grad_norm": 0.7220010757446289, + "learning_rate": 4.449998218486262e-06, + "loss": 0.0674, + "step": 1467 + }, + { + "epoch": 2.6, + "grad_norm": 0.31085872650146484, + "learning_rate": 4.411631934897093e-06, + "loss": 0.0191, + "step": 1468 + }, + { + "epoch": 2.6, + "grad_norm": 0.16587883234024048, + "learning_rate": 4.3734241237881666e-06, + "loss": 0.0073, + "step": 1469 + }, + { + "epoch": 2.6, + "grad_norm": 0.019949564710259438, + "learning_rate": 4.335374917975981e-06, + "loss": 0.002, + "step": 1470 + }, + { + "epoch": 2.6, + "grad_norm": 0.407232403755188, + "learning_rate": 4.297484449725692e-06, + "loss": 0.0125, + "step": 1471 + }, + { + "epoch": 2.6, + "grad_norm": 0.2886686325073242, + "learning_rate": 4.259752850750609e-06, + "loss": 0.0056, + "step": 1472 + }, + { + "epoch": 2.61, + "grad_norm": 0.07728396356105804, + "learning_rate": 4.222180252211849e-06, + "loss": 0.0045, + "step": 1473 + }, + { + "epoch": 2.61, + "grad_norm": 0.15368296205997467, + "learning_rate": 4.184766784717775e-06, + "loss": 0.0057, + "step": 1474 + }, + { + "epoch": 2.61, + "grad_norm": 0.01975841447710991, + "learning_rate": 4.147512578323614e-06, + "loss": 0.0015, + "step": 1475 + }, + { + "epoch": 2.61, + "grad_norm": 0.07437577843666077, + "learning_rate": 4.110417762530977e-06, + "loss": 0.0054, + "step": 1476 + }, + { + "epoch": 2.61, + "grad_norm": 0.17380701005458832, + "learning_rate": 4.073482466287359e-06, + "loss": 0.0078, + "step": 1477 + }, + { + "epoch": 2.61, + "grad_norm": 0.029353514313697815, + "learning_rate": 4.036706817985803e-06, + "loss": 0.0018, + "step": 1478 + }, + { + "epoch": 2.62, + "grad_norm": 0.368076354265213, + "learning_rate": 4.000090945464341e-06, + "loss": 0.019, + "step": 1479 + }, + { + "epoch": 2.62, + "grad_norm": 0.5591785907745361, + "learning_rate": 3.963634976005642e-06, + "loss": 0.0304, + "step": 1480 + }, + { + "epoch": 2.62, + "grad_norm": 0.041655898094177246, + "learning_rate": 3.927339036336486e-06, + "loss": 0.0014, + "step": 1481 + }, + { + "epoch": 2.62, + "grad_norm": 0.06396479904651642, + "learning_rate": 3.891203252627384e-06, + "loss": 0.0027, + "step": 1482 + }, + { + "epoch": 2.62, + "grad_norm": 0.649625837802887, + "learning_rate": 3.8552277504921185e-06, + "loss": 0.0675, + "step": 1483 + }, + { + "epoch": 2.63, + "grad_norm": 0.008496335707604885, + "learning_rate": 3.819412654987314e-06, + "loss": 0.0008, + "step": 1484 + }, + { + "epoch": 2.63, + "grad_norm": 0.36506280303001404, + "learning_rate": 3.783758090611983e-06, + "loss": 0.0136, + "step": 1485 + }, + { + "epoch": 2.63, + "grad_norm": 0.08308514952659607, + "learning_rate": 3.7482641813071097e-06, + "loss": 0.003, + "step": 1486 + }, + { + "epoch": 2.63, + "grad_norm": 0.03563800826668739, + "learning_rate": 3.712931050455204e-06, + "loss": 0.0021, + "step": 1487 + }, + { + "epoch": 2.63, + "grad_norm": 0.2231430858373642, + "learning_rate": 3.6777588208799117e-06, + "loss": 0.0046, + "step": 1488 + }, + { + "epoch": 2.63, + "grad_norm": 0.4931589365005493, + "learning_rate": 3.6427476148455485e-06, + "loss": 0.0137, + "step": 1489 + }, + { + "epoch": 2.64, + "grad_norm": 0.18545089662075043, + "learning_rate": 3.6078975540566716e-06, + "loss": 0.0084, + "step": 1490 + }, + { + "epoch": 2.64, + "grad_norm": 0.3389565646648407, + "learning_rate": 3.573208759657687e-06, + "loss": 0.0621, + "step": 1491 + }, + { + "epoch": 2.64, + "grad_norm": 0.5997775793075562, + "learning_rate": 3.5386813522324036e-06, + "loss": 0.0247, + "step": 1492 + }, + { + "epoch": 2.64, + "grad_norm": 0.08146101236343384, + "learning_rate": 3.5043154518036304e-06, + "loss": 0.0033, + "step": 1493 + }, + { + "epoch": 2.64, + "grad_norm": 0.016428053379058838, + "learning_rate": 3.4701111778327577e-06, + "loss": 0.0011, + "step": 1494 + }, + { + "epoch": 2.64, + "grad_norm": 0.0878281444311142, + "learning_rate": 3.436068649219326e-06, + "loss": 0.0051, + "step": 1495 + }, + { + "epoch": 2.65, + "grad_norm": 0.07716470211744308, + "learning_rate": 3.4021879843006144e-06, + "loss": 0.0038, + "step": 1496 + }, + { + "epoch": 2.65, + "grad_norm": 0.0328107550740242, + "learning_rate": 3.3684693008512626e-06, + "loss": 0.0014, + "step": 1497 + }, + { + "epoch": 2.65, + "grad_norm": 0.3223980665206909, + "learning_rate": 3.334912716082811e-06, + "loss": 0.0125, + "step": 1498 + }, + { + "epoch": 2.65, + "grad_norm": 0.03705433011054993, + "learning_rate": 3.3015183466433397e-06, + "loss": 0.0019, + "step": 1499 + }, + { + "epoch": 2.65, + "grad_norm": 0.6076542735099792, + "learning_rate": 3.268286308617041e-06, + "loss": 0.1055, + "step": 1500 + }, + { + "epoch": 2.66, + "grad_norm": 0.2399713695049286, + "learning_rate": 3.235216717523787e-06, + "loss": 0.0093, + "step": 1501 + }, + { + "epoch": 2.66, + "grad_norm": 0.34776830673217773, + "learning_rate": 3.2023096883187997e-06, + "loss": 0.0096, + "step": 1502 + }, + { + "epoch": 2.66, + "grad_norm": 0.46878987550735474, + "learning_rate": 3.169565335392183e-06, + "loss": 0.0262, + "step": 1503 + }, + { + "epoch": 2.66, + "grad_norm": 0.26711776852607727, + "learning_rate": 3.136983772568569e-06, + "loss": 0.0074, + "step": 1504 + }, + { + "epoch": 2.66, + "grad_norm": 0.03126642853021622, + "learning_rate": 3.1045651131066887e-06, + "loss": 0.0017, + "step": 1505 + }, + { + "epoch": 2.66, + "grad_norm": 0.2915371060371399, + "learning_rate": 3.072309469699003e-06, + "loss": 0.0191, + "step": 1506 + }, + { + "epoch": 2.67, + "grad_norm": 0.04384929686784744, + "learning_rate": 3.0402169544713086e-06, + "loss": 0.0022, + "step": 1507 + }, + { + "epoch": 2.67, + "grad_norm": 0.48072144389152527, + "learning_rate": 3.0082876789823245e-06, + "loss": 0.0169, + "step": 1508 + }, + { + "epoch": 2.67, + "grad_norm": 0.2749837338924408, + "learning_rate": 2.9765217542233437e-06, + "loss": 0.0103, + "step": 1509 + }, + { + "epoch": 2.67, + "grad_norm": 0.02487136982381344, + "learning_rate": 2.94491929061782e-06, + "loss": 0.0016, + "step": 1510 + }, + { + "epoch": 2.67, + "grad_norm": 0.054373327642679214, + "learning_rate": 2.9134803980209734e-06, + "loss": 0.0028, + "step": 1511 + }, + { + "epoch": 2.67, + "grad_norm": 0.07870666682720184, + "learning_rate": 2.88220518571945e-06, + "loss": 0.0044, + "step": 1512 + }, + { + "epoch": 2.68, + "grad_norm": 0.17833536863327026, + "learning_rate": 2.851093762430895e-06, + "loss": 0.0046, + "step": 1513 + }, + { + "epoch": 2.68, + "grad_norm": 0.07121768593788147, + "learning_rate": 2.8201462363036112e-06, + "loss": 0.0042, + "step": 1514 + }, + { + "epoch": 2.68, + "grad_norm": 0.2510574162006378, + "learning_rate": 2.7893627149161716e-06, + "loss": 0.0102, + "step": 1515 + }, + { + "epoch": 2.68, + "grad_norm": 0.11992470175027847, + "learning_rate": 2.7587433052770118e-06, + "loss": 0.0044, + "step": 1516 + }, + { + "epoch": 2.68, + "grad_norm": 0.2543796896934509, + "learning_rate": 2.72828811382414e-06, + "loss": 0.0058, + "step": 1517 + }, + { + "epoch": 2.69, + "grad_norm": 0.12545326352119446, + "learning_rate": 2.6979972464246606e-06, + "loss": 0.0058, + "step": 1518 + }, + { + "epoch": 2.69, + "grad_norm": 0.06670302897691727, + "learning_rate": 2.667870808374506e-06, + "loss": 0.0026, + "step": 1519 + }, + { + "epoch": 2.69, + "grad_norm": 0.39533159136772156, + "learning_rate": 2.6379089043980067e-06, + "loss": 0.038, + "step": 1520 + }, + { + "epoch": 2.69, + "grad_norm": 0.3432491719722748, + "learning_rate": 2.6081116386475314e-06, + "loss": 0.0173, + "step": 1521 + }, + { + "epoch": 2.69, + "grad_norm": 0.029475800693035126, + "learning_rate": 2.5784791147031638e-06, + "loss": 0.0027, + "step": 1522 + }, + { + "epoch": 2.69, + "grad_norm": 0.16476236283779144, + "learning_rate": 2.5490114355723293e-06, + "loss": 0.0061, + "step": 1523 + }, + { + "epoch": 2.7, + "grad_norm": 0.26185595989227295, + "learning_rate": 2.5197087036893774e-06, + "loss": 0.0102, + "step": 1524 + }, + { + "epoch": 2.7, + "grad_norm": 0.12650197744369507, + "learning_rate": 2.4905710209153223e-06, + "loss": 0.0039, + "step": 1525 + }, + { + "epoch": 2.7, + "grad_norm": 0.6523202657699585, + "learning_rate": 2.4615984885374143e-06, + "loss": 0.02, + "step": 1526 + }, + { + "epoch": 2.7, + "grad_norm": 0.09152957051992416, + "learning_rate": 2.4327912072688307e-06, + "loss": 0.0022, + "step": 1527 + }, + { + "epoch": 2.7, + "grad_norm": 0.5130406022071838, + "learning_rate": 2.404149277248313e-06, + "loss": 0.0369, + "step": 1528 + }, + { + "epoch": 2.7, + "grad_norm": 0.2927793860435486, + "learning_rate": 2.3756727980397743e-06, + "loss": 0.0083, + "step": 1529 + }, + { + "epoch": 2.71, + "grad_norm": 0.38869479298591614, + "learning_rate": 2.3473618686320474e-06, + "loss": 0.0387, + "step": 1530 + }, + { + "epoch": 2.71, + "grad_norm": 0.14148685336112976, + "learning_rate": 2.3192165874384555e-06, + "loss": 0.006, + "step": 1531 + }, + { + "epoch": 2.71, + "grad_norm": 0.4193699359893799, + "learning_rate": 2.291237052296513e-06, + "loss": 0.0144, + "step": 1532 + }, + { + "epoch": 2.71, + "grad_norm": 0.08705022931098938, + "learning_rate": 2.2634233604675815e-06, + "loss": 0.0054, + "step": 1533 + }, + { + "epoch": 2.71, + "grad_norm": 0.5417605638504028, + "learning_rate": 2.235775608636492e-06, + "loss": 0.0136, + "step": 1534 + }, + { + "epoch": 2.72, + "grad_norm": 0.15281569957733154, + "learning_rate": 2.208293892911284e-06, + "loss": 0.0065, + "step": 1535 + }, + { + "epoch": 2.72, + "grad_norm": 0.1463097184896469, + "learning_rate": 2.180978308822812e-06, + "loss": 0.0058, + "step": 1536 + }, + { + "epoch": 2.72, + "grad_norm": 0.10716543346643448, + "learning_rate": 2.1538289513244214e-06, + "loss": 0.0033, + "step": 1537 + }, + { + "epoch": 2.72, + "grad_norm": 0.815475583076477, + "learning_rate": 2.1268459147916307e-06, + "loss": 0.0796, + "step": 1538 + }, + { + "epoch": 2.72, + "grad_norm": 0.04941270127892494, + "learning_rate": 2.100029293021799e-06, + "loss": 0.0022, + "step": 1539 + }, + { + "epoch": 2.72, + "grad_norm": 0.05597170069813728, + "learning_rate": 2.07337917923382e-06, + "loss": 0.003, + "step": 1540 + }, + { + "epoch": 2.73, + "grad_norm": 0.48878607153892517, + "learning_rate": 2.046895666067755e-06, + "loss": 0.0273, + "step": 1541 + }, + { + "epoch": 2.73, + "grad_norm": 0.04340330511331558, + "learning_rate": 2.0205788455845476e-06, + "loss": 0.0025, + "step": 1542 + }, + { + "epoch": 2.73, + "grad_norm": 0.034424763172864914, + "learning_rate": 1.994428809265686e-06, + "loss": 0.0023, + "step": 1543 + }, + { + "epoch": 2.73, + "grad_norm": 0.4512332081794739, + "learning_rate": 1.9684456480128843e-06, + "loss": 0.0714, + "step": 1544 + }, + { + "epoch": 2.73, + "grad_norm": 0.8032733201980591, + "learning_rate": 1.9426294521477873e-06, + "loss": 0.0364, + "step": 1545 + }, + { + "epoch": 2.74, + "grad_norm": 0.23999914526939392, + "learning_rate": 1.916980311411631e-06, + "loss": 0.0106, + "step": 1546 + }, + { + "epoch": 2.74, + "grad_norm": 0.005880038253962994, + "learning_rate": 1.8914983149649512e-06, + "loss": 0.0005, + "step": 1547 + }, + { + "epoch": 2.74, + "grad_norm": 0.3052350878715515, + "learning_rate": 1.866183551387235e-06, + "loss": 0.0624, + "step": 1548 + }, + { + "epoch": 2.74, + "grad_norm": 0.33747848868370056, + "learning_rate": 1.8410361086766804e-06, + "loss": 0.0103, + "step": 1549 + }, + { + "epoch": 2.74, + "grad_norm": 0.43580949306488037, + "learning_rate": 1.8160560742498222e-06, + "loss": 0.0187, + "step": 1550 + }, + { + "epoch": 2.74, + "grad_norm": 0.6287872195243835, + "learning_rate": 1.7912435349412726e-06, + "loss": 0.1101, + "step": 1551 + }, + { + "epoch": 2.75, + "grad_norm": 0.03436359018087387, + "learning_rate": 1.7665985770033976e-06, + "loss": 0.0026, + "step": 1552 + }, + { + "epoch": 2.75, + "grad_norm": 0.031709544360637665, + "learning_rate": 1.7421212861060133e-06, + "loss": 0.0018, + "step": 1553 + }, + { + "epoch": 2.75, + "grad_norm": 0.35390913486480713, + "learning_rate": 1.7178117473361288e-06, + "loss": 0.0095, + "step": 1554 + }, + { + "epoch": 2.75, + "grad_norm": 0.14006419479846954, + "learning_rate": 1.6936700451975817e-06, + "loss": 0.006, + "step": 1555 + }, + { + "epoch": 2.75, + "grad_norm": 0.5810758471488953, + "learning_rate": 1.669696263610815e-06, + "loss": 0.0538, + "step": 1556 + }, + { + "epoch": 2.75, + "grad_norm": 0.026199545711278915, + "learning_rate": 1.6458904859125324e-06, + "loss": 0.0015, + "step": 1557 + }, + { + "epoch": 2.76, + "grad_norm": 0.5300063490867615, + "learning_rate": 1.6222527948554334e-06, + "loss": 0.1131, + "step": 1558 + }, + { + "epoch": 2.76, + "grad_norm": 0.038241319358348846, + "learning_rate": 1.5987832726079343e-06, + "loss": 0.0019, + "step": 1559 + }, + { + "epoch": 2.76, + "grad_norm": 0.25658735632896423, + "learning_rate": 1.5754820007538474e-06, + "loss": 0.0068, + "step": 1560 + }, + { + "epoch": 2.76, + "grad_norm": 0.33151376247406006, + "learning_rate": 1.5523490602921353e-06, + "loss": 0.0099, + "step": 1561 + }, + { + "epoch": 2.76, + "grad_norm": 0.07021140307188034, + "learning_rate": 1.5293845316366183e-06, + "loss": 0.0028, + "step": 1562 + }, + { + "epoch": 2.76, + "eval_loss": 0.0996757373213768, + "eval_runtime": 14.6905, + "eval_samples_per_second": 32.47, + "eval_steps_per_second": 8.169, + "step": 1562 + }, + { + "epoch": 2.77, + "grad_norm": 0.4714253544807434, + "learning_rate": 1.5065884946156683e-06, + "loss": 0.0353, + "step": 1563 + }, + { + "epoch": 2.77, + "grad_norm": 0.04558965936303139, + "learning_rate": 1.4839610284719752e-06, + "loss": 0.0028, + "step": 1564 + }, + { + "epoch": 2.77, + "grad_norm": 0.3195928931236267, + "learning_rate": 1.4615022118622367e-06, + "loss": 0.0107, + "step": 1565 + }, + { + "epoch": 2.77, + "grad_norm": 0.10188216716051102, + "learning_rate": 1.4392121228569088e-06, + "loss": 0.0039, + "step": 1566 + }, + { + "epoch": 2.77, + "grad_norm": 0.3658134639263153, + "learning_rate": 1.4170908389399107e-06, + "loss": 0.0177, + "step": 1567 + }, + { + "epoch": 2.77, + "grad_norm": 0.4253825843334198, + "learning_rate": 1.3951384370083697e-06, + "loss": 0.0468, + "step": 1568 + }, + { + "epoch": 2.78, + "grad_norm": 0.3173588812351227, + "learning_rate": 1.3733549933723666e-06, + "loss": 0.0382, + "step": 1569 + }, + { + "epoch": 2.78, + "grad_norm": 0.423880934715271, + "learning_rate": 1.3517405837546403e-06, + "loss": 0.024, + "step": 1570 + }, + { + "epoch": 2.78, + "grad_norm": 0.12553349137306213, + "learning_rate": 1.3302952832903393e-06, + "loss": 0.0055, + "step": 1571 + }, + { + "epoch": 2.78, + "grad_norm": 0.2536146640777588, + "learning_rate": 1.3090191665267814e-06, + "loss": 0.009, + "step": 1572 + }, + { + "epoch": 2.78, + "grad_norm": 0.8198730945587158, + "learning_rate": 1.2879123074231502e-06, + "loss": 0.0686, + "step": 1573 + }, + { + "epoch": 2.78, + "grad_norm": 0.15339438617229462, + "learning_rate": 1.2669747793502828e-06, + "loss": 0.0071, + "step": 1574 + }, + { + "epoch": 2.79, + "grad_norm": 0.5873886942863464, + "learning_rate": 1.2462066550903817e-06, + "loss": 0.0587, + "step": 1575 + }, + { + "epoch": 2.79, + "grad_norm": 0.30819711089134216, + "learning_rate": 1.2256080068367758e-06, + "loss": 0.0544, + "step": 1576 + }, + { + "epoch": 2.79, + "grad_norm": 0.1868467926979065, + "learning_rate": 1.2051789061936714e-06, + "loss": 0.0848, + "step": 1577 + }, + { + "epoch": 2.79, + "grad_norm": 0.04845046252012253, + "learning_rate": 1.184919424175901e-06, + "loss": 0.0031, + "step": 1578 + }, + { + "epoch": 2.79, + "grad_norm": 0.14416372776031494, + "learning_rate": 1.164829631208675e-06, + "loss": 0.0042, + "step": 1579 + }, + { + "epoch": 2.8, + "grad_norm": 0.5273883938789368, + "learning_rate": 1.1449095971273305e-06, + "loss": 0.0232, + "step": 1580 + }, + { + "epoch": 2.8, + "grad_norm": 0.039162758737802505, + "learning_rate": 1.1251593911771053e-06, + "loss": 0.0027, + "step": 1581 + }, + { + "epoch": 2.8, + "grad_norm": 0.12024156749248505, + "learning_rate": 1.105579082012892e-06, + "loss": 0.0073, + "step": 1582 + }, + { + "epoch": 2.8, + "grad_norm": 0.03360385447740555, + "learning_rate": 1.086168737698967e-06, + "loss": 0.0016, + "step": 1583 + }, + { + "epoch": 2.8, + "grad_norm": 0.21495912969112396, + "learning_rate": 1.0669284257088185e-06, + "loss": 0.0083, + "step": 1584 + }, + { + "epoch": 2.8, + "grad_norm": 0.04339810088276863, + "learning_rate": 1.0478582129248515e-06, + "loss": 0.0029, + "step": 1585 + }, + { + "epoch": 2.81, + "grad_norm": 0.5246281623840332, + "learning_rate": 1.0289581656381775e-06, + "loss": 0.0154, + "step": 1586 + }, + { + "epoch": 2.81, + "grad_norm": 0.017450956627726555, + "learning_rate": 1.0102283495483978e-06, + "loss": 0.0015, + "step": 1587 + }, + { + "epoch": 2.81, + "grad_norm": 0.045874420553445816, + "learning_rate": 9.916688297633648e-07, + "loss": 0.003, + "step": 1588 + }, + { + "epoch": 2.81, + "grad_norm": 0.621476411819458, + "learning_rate": 9.732796707989378e-07, + "loss": 0.0435, + "step": 1589 + }, + { + "epoch": 2.81, + "grad_norm": 0.09277229011058807, + "learning_rate": 9.550609365787888e-07, + "loss": 0.004, + "step": 1590 + }, + { + "epoch": 2.81, + "grad_norm": 0.22077274322509766, + "learning_rate": 9.37012690434147e-07, + "loss": 0.0138, + "step": 1591 + }, + { + "epoch": 2.82, + "grad_norm": 0.029292916879057884, + "learning_rate": 9.191349951036266e-07, + "loss": 0.002, + "step": 1592 + }, + { + "epoch": 2.82, + "grad_norm": 0.12496867030858994, + "learning_rate": 9.014279127329606e-07, + "loss": 0.004, + "step": 1593 + }, + { + "epoch": 2.82, + "grad_norm": 0.14724105596542358, + "learning_rate": 8.838915048748064e-07, + "loss": 0.005, + "step": 1594 + }, + { + "epoch": 2.82, + "grad_norm": 0.48114821314811707, + "learning_rate": 8.66525832488535e-07, + "loss": 0.0225, + "step": 1595 + }, + { + "epoch": 2.82, + "grad_norm": 0.05105943605303764, + "learning_rate": 8.493309559399976e-07, + "loss": 0.0031, + "step": 1596 + }, + { + "epoch": 2.83, + "grad_norm": 0.04393310844898224, + "learning_rate": 8.32306935001348e-07, + "loss": 0.0032, + "step": 1597 + }, + { + "epoch": 2.83, + "grad_norm": 0.07127777487039566, + "learning_rate": 8.154538288508206e-07, + "loss": 0.0027, + "step": 1598 + }, + { + "epoch": 2.83, + "grad_norm": 0.1620861142873764, + "learning_rate": 7.987716960725145e-07, + "loss": 0.005, + "step": 1599 + }, + { + "epoch": 2.83, + "grad_norm": 0.45601749420166016, + "learning_rate": 7.822605946561923e-07, + "loss": 0.0424, + "step": 1600 + }, + { + "epoch": 2.83, + "grad_norm": 0.04331004619598389, + "learning_rate": 7.659205819970927e-07, + "loss": 0.0023, + "step": 1601 + }, + { + "epoch": 2.83, + "grad_norm": 0.06400591880083084, + "learning_rate": 7.497517148957245e-07, + "loss": 0.0029, + "step": 1602 + }, + { + "epoch": 2.84, + "grad_norm": 0.12373822182416916, + "learning_rate": 7.337540495576667e-07, + "loss": 0.0044, + "step": 1603 + }, + { + "epoch": 2.84, + "grad_norm": 0.18355102837085724, + "learning_rate": 7.179276415933634e-07, + "loss": 0.0065, + "step": 1604 + }, + { + "epoch": 2.84, + "grad_norm": 0.07238329201936722, + "learning_rate": 7.022725460179457e-07, + "loss": 0.0033, + "step": 1605 + }, + { + "epoch": 2.84, + "grad_norm": 0.4453037977218628, + "learning_rate": 6.86788817251044e-07, + "loss": 0.0274, + "step": 1606 + }, + { + "epoch": 2.84, + "grad_norm": 0.28667303919792175, + "learning_rate": 6.714765091165809e-07, + "loss": 0.0198, + "step": 1607 + }, + { + "epoch": 2.84, + "grad_norm": 0.16724398732185364, + "learning_rate": 6.563356748425953e-07, + "loss": 0.0038, + "step": 1608 + }, + { + "epoch": 2.85, + "grad_norm": 0.428133100271225, + "learning_rate": 6.413663670610525e-07, + "loss": 0.0107, + "step": 1609 + }, + { + "epoch": 2.85, + "grad_norm": 0.17091481387615204, + "learning_rate": 6.265686378076729e-07, + "loss": 0.0085, + "step": 1610 + }, + { + "epoch": 2.85, + "grad_norm": 0.25742030143737793, + "learning_rate": 6.119425385217315e-07, + "loss": 0.0076, + "step": 1611 + }, + { + "epoch": 2.85, + "grad_norm": 0.02969309687614441, + "learning_rate": 5.974881200459026e-07, + "loss": 0.0021, + "step": 1612 + }, + { + "epoch": 2.85, + "grad_norm": 0.7654445767402649, + "learning_rate": 5.832054326260606e-07, + "loss": 0.0272, + "step": 1613 + }, + { + "epoch": 2.86, + "grad_norm": 0.030291326344013214, + "learning_rate": 5.69094525911118e-07, + "loss": 0.0011, + "step": 1614 + }, + { + "epoch": 2.86, + "grad_norm": 0.038292497396469116, + "learning_rate": 5.551554489528432e-07, + "loss": 0.0017, + "step": 1615 + }, + { + "epoch": 2.86, + "grad_norm": 0.5287438035011292, + "learning_rate": 5.413882502057155e-07, + "loss": 0.047, + "step": 1616 + }, + { + "epoch": 2.86, + "grad_norm": 0.10966096073389053, + "learning_rate": 5.277929775267143e-07, + "loss": 0.0045, + "step": 1617 + }, + { + "epoch": 2.86, + "grad_norm": 0.35922273993492126, + "learning_rate": 5.143696781751972e-07, + "loss": 0.0226, + "step": 1618 + }, + { + "epoch": 2.86, + "grad_norm": 0.45295917987823486, + "learning_rate": 5.011183988127055e-07, + "loss": 0.0161, + "step": 1619 + }, + { + "epoch": 2.87, + "grad_norm": 0.5588263273239136, + "learning_rate": 4.880391855028088e-07, + "loss": 0.1318, + "step": 1620 + }, + { + "epoch": 2.87, + "grad_norm": 0.6208536624908447, + "learning_rate": 4.751320837109552e-07, + "loss": 0.0286, + "step": 1621 + }, + { + "epoch": 2.87, + "grad_norm": 0.7441438436508179, + "learning_rate": 4.623971383042935e-07, + "loss": 0.0583, + "step": 1622 + }, + { + "epoch": 2.87, + "grad_norm": 0.3858100473880768, + "learning_rate": 4.498343935515348e-07, + "loss": 0.0374, + "step": 1623 + }, + { + "epoch": 2.87, + "grad_norm": 0.7004556059837341, + "learning_rate": 4.374438931228075e-07, + "loss": 0.1056, + "step": 1624 + }, + { + "epoch": 2.87, + "grad_norm": 0.24075275659561157, + "learning_rate": 4.252256800894694e-07, + "loss": 0.0055, + "step": 1625 + }, + { + "epoch": 2.88, + "grad_norm": 0.14794984459877014, + "learning_rate": 4.1317979692399034e-07, + "loss": 0.0077, + "step": 1626 + }, + { + "epoch": 2.88, + "grad_norm": 0.5345852971076965, + "learning_rate": 4.0130628549980286e-07, + "loss": 0.0193, + "step": 1627 + }, + { + "epoch": 2.88, + "grad_norm": 0.045396577566862106, + "learning_rate": 3.896051870911188e-07, + "loss": 0.0035, + "step": 1628 + }, + { + "epoch": 2.88, + "grad_norm": 0.1260155439376831, + "learning_rate": 3.7807654237284607e-07, + "loss": 0.0055, + "step": 1629 + }, + { + "epoch": 2.88, + "grad_norm": 0.07393728196620941, + "learning_rate": 3.667203914203943e-07, + "loss": 0.0046, + "step": 1630 + }, + { + "epoch": 2.89, + "grad_norm": 0.3849566876888275, + "learning_rate": 3.555367737095749e-07, + "loss": 0.0093, + "step": 1631 + }, + { + "epoch": 2.89, + "grad_norm": 0.6450923681259155, + "learning_rate": 3.445257281164349e-07, + "loss": 0.0375, + "step": 1632 + }, + { + "epoch": 2.89, + "grad_norm": 0.1818339079618454, + "learning_rate": 3.336872929171286e-07, + "loss": 0.0072, + "step": 1633 + }, + { + "epoch": 2.89, + "grad_norm": 0.04328285530209541, + "learning_rate": 3.230215057878017e-07, + "loss": 0.0019, + "step": 1634 + }, + { + "epoch": 2.89, + "grad_norm": 0.07304265350103378, + "learning_rate": 3.1252840380444073e-07, + "loss": 0.0032, + "step": 1635 + }, + { + "epoch": 2.89, + "grad_norm": 0.38675254583358765, + "learning_rate": 3.0220802344275157e-07, + "loss": 0.0082, + "step": 1636 + }, + { + "epoch": 2.9, + "grad_norm": 0.41091272234916687, + "learning_rate": 2.920604005780259e-07, + "loss": 0.0573, + "step": 1637 + }, + { + "epoch": 2.9, + "grad_norm": 0.024869756773114204, + "learning_rate": 2.820855704850356e-07, + "loss": 0.0017, + "step": 1638 + }, + { + "epoch": 2.9, + "grad_norm": 0.1491108238697052, + "learning_rate": 2.7228356783788877e-07, + "loss": 0.0083, + "step": 1639 + }, + { + "epoch": 2.9, + "grad_norm": 0.09827906638383865, + "learning_rate": 2.626544267099129e-07, + "loss": 0.0043, + "step": 1640 + }, + { + "epoch": 2.9, + "grad_norm": 0.04465942829847336, + "learning_rate": 2.531981805735606e-07, + "loss": 0.0025, + "step": 1641 + }, + { + "epoch": 2.9, + "grad_norm": 0.05105120688676834, + "learning_rate": 2.43914862300243e-07, + "loss": 0.0027, + "step": 1642 + }, + { + "epoch": 2.91, + "grad_norm": 0.0723058357834816, + "learning_rate": 2.3480450416027422e-07, + "loss": 0.0025, + "step": 1643 + }, + { + "epoch": 2.91, + "grad_norm": 0.24974624812602997, + "learning_rate": 2.2586713782272174e-07, + "loss": 0.007, + "step": 1644 + }, + { + "epoch": 2.91, + "grad_norm": 0.09947193413972855, + "learning_rate": 2.1710279435530057e-07, + "loss": 0.0056, + "step": 1645 + }, + { + "epoch": 2.91, + "grad_norm": 0.47081825137138367, + "learning_rate": 2.085115042242791e-07, + "loss": 0.0227, + "step": 1646 + }, + { + "epoch": 2.91, + "grad_norm": 0.3067363202571869, + "learning_rate": 2.0009329729435145e-07, + "loss": 0.0145, + "step": 1647 + }, + { + "epoch": 2.92, + "grad_norm": 0.2780928909778595, + "learning_rate": 1.9184820282855953e-07, + "loss": 0.0176, + "step": 1648 + }, + { + "epoch": 2.92, + "grad_norm": 0.7477846145629883, + "learning_rate": 1.8377624948817674e-07, + "loss": 0.0167, + "step": 1649 + }, + { + "epoch": 2.92, + "grad_norm": 0.5249376893043518, + "learning_rate": 1.7587746533260784e-07, + "loss": 0.0176, + "step": 1650 + }, + { + "epoch": 2.92, + "grad_norm": 0.3156394958496094, + "learning_rate": 1.6815187781928922e-07, + "loss": 0.0162, + "step": 1651 + }, + { + "epoch": 2.92, + "grad_norm": 0.033656030893325806, + "learning_rate": 1.6059951380359983e-07, + "loss": 0.0025, + "step": 1652 + }, + { + "epoch": 2.92, + "grad_norm": 0.32112687826156616, + "learning_rate": 1.5322039953878374e-07, + "loss": 0.015, + "step": 1653 + }, + { + "epoch": 2.93, + "grad_norm": 0.09272444248199463, + "learning_rate": 1.4601456067580565e-07, + "loss": 0.0059, + "step": 1654 + }, + { + "epoch": 2.93, + "grad_norm": 0.4658186137676239, + "learning_rate": 1.3898202226333423e-07, + "loss": 0.0445, + "step": 1655 + }, + { + "epoch": 2.93, + "grad_norm": 0.07326341420412064, + "learning_rate": 1.321228087475923e-07, + "loss": 0.0031, + "step": 1656 + }, + { + "epoch": 2.93, + "grad_norm": 0.05622444674372673, + "learning_rate": 1.2543694397230686e-07, + "loss": 0.0023, + "step": 1657 + }, + { + "epoch": 2.93, + "grad_norm": 0.7486432194709778, + "learning_rate": 1.1892445117862028e-07, + "loss": 0.0196, + "step": 1658 + }, + { + "epoch": 2.93, + "grad_norm": 0.27966606616973877, + "learning_rate": 1.1258535300499584e-07, + "loss": 0.0046, + "step": 1659 + }, + { + "epoch": 2.94, + "grad_norm": 0.5662534832954407, + "learning_rate": 1.0641967148716236e-07, + "loss": 0.0637, + "step": 1660 + }, + { + "epoch": 2.94, + "grad_norm": 0.2280278503894806, + "learning_rate": 1.004274280580142e-07, + "loss": 0.0084, + "step": 1661 + }, + { + "epoch": 2.94, + "grad_norm": 0.2407422810792923, + "learning_rate": 9.460864354755017e-08, + "loss": 0.0114, + "step": 1662 + }, + { + "epoch": 2.94, + "grad_norm": 0.41225409507751465, + "learning_rate": 8.896333818280145e-08, + "loss": 0.011, + "step": 1663 + }, + { + "epoch": 2.94, + "grad_norm": 0.26500916481018066, + "learning_rate": 8.349153158774824e-08, + "loss": 0.0133, + "step": 1664 + }, + { + "epoch": 2.95, + "grad_norm": 0.22338826954364777, + "learning_rate": 7.8193242783281e-08, + "loss": 0.011, + "step": 1665 + }, + { + "epoch": 2.95, + "grad_norm": 0.7501424551010132, + "learning_rate": 7.306849018708928e-08, + "loss": 0.0401, + "step": 1666 + }, + { + "epoch": 2.95, + "grad_norm": 0.10831818729639053, + "learning_rate": 6.811729161363967e-08, + "loss": 0.0042, + "step": 1667 + }, + { + "epoch": 2.95, + "grad_norm": 0.6130200624465942, + "learning_rate": 6.333966427409243e-08, + "loss": 0.017, + "step": 1668 + }, + { + "epoch": 2.95, + "grad_norm": 0.3362162411212921, + "learning_rate": 5.8735624776246056e-08, + "loss": 0.0222, + "step": 1669 + }, + { + "epoch": 2.95, + "grad_norm": 0.09215787053108215, + "learning_rate": 5.430518912448168e-08, + "loss": 0.0035, + "step": 1670 + }, + { + "epoch": 2.96, + "grad_norm": 0.5068495273590088, + "learning_rate": 5.0048372719707635e-08, + "loss": 0.0215, + "step": 1671 + }, + { + "epoch": 2.96, + "grad_norm": 0.1071295291185379, + "learning_rate": 4.596519035929281e-08, + "loss": 0.0029, + "step": 1672 + }, + { + "epoch": 2.96, + "grad_norm": 0.037402596324682236, + "learning_rate": 4.205565623703889e-08, + "loss": 0.0016, + "step": 1673 + }, + { + "epoch": 2.96, + "grad_norm": 0.52506023645401, + "learning_rate": 3.8319783943108204e-08, + "loss": 0.0606, + "step": 1674 + }, + { + "epoch": 2.96, + "grad_norm": 0.25033050775527954, + "learning_rate": 3.475758646400151e-08, + "loss": 0.0072, + "step": 1675 + }, + { + "epoch": 2.97, + "grad_norm": 0.3952895700931549, + "learning_rate": 3.1369076182480304e-08, + "loss": 0.0142, + "step": 1676 + }, + { + "epoch": 2.97, + "grad_norm": 0.04407460615038872, + "learning_rate": 2.815426487755568e-08, + "loss": 0.0018, + "step": 1677 + }, + { + "epoch": 2.97, + "grad_norm": 0.23342104256153107, + "learning_rate": 2.511316372442729e-08, + "loss": 0.0098, + "step": 1678 + }, + { + "epoch": 2.97, + "grad_norm": 0.23018397390842438, + "learning_rate": 2.2245783294444488e-08, + "loss": 0.0044, + "step": 1679 + }, + { + "epoch": 2.97, + "grad_norm": 0.058637138456106186, + "learning_rate": 1.9552133555084117e-08, + "loss": 0.0046, + "step": 1680 + }, + { + "epoch": 2.97, + "grad_norm": 0.25334253907203674, + "learning_rate": 1.7032223869911656e-08, + "loss": 0.0088, + "step": 1681 + }, + { + "epoch": 2.98, + "grad_norm": 0.0439915731549263, + "learning_rate": 1.4686062998525706e-08, + "loss": 0.0024, + "step": 1682 + }, + { + "epoch": 2.98, + "grad_norm": 0.019683513790369034, + "learning_rate": 1.2513659096569097e-08, + "loss": 0.0011, + "step": 1683 + }, + { + "epoch": 2.98, + "grad_norm": 0.11386776715517044, + "learning_rate": 1.0515019715656716e-08, + "loss": 0.0045, + "step": 1684 + }, + { + "epoch": 2.98, + "grad_norm": 0.1333921104669571, + "learning_rate": 8.690151803386616e-09, + "loss": 0.0066, + "step": 1685 + }, + { + "epoch": 2.98, + "grad_norm": 0.4892192482948303, + "learning_rate": 7.039061703284499e-09, + "loss": 0.0174, + "step": 1686 + }, + { + "epoch": 2.98, + "grad_norm": 0.043217986822128296, + "learning_rate": 5.561755154814829e-09, + "loss": 0.0026, + "step": 1687 + }, + { + "epoch": 2.99, + "grad_norm": 0.393970787525177, + "learning_rate": 4.258237293325307e-09, + "loss": 0.0279, + "step": 1688 + }, + { + "epoch": 2.99, + "grad_norm": 0.04774940386414528, + "learning_rate": 3.1285126500579796e-09, + "loss": 0.0031, + "step": 1689 + }, + { + "epoch": 2.99, + "grad_norm": 0.1383121758699417, + "learning_rate": 2.1725851521103846e-09, + "loss": 0.006, + "step": 1690 + }, + { + "epoch": 2.99, + "grad_norm": 0.4566024839878082, + "learning_rate": 1.390458122441096e-09, + "loss": 0.0457, + "step": 1691 + }, + { + "epoch": 2.99, + "grad_norm": 0.7104862928390503, + "learning_rate": 7.82134279853075e-10, + "loss": 0.0434, + "step": 1692 + }, + { + "epoch": 3.0, + "grad_norm": 0.48433512449264526, + "learning_rate": 3.4761573897701403e-10, + "loss": 0.0253, + "step": 1693 + }, + { + "epoch": 3.0, + "grad_norm": 0.26530519127845764, + "learning_rate": 8.690401026578698e-11, + "loss": 0.006, + "step": 1694 + }, + { + "epoch": 3.0, + "grad_norm": 0.14494752883911133, + "learning_rate": 0.0, + "loss": 0.0044, + "step": 1695 + } + ], + "logging_steps": 1, + "max_steps": 1695, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 565, + "total_flos": 1.549439947809751e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}