diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4807 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9816849816849818, + "eval_steps": 500, + "global_step": 682, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0029304029304029304, + "grad_norm": 5.236616966747348, + "learning_rate": 3.3333333333333335e-07, + "loss": 1.6037, + "step": 1 + }, + { + "epoch": 0.005860805860805861, + "grad_norm": 4.277013116202448, + "learning_rate": 6.666666666666667e-07, + "loss": 1.5162, + "step": 2 + }, + { + "epoch": 0.008791208791208791, + "grad_norm": 5.329467381575862, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.5559, + "step": 3 + }, + { + "epoch": 0.011721611721611722, + "grad_norm": 4.6957386419713805, + "learning_rate": 1.3333333333333334e-06, + "loss": 1.5187, + "step": 4 + }, + { + "epoch": 0.014652014652014652, + "grad_norm": 4.177469884648981, + "learning_rate": 1.6666666666666667e-06, + "loss": 1.4766, + "step": 5 + }, + { + "epoch": 0.017582417582417582, + "grad_norm": 5.102737364259812, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.5765, + "step": 6 + }, + { + "epoch": 0.020512820512820513, + "grad_norm": 3.257410120888959, + "learning_rate": 2.3333333333333336e-06, + "loss": 1.4471, + "step": 7 + }, + { + "epoch": 0.023443223443223443, + "grad_norm": 3.115059473576919, + "learning_rate": 2.666666666666667e-06, + "loss": 1.3834, + "step": 8 + }, + { + "epoch": 0.026373626373626374, + "grad_norm": 2.1258574044969984, + "learning_rate": 3e-06, + "loss": 1.3919, + "step": 9 + }, + { + "epoch": 0.029304029304029304, + "grad_norm": 2.0531292894889805, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.3591, + "step": 10 + }, + { + "epoch": 0.03223443223443224, + "grad_norm": 2.06956179185827, + "learning_rate": 3.6666666666666666e-06, + "loss": 1.4242, + "step": 11 + }, + { + "epoch": 0.035164835164835165, + "grad_norm": 2.303837943491906, + "learning_rate": 4.000000000000001e-06, + "loss": 1.3092, + "step": 12 + }, + { + "epoch": 0.0380952380952381, + "grad_norm": 2.2605964849950344, + "learning_rate": 4.333333333333334e-06, + "loss": 1.2986, + "step": 13 + }, + { + "epoch": 0.041025641025641026, + "grad_norm": 1.9951383138249523, + "learning_rate": 4.666666666666667e-06, + "loss": 1.2942, + "step": 14 + }, + { + "epoch": 0.04395604395604396, + "grad_norm": 1.4898524610077184, + "learning_rate": 5e-06, + "loss": 1.2985, + "step": 15 + }, + { + "epoch": 0.046886446886446886, + "grad_norm": 1.3109439673909382, + "learning_rate": 5.333333333333334e-06, + "loss": 1.244, + "step": 16 + }, + { + "epoch": 0.04981684981684982, + "grad_norm": 1.1655623220122422, + "learning_rate": 5.666666666666667e-06, + "loss": 1.1808, + "step": 17 + }, + { + "epoch": 0.05274725274725275, + "grad_norm": 1.099398694116044, + "learning_rate": 6e-06, + "loss": 1.2938, + "step": 18 + }, + { + "epoch": 0.05567765567765568, + "grad_norm": 1.1377973094630334, + "learning_rate": 6.333333333333333e-06, + "loss": 1.2727, + "step": 19 + }, + { + "epoch": 0.05860805860805861, + "grad_norm": 0.9305342408980034, + "learning_rate": 6.666666666666667e-06, + "loss": 1.2282, + "step": 20 + }, + { + "epoch": 0.06153846153846154, + "grad_norm": 0.8573945958738098, + "learning_rate": 7e-06, + "loss": 1.2088, + "step": 21 + }, + { + "epoch": 0.06446886446886448, + "grad_norm": 0.9176308767455088, + "learning_rate": 7.333333333333333e-06, + "loss": 1.213, + "step": 22 + }, + { + "epoch": 0.0673992673992674, + "grad_norm": 1.0602477117511504, + "learning_rate": 7.666666666666667e-06, + "loss": 1.256, + "step": 23 + }, + { + "epoch": 0.07032967032967033, + "grad_norm": 0.9793994066565616, + "learning_rate": 8.000000000000001e-06, + "loss": 1.197, + "step": 24 + }, + { + "epoch": 0.07326007326007326, + "grad_norm": 1.0029818180142296, + "learning_rate": 8.333333333333334e-06, + "loss": 1.137, + "step": 25 + }, + { + "epoch": 0.0761904761904762, + "grad_norm": 1.0195465294285324, + "learning_rate": 8.666666666666668e-06, + "loss": 1.2386, + "step": 26 + }, + { + "epoch": 0.07912087912087912, + "grad_norm": 1.0759683772319115, + "learning_rate": 9e-06, + "loss": 1.2376, + "step": 27 + }, + { + "epoch": 0.08205128205128205, + "grad_norm": 0.8911061397225324, + "learning_rate": 9.333333333333334e-06, + "loss": 1.1831, + "step": 28 + }, + { + "epoch": 0.08498168498168499, + "grad_norm": 0.861354167362915, + "learning_rate": 9.666666666666667e-06, + "loss": 1.1511, + "step": 29 + }, + { + "epoch": 0.08791208791208792, + "grad_norm": 0.8881533584446896, + "learning_rate": 1e-05, + "loss": 1.228, + "step": 30 + }, + { + "epoch": 0.09084249084249084, + "grad_norm": 0.8512130063857013, + "learning_rate": 9.999941957819112e-06, + "loss": 1.2142, + "step": 31 + }, + { + "epoch": 0.09377289377289377, + "grad_norm": 0.880628181688528, + "learning_rate": 9.999767832624e-06, + "loss": 1.2078, + "step": 32 + }, + { + "epoch": 0.0967032967032967, + "grad_norm": 0.8855214361195558, + "learning_rate": 9.99947762845731e-06, + "loss": 1.2384, + "step": 33 + }, + { + "epoch": 0.09963369963369964, + "grad_norm": 0.9160758544701845, + "learning_rate": 9.999071352056676e-06, + "loss": 1.1859, + "step": 34 + }, + { + "epoch": 0.10256410256410256, + "grad_norm": 0.8031365392857486, + "learning_rate": 9.998549012854562e-06, + "loss": 1.1149, + "step": 35 + }, + { + "epoch": 0.1054945054945055, + "grad_norm": 0.7882701972390256, + "learning_rate": 9.99791062297805e-06, + "loss": 1.1174, + "step": 36 + }, + { + "epoch": 0.10842490842490843, + "grad_norm": 0.8172216850403872, + "learning_rate": 9.997156197248564e-06, + "loss": 1.2042, + "step": 37 + }, + { + "epoch": 0.11135531135531136, + "grad_norm": 0.9084324585899033, + "learning_rate": 9.996285753181499e-06, + "loss": 1.1251, + "step": 38 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 0.8107056469020013, + "learning_rate": 9.99529931098585e-06, + "loss": 1.1847, + "step": 39 + }, + { + "epoch": 0.11721611721611722, + "grad_norm": 0.8449584359735586, + "learning_rate": 9.994196893563722e-06, + "loss": 1.1505, + "step": 40 + }, + { + "epoch": 0.12014652014652015, + "grad_norm": 0.8831422669635972, + "learning_rate": 9.992978526509794e-06, + "loss": 1.1699, + "step": 41 + }, + { + "epoch": 0.12307692307692308, + "grad_norm": 0.8644767120795944, + "learning_rate": 9.991644238110741e-06, + "loss": 1.2478, + "step": 42 + }, + { + "epoch": 0.126007326007326, + "grad_norm": 0.9482277505129767, + "learning_rate": 9.990194059344565e-06, + "loss": 1.2268, + "step": 43 + }, + { + "epoch": 0.12893772893772895, + "grad_norm": 0.8617016918844874, + "learning_rate": 9.988628023879883e-06, + "loss": 1.1817, + "step": 44 + }, + { + "epoch": 0.13186813186813187, + "grad_norm": 0.8061790018852762, + "learning_rate": 9.98694616807514e-06, + "loss": 1.1482, + "step": 45 + }, + { + "epoch": 0.1347985347985348, + "grad_norm": 0.7832884251827952, + "learning_rate": 9.985148530977767e-06, + "loss": 1.1317, + "step": 46 + }, + { + "epoch": 0.13772893772893774, + "grad_norm": 0.9771610956162755, + "learning_rate": 9.983235154323276e-06, + "loss": 1.2133, + "step": 47 + }, + { + "epoch": 0.14065934065934066, + "grad_norm": 0.6863448092272799, + "learning_rate": 9.981206082534287e-06, + "loss": 1.0658, + "step": 48 + }, + { + "epoch": 0.14358974358974358, + "grad_norm": 0.7171230395493451, + "learning_rate": 9.979061362719502e-06, + "loss": 1.0827, + "step": 49 + }, + { + "epoch": 0.14652014652014653, + "grad_norm": 0.8336856657882589, + "learning_rate": 9.976801044672608e-06, + "loss": 1.1807, + "step": 50 + }, + { + "epoch": 0.14945054945054945, + "grad_norm": 0.7702137284348907, + "learning_rate": 9.974425180871118e-06, + "loss": 1.0692, + "step": 51 + }, + { + "epoch": 0.1523809523809524, + "grad_norm": 0.7532169814888361, + "learning_rate": 9.971933826475162e-06, + "loss": 1.1026, + "step": 52 + }, + { + "epoch": 0.15531135531135531, + "grad_norm": 0.6885436983052671, + "learning_rate": 9.969327039326193e-06, + "loss": 1.1867, + "step": 53 + }, + { + "epoch": 0.15824175824175823, + "grad_norm": 0.8133906289524008, + "learning_rate": 9.966604879945659e-06, + "loss": 1.1775, + "step": 54 + }, + { + "epoch": 0.16117216117216118, + "grad_norm": 0.7968024513483275, + "learning_rate": 9.963767411533584e-06, + "loss": 1.113, + "step": 55 + }, + { + "epoch": 0.1641025641025641, + "grad_norm": 0.8313193163815186, + "learning_rate": 9.960814699967112e-06, + "loss": 1.1637, + "step": 56 + }, + { + "epoch": 0.16703296703296702, + "grad_norm": 0.9517211139026077, + "learning_rate": 9.95774681379897e-06, + "loss": 1.134, + "step": 57 + }, + { + "epoch": 0.16996336996336997, + "grad_norm": 0.8301120009736043, + "learning_rate": 9.954563824255879e-06, + "loss": 1.1291, + "step": 58 + }, + { + "epoch": 0.1728937728937729, + "grad_norm": 0.8270284845492123, + "learning_rate": 9.951265805236903e-06, + "loss": 1.1349, + "step": 59 + }, + { + "epoch": 0.17582417582417584, + "grad_norm": 0.7017311057617309, + "learning_rate": 9.947852833311725e-06, + "loss": 1.1389, + "step": 60 + }, + { + "epoch": 0.17875457875457876, + "grad_norm": 0.7621669276766957, + "learning_rate": 9.94432498771888e-06, + "loss": 1.1036, + "step": 61 + }, + { + "epoch": 0.18168498168498168, + "grad_norm": 0.8320291744185084, + "learning_rate": 9.940682350363913e-06, + "loss": 1.0958, + "step": 62 + }, + { + "epoch": 0.18461538461538463, + "grad_norm": 0.8835787608753064, + "learning_rate": 9.936925005817464e-06, + "loss": 1.1471, + "step": 63 + }, + { + "epoch": 0.18754578754578755, + "grad_norm": 0.749274429167846, + "learning_rate": 9.933053041313325e-06, + "loss": 1.1325, + "step": 64 + }, + { + "epoch": 0.19047619047619047, + "grad_norm": 0.7262497057804643, + "learning_rate": 9.929066546746403e-06, + "loss": 1.1456, + "step": 65 + }, + { + "epoch": 0.1934065934065934, + "grad_norm": 0.8670981015624153, + "learning_rate": 9.924965614670629e-06, + "loss": 1.138, + "step": 66 + }, + { + "epoch": 0.19633699633699633, + "grad_norm": 0.8432940149428723, + "learning_rate": 9.920750340296823e-06, + "loss": 1.1176, + "step": 67 + }, + { + "epoch": 0.19926739926739928, + "grad_norm": 0.7312143321816789, + "learning_rate": 9.916420821490474e-06, + "loss": 1.1434, + "step": 68 + }, + { + "epoch": 0.2021978021978022, + "grad_norm": 0.7605667410112772, + "learning_rate": 9.911977158769461e-06, + "loss": 1.1023, + "step": 69 + }, + { + "epoch": 0.20512820512820512, + "grad_norm": 0.803430465139554, + "learning_rate": 9.90741945530174e-06, + "loss": 1.1953, + "step": 70 + }, + { + "epoch": 0.20805860805860807, + "grad_norm": 0.9220290395479969, + "learning_rate": 9.90274781690293e-06, + "loss": 1.1115, + "step": 71 + }, + { + "epoch": 0.210989010989011, + "grad_norm": 0.8183474218674964, + "learning_rate": 9.89796235203386e-06, + "loss": 1.1527, + "step": 72 + }, + { + "epoch": 0.2139194139194139, + "grad_norm": 0.7126405845950673, + "learning_rate": 9.89306317179806e-06, + "loss": 1.0852, + "step": 73 + }, + { + "epoch": 0.21684981684981686, + "grad_norm": 0.7601543510464269, + "learning_rate": 9.888050389939172e-06, + "loss": 1.1428, + "step": 74 + }, + { + "epoch": 0.21978021978021978, + "grad_norm": 0.8916999196971415, + "learning_rate": 9.882924122838311e-06, + "loss": 1.074, + "step": 75 + }, + { + "epoch": 0.22271062271062272, + "grad_norm": 0.8688542443917322, + "learning_rate": 9.877684489511367e-06, + "loss": 1.0831, + "step": 76 + }, + { + "epoch": 0.22564102564102564, + "grad_norm": 0.8488868510757361, + "learning_rate": 9.872331611606238e-06, + "loss": 1.1393, + "step": 77 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 0.7370262143749565, + "learning_rate": 9.866865613400008e-06, + "loss": 1.0948, + "step": 78 + }, + { + "epoch": 0.2315018315018315, + "grad_norm": 0.7505253317469636, + "learning_rate": 9.861286621796056e-06, + "loss": 1.1462, + "step": 79 + }, + { + "epoch": 0.23443223443223443, + "grad_norm": 0.9094343725835037, + "learning_rate": 9.855594766321122e-06, + "loss": 1.1325, + "step": 80 + }, + { + "epoch": 0.23736263736263735, + "grad_norm": 0.7886762129451516, + "learning_rate": 9.849790179122287e-06, + "loss": 1.1147, + "step": 81 + }, + { + "epoch": 0.2402930402930403, + "grad_norm": 0.7613394543040073, + "learning_rate": 9.843872994963912e-06, + "loss": 1.0538, + "step": 82 + }, + { + "epoch": 0.24322344322344322, + "grad_norm": 0.754376414929907, + "learning_rate": 9.837843351224504e-06, + "loss": 1.0924, + "step": 83 + }, + { + "epoch": 0.24615384615384617, + "grad_norm": 0.8764280950179428, + "learning_rate": 9.831701387893533e-06, + "loss": 1.1504, + "step": 84 + }, + { + "epoch": 0.2490842490842491, + "grad_norm": 0.695623410235394, + "learning_rate": 9.825447247568177e-06, + "loss": 1.1245, + "step": 85 + }, + { + "epoch": 0.252014652014652, + "grad_norm": 0.6969634561214898, + "learning_rate": 9.819081075450014e-06, + "loss": 1.0707, + "step": 86 + }, + { + "epoch": 0.2549450549450549, + "grad_norm": 0.7156303735303894, + "learning_rate": 9.812603019341652e-06, + "loss": 1.1286, + "step": 87 + }, + { + "epoch": 0.2578754578754579, + "grad_norm": 0.8220793729909976, + "learning_rate": 9.80601322964329e-06, + "loss": 1.1302, + "step": 88 + }, + { + "epoch": 0.2608058608058608, + "grad_norm": 0.9670245180950947, + "learning_rate": 9.799311859349235e-06, + "loss": 1.122, + "step": 89 + }, + { + "epoch": 0.26373626373626374, + "grad_norm": 0.7021589814665796, + "learning_rate": 9.792499064044343e-06, + "loss": 1.0646, + "step": 90 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.7446117525493868, + "learning_rate": 9.785575001900418e-06, + "loss": 1.1038, + "step": 91 + }, + { + "epoch": 0.2695970695970696, + "grad_norm": 0.7963884145520905, + "learning_rate": 9.778539833672525e-06, + "loss": 1.1319, + "step": 92 + }, + { + "epoch": 0.2725274725274725, + "grad_norm": 0.8173788201228416, + "learning_rate": 9.771393722695267e-06, + "loss": 1.1246, + "step": 93 + }, + { + "epoch": 0.2754578754578755, + "grad_norm": 0.740082695855197, + "learning_rate": 9.764136834878987e-06, + "loss": 1.1088, + "step": 94 + }, + { + "epoch": 0.2783882783882784, + "grad_norm": 0.7467817240548424, + "learning_rate": 9.756769338705927e-06, + "loss": 1.0909, + "step": 95 + }, + { + "epoch": 0.2813186813186813, + "grad_norm": 0.7619316456122908, + "learning_rate": 9.749291405226304e-06, + "loss": 1.1233, + "step": 96 + }, + { + "epoch": 0.28424908424908424, + "grad_norm": 0.6936244573726563, + "learning_rate": 9.741703208054347e-06, + "loss": 1.0595, + "step": 97 + }, + { + "epoch": 0.28717948717948716, + "grad_norm": 0.8077496531848941, + "learning_rate": 9.734004923364258e-06, + "loss": 1.1384, + "step": 98 + }, + { + "epoch": 0.29010989010989013, + "grad_norm": 0.705333491953162, + "learning_rate": 9.72619672988613e-06, + "loss": 1.1248, + "step": 99 + }, + { + "epoch": 0.29304029304029305, + "grad_norm": 0.7150948818560614, + "learning_rate": 9.718278808901797e-06, + "loss": 1.0189, + "step": 100 + }, + { + "epoch": 0.295970695970696, + "grad_norm": 0.8190821657116661, + "learning_rate": 9.710251344240619e-06, + "loss": 1.1314, + "step": 101 + }, + { + "epoch": 0.2989010989010989, + "grad_norm": 0.7993491005692807, + "learning_rate": 9.702114522275216e-06, + "loss": 1.1379, + "step": 102 + }, + { + "epoch": 0.3018315018315018, + "grad_norm": 0.77941047322087, + "learning_rate": 9.69386853191715e-06, + "loss": 1.0411, + "step": 103 + }, + { + "epoch": 0.3047619047619048, + "grad_norm": 0.7858691393792162, + "learning_rate": 9.685513564612521e-06, + "loss": 1.1251, + "step": 104 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 0.7337514917747404, + "learning_rate": 9.67704981433754e-06, + "loss": 1.1191, + "step": 105 + }, + { + "epoch": 0.31062271062271063, + "grad_norm": 0.767317429310955, + "learning_rate": 9.668477477594021e-06, + "loss": 1.093, + "step": 106 + }, + { + "epoch": 0.31355311355311355, + "grad_norm": 0.7668402066152609, + "learning_rate": 9.659796753404806e-06, + "loss": 1.0853, + "step": 107 + }, + { + "epoch": 0.31648351648351647, + "grad_norm": 0.7172206747621077, + "learning_rate": 9.651007843309164e-06, + "loss": 1.0838, + "step": 108 + }, + { + "epoch": 0.3194139194139194, + "grad_norm": 0.7579043380498024, + "learning_rate": 9.642110951358097e-06, + "loss": 1.1491, + "step": 109 + }, + { + "epoch": 0.32234432234432236, + "grad_norm": 0.7733555528433625, + "learning_rate": 9.633106284109612e-06, + "loss": 1.1435, + "step": 110 + }, + { + "epoch": 0.3252747252747253, + "grad_norm": 3.3284789531656016, + "learning_rate": 9.623994050623917e-06, + "loss": 1.0951, + "step": 111 + }, + { + "epoch": 0.3282051282051282, + "grad_norm": 0.7820590656550864, + "learning_rate": 9.614774462458573e-06, + "loss": 1.1173, + "step": 112 + }, + { + "epoch": 0.3311355311355311, + "grad_norm": 0.7056629851514702, + "learning_rate": 9.605447733663586e-06, + "loss": 1.1012, + "step": 113 + }, + { + "epoch": 0.33406593406593404, + "grad_norm": 0.7628154605662072, + "learning_rate": 9.596014080776424e-06, + "loss": 1.1192, + "step": 114 + }, + { + "epoch": 0.336996336996337, + "grad_norm": 0.6924942007287507, + "learning_rate": 9.586473722817002e-06, + "loss": 1.0647, + "step": 115 + }, + { + "epoch": 0.33992673992673994, + "grad_norm": 0.7030223107571508, + "learning_rate": 9.576826881282595e-06, + "loss": 1.1056, + "step": 116 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 0.7341686688600246, + "learning_rate": 9.56707378014269e-06, + "loss": 1.0371, + "step": 117 + }, + { + "epoch": 0.3457875457875458, + "grad_norm": 0.7727927029586001, + "learning_rate": 9.557214645833792e-06, + "loss": 1.1259, + "step": 118 + }, + { + "epoch": 0.3487179487179487, + "grad_norm": 0.7310833198855431, + "learning_rate": 9.547249707254166e-06, + "loss": 1.0888, + "step": 119 + }, + { + "epoch": 0.3516483516483517, + "grad_norm": 0.6667796341322038, + "learning_rate": 9.537179195758513e-06, + "loss": 1.0823, + "step": 120 + }, + { + "epoch": 0.3545787545787546, + "grad_norm": 0.665405837005823, + "learning_rate": 9.527003345152616e-06, + "loss": 1.0777, + "step": 121 + }, + { + "epoch": 0.3575091575091575, + "grad_norm": 0.6809160644579101, + "learning_rate": 9.516722391687903e-06, + "loss": 1.053, + "step": 122 + }, + { + "epoch": 0.36043956043956044, + "grad_norm": 0.728118239149536, + "learning_rate": 9.506336574055954e-06, + "loss": 1.1185, + "step": 123 + }, + { + "epoch": 0.36336996336996336, + "grad_norm": 0.7065965453050594, + "learning_rate": 9.495846133382973e-06, + "loss": 1.1062, + "step": 124 + }, + { + "epoch": 0.3663003663003663, + "grad_norm": 0.7014383586245783, + "learning_rate": 9.485251313224181e-06, + "loss": 1.0878, + "step": 125 + }, + { + "epoch": 0.36923076923076925, + "grad_norm": 0.681702034528789, + "learning_rate": 9.474552359558167e-06, + "loss": 1.0704, + "step": 126 + }, + { + "epoch": 0.37216117216117217, + "grad_norm": 0.7187395064100738, + "learning_rate": 9.463749520781172e-06, + "loss": 1.0812, + "step": 127 + }, + { + "epoch": 0.3750915750915751, + "grad_norm": 0.7184519689161065, + "learning_rate": 9.452843047701324e-06, + "loss": 1.1057, + "step": 128 + }, + { + "epoch": 0.378021978021978, + "grad_norm": 0.7690003114746435, + "learning_rate": 9.441833193532817e-06, + "loss": 1.0689, + "step": 129 + }, + { + "epoch": 0.38095238095238093, + "grad_norm": 0.7164964859738834, + "learning_rate": 9.43072021389003e-06, + "loss": 1.117, + "step": 130 + }, + { + "epoch": 0.3838827838827839, + "grad_norm": 0.6974378217023173, + "learning_rate": 9.419504366781591e-06, + "loss": 1.0734, + "step": 131 + }, + { + "epoch": 0.3868131868131868, + "grad_norm": 0.7254779614689418, + "learning_rate": 9.408185912604395e-06, + "loss": 1.0559, + "step": 132 + }, + { + "epoch": 0.38974358974358975, + "grad_norm": 0.7303990545301333, + "learning_rate": 9.396765114137544e-06, + "loss": 1.0918, + "step": 133 + }, + { + "epoch": 0.39267399267399267, + "grad_norm": 0.6651968867581742, + "learning_rate": 9.385242236536259e-06, + "loss": 1.078, + "step": 134 + }, + { + "epoch": 0.3956043956043956, + "grad_norm": 0.6364456851581658, + "learning_rate": 9.37361754732572e-06, + "loss": 1.0218, + "step": 135 + }, + { + "epoch": 0.39853479853479856, + "grad_norm": 0.8125339033682204, + "learning_rate": 9.36189131639485e-06, + "loss": 1.1092, + "step": 136 + }, + { + "epoch": 0.4014652014652015, + "grad_norm": 0.7445751488678471, + "learning_rate": 9.35006381599006e-06, + "loss": 1.1637, + "step": 137 + }, + { + "epoch": 0.4043956043956044, + "grad_norm": 0.7911370318492177, + "learning_rate": 9.338135320708912e-06, + "loss": 1.1167, + "step": 138 + }, + { + "epoch": 0.4073260073260073, + "grad_norm": 0.7108436127581649, + "learning_rate": 9.326106107493762e-06, + "loss": 1.0756, + "step": 139 + }, + { + "epoch": 0.41025641025641024, + "grad_norm": 0.6595709769127521, + "learning_rate": 9.313976455625316e-06, + "loss": 1.0266, + "step": 140 + }, + { + "epoch": 0.41318681318681316, + "grad_norm": 0.7843564857029468, + "learning_rate": 9.301746646716152e-06, + "loss": 1.1095, + "step": 141 + }, + { + "epoch": 0.41611721611721614, + "grad_norm": 0.7585560414429611, + "learning_rate": 9.289416964704186e-06, + "loss": 1.1141, + "step": 142 + }, + { + "epoch": 0.41904761904761906, + "grad_norm": 0.7573484270766634, + "learning_rate": 9.27698769584607e-06, + "loss": 1.0725, + "step": 143 + }, + { + "epoch": 0.421978021978022, + "grad_norm": 0.8449861185856159, + "learning_rate": 9.26445912871055e-06, + "loss": 1.1368, + "step": 144 + }, + { + "epoch": 0.4249084249084249, + "grad_norm": 0.8534050421722909, + "learning_rate": 9.251831554171773e-06, + "loss": 1.0788, + "step": 145 + }, + { + "epoch": 0.4278388278388278, + "grad_norm": 0.7638318082003831, + "learning_rate": 9.239105265402525e-06, + "loss": 1.1393, + "step": 146 + }, + { + "epoch": 0.4307692307692308, + "grad_norm": 0.830680728213633, + "learning_rate": 9.226280557867428e-06, + "loss": 1.1028, + "step": 147 + }, + { + "epoch": 0.4336996336996337, + "grad_norm": 0.6744236125985169, + "learning_rate": 9.213357729316077e-06, + "loss": 1.0371, + "step": 148 + }, + { + "epoch": 0.43663003663003663, + "grad_norm": 0.7708023870173715, + "learning_rate": 9.200337079776136e-06, + "loss": 1.0292, + "step": 149 + }, + { + "epoch": 0.43956043956043955, + "grad_norm": 0.7392502681190622, + "learning_rate": 9.187218911546363e-06, + "loss": 1.1144, + "step": 150 + }, + { + "epoch": 0.4424908424908425, + "grad_norm": 0.8223130697058154, + "learning_rate": 9.174003529189592e-06, + "loss": 1.1267, + "step": 151 + }, + { + "epoch": 0.44542124542124545, + "grad_norm": 0.6534093776081946, + "learning_rate": 9.160691239525675e-06, + "loss": 1.0506, + "step": 152 + }, + { + "epoch": 0.44835164835164837, + "grad_norm": 0.7725854247329561, + "learning_rate": 9.147282351624334e-06, + "loss": 1.0608, + "step": 153 + }, + { + "epoch": 0.4512820512820513, + "grad_norm": 0.6816968972584797, + "learning_rate": 9.133777176798013e-06, + "loss": 1.0555, + "step": 154 + }, + { + "epoch": 0.4542124542124542, + "grad_norm": 0.9669303786054149, + "learning_rate": 9.120176028594632e-06, + "loss": 1.1101, + "step": 155 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.7037824443977663, + "learning_rate": 9.106479222790312e-06, + "loss": 1.0933, + "step": 156 + }, + { + "epoch": 0.46007326007326005, + "grad_norm": 0.8568980072792112, + "learning_rate": 9.092687077382043e-06, + "loss": 1.0872, + "step": 157 + }, + { + "epoch": 0.463003663003663, + "grad_norm": 1.0093527220046434, + "learning_rate": 9.078799912580305e-06, + "loss": 1.0833, + "step": 158 + }, + { + "epoch": 0.46593406593406594, + "grad_norm": 0.8145192278685842, + "learning_rate": 9.064818050801634e-06, + "loss": 1.0981, + "step": 159 + }, + { + "epoch": 0.46886446886446886, + "grad_norm": 0.7087077852924482, + "learning_rate": 9.050741816661128e-06, + "loss": 1.0728, + "step": 160 + }, + { + "epoch": 0.4717948717948718, + "grad_norm": 0.8436314316973053, + "learning_rate": 9.036571536964915e-06, + "loss": 1.1254, + "step": 161 + }, + { + "epoch": 0.4747252747252747, + "grad_norm": 0.8154298111795745, + "learning_rate": 9.022307540702576e-06, + "loss": 1.1122, + "step": 162 + }, + { + "epoch": 0.4776556776556777, + "grad_norm": 0.6530298783851548, + "learning_rate": 9.007950159039488e-06, + "loss": 1.0955, + "step": 163 + }, + { + "epoch": 0.4805860805860806, + "grad_norm": 0.7845249406652348, + "learning_rate": 8.993499725309148e-06, + "loss": 1.051, + "step": 164 + }, + { + "epoch": 0.4835164835164835, + "grad_norm": 0.8429786879957137, + "learning_rate": 8.978956575005432e-06, + "loss": 1.1001, + "step": 165 + }, + { + "epoch": 0.48644688644688644, + "grad_norm": 0.7070651182594344, + "learning_rate": 8.964321045774808e-06, + "loss": 1.078, + "step": 166 + }, + { + "epoch": 0.48937728937728936, + "grad_norm": 0.7491716944247487, + "learning_rate": 8.949593477408485e-06, + "loss": 1.0743, + "step": 167 + }, + { + "epoch": 0.49230769230769234, + "grad_norm": 0.8088630208896083, + "learning_rate": 8.934774211834538e-06, + "loss": 1.0151, + "step": 168 + }, + { + "epoch": 0.49523809523809526, + "grad_norm": 0.7439375920034751, + "learning_rate": 8.919863593109967e-06, + "loss": 1.0513, + "step": 169 + }, + { + "epoch": 0.4981684981684982, + "grad_norm": 0.815503037318965, + "learning_rate": 8.904861967412702e-06, + "loss": 1.0963, + "step": 170 + }, + { + "epoch": 0.5010989010989011, + "grad_norm": 0.8588369274428411, + "learning_rate": 8.889769683033573e-06, + "loss": 1.0378, + "step": 171 + }, + { + "epoch": 0.504029304029304, + "grad_norm": 0.7890693152650229, + "learning_rate": 8.874587090368221e-06, + "loss": 1.0849, + "step": 172 + }, + { + "epoch": 0.5069597069597069, + "grad_norm": 0.679463169659677, + "learning_rate": 8.859314541908957e-06, + "loss": 1.0481, + "step": 173 + }, + { + "epoch": 0.5098901098901099, + "grad_norm": 0.7507974578994663, + "learning_rate": 8.843952392236595e-06, + "loss": 1.0806, + "step": 174 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.7459226590677729, + "learning_rate": 8.8285009980122e-06, + "loss": 1.0841, + "step": 175 + }, + { + "epoch": 0.5157509157509158, + "grad_norm": 0.7750302070712438, + "learning_rate": 8.81296071796882e-06, + "loss": 1.0865, + "step": 176 + }, + { + "epoch": 0.5186813186813187, + "grad_norm": 0.7147033919188667, + "learning_rate": 8.797331912903152e-06, + "loss": 1.0832, + "step": 177 + }, + { + "epoch": 0.5216117216117216, + "grad_norm": 0.6594455064471973, + "learning_rate": 8.78161494566717e-06, + "loss": 1.0843, + "step": 178 + }, + { + "epoch": 0.5245421245421246, + "grad_norm": 0.719823747496003, + "learning_rate": 8.765810181159696e-06, + "loss": 1.0775, + "step": 179 + }, + { + "epoch": 0.5274725274725275, + "grad_norm": 0.6848428843395724, + "learning_rate": 8.74991798631793e-06, + "loss": 1.0865, + "step": 180 + }, + { + "epoch": 0.5304029304029304, + "grad_norm": 0.6724971412658745, + "learning_rate": 8.733938730108928e-06, + "loss": 1.061, + "step": 181 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.6900267089498971, + "learning_rate": 8.717872783521048e-06, + "loss": 1.0811, + "step": 182 + }, + { + "epoch": 0.5362637362637362, + "grad_norm": 0.7534508660091974, + "learning_rate": 8.701720519555315e-06, + "loss": 1.0978, + "step": 183 + }, + { + "epoch": 0.5391941391941392, + "grad_norm": 0.7769245974744959, + "learning_rate": 8.685482313216784e-06, + "loss": 1.1021, + "step": 184 + }, + { + "epoch": 0.5421245421245421, + "grad_norm": 0.7339756524664363, + "learning_rate": 8.669158541505817e-06, + "loss": 1.0857, + "step": 185 + }, + { + "epoch": 0.545054945054945, + "grad_norm": 0.6938946896380407, + "learning_rate": 8.65274958340934e-06, + "loss": 1.1024, + "step": 186 + }, + { + "epoch": 0.547985347985348, + "grad_norm": 0.7373481468386694, + "learning_rate": 8.636255819892037e-06, + "loss": 1.0731, + "step": 187 + }, + { + "epoch": 0.550915750915751, + "grad_norm": 0.7356675883617693, + "learning_rate": 8.61967763388751e-06, + "loss": 1.0833, + "step": 188 + }, + { + "epoch": 0.5538461538461539, + "grad_norm": 0.6721670812209259, + "learning_rate": 8.603015410289387e-06, + "loss": 1.0692, + "step": 189 + }, + { + "epoch": 0.5567765567765568, + "grad_norm": 0.7164842748809434, + "learning_rate": 8.586269535942386e-06, + "loss": 1.0276, + "step": 190 + }, + { + "epoch": 0.5597069597069597, + "grad_norm": 0.7138024169545026, + "learning_rate": 8.569440399633337e-06, + "loss": 1.0394, + "step": 191 + }, + { + "epoch": 0.5626373626373626, + "grad_norm": 0.7406122273657452, + "learning_rate": 8.552528392082147e-06, + "loss": 1.1192, + "step": 192 + }, + { + "epoch": 0.5655677655677656, + "grad_norm": 0.6235431206762926, + "learning_rate": 8.535533905932739e-06, + "loss": 1.0834, + "step": 193 + }, + { + "epoch": 0.5684981684981685, + "grad_norm": 0.7124256759895987, + "learning_rate": 8.518457335743927e-06, + "loss": 1.0753, + "step": 194 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.6677579355944584, + "learning_rate": 8.50129907798026e-06, + "loss": 1.064, + "step": 195 + }, + { + "epoch": 0.5743589743589743, + "grad_norm": 0.7499647765747881, + "learning_rate": 8.484059531002822e-06, + "loss": 1.1573, + "step": 196 + }, + { + "epoch": 0.5772893772893772, + "grad_norm": 0.7294026788868488, + "learning_rate": 8.46673909505997e-06, + "loss": 1.1011, + "step": 197 + }, + { + "epoch": 0.5802197802197803, + "grad_norm": 0.7114933216227827, + "learning_rate": 8.44933817227806e-06, + "loss": 1.0818, + "step": 198 + }, + { + "epoch": 0.5831501831501832, + "grad_norm": 0.7768894842987932, + "learning_rate": 8.43185716665209e-06, + "loss": 1.089, + "step": 199 + }, + { + "epoch": 0.5860805860805861, + "grad_norm": 0.7115463290600534, + "learning_rate": 8.41429648403634e-06, + "loss": 1.1403, + "step": 200 + }, + { + "epoch": 0.589010989010989, + "grad_norm": 0.744930428011927, + "learning_rate": 8.396656532134934e-06, + "loss": 1.0643, + "step": 201 + }, + { + "epoch": 0.591941391941392, + "grad_norm": 0.7155769890648402, + "learning_rate": 8.378937720492384e-06, + "loss": 1.0786, + "step": 202 + }, + { + "epoch": 0.5948717948717949, + "grad_norm": 0.7127918438707657, + "learning_rate": 8.361140460484081e-06, + "loss": 1.0324, + "step": 203 + }, + { + "epoch": 0.5978021978021978, + "grad_norm": 0.7402459212974355, + "learning_rate": 8.343265165306736e-06, + "loss": 1.0155, + "step": 204 + }, + { + "epoch": 0.6007326007326007, + "grad_norm": 0.7886204426553328, + "learning_rate": 8.325312249968795e-06, + "loss": 1.0667, + "step": 205 + }, + { + "epoch": 0.6036630036630036, + "grad_norm": 0.9134639382550404, + "learning_rate": 8.307282131280805e-06, + "loss": 1.0971, + "step": 206 + }, + { + "epoch": 0.6065934065934065, + "grad_norm": 0.7673543125270567, + "learning_rate": 8.289175227845727e-06, + "loss": 1.0703, + "step": 207 + }, + { + "epoch": 0.6095238095238096, + "grad_norm": 0.7709392914165542, + "learning_rate": 8.270991960049231e-06, + "loss": 1.0553, + "step": 208 + }, + { + "epoch": 0.6124542124542125, + "grad_norm": 0.7555209074220831, + "learning_rate": 8.252732750049918e-06, + "loss": 1.0162, + "step": 209 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.7247403658877494, + "learning_rate": 8.234398021769541e-06, + "loss": 1.0665, + "step": 210 + }, + { + "epoch": 0.6183150183150183, + "grad_norm": 0.7491705240437763, + "learning_rate": 8.215988200883145e-06, + "loss": 1.0822, + "step": 211 + }, + { + "epoch": 0.6212454212454213, + "grad_norm": 0.7786072749364591, + "learning_rate": 8.19750371480919e-06, + "loss": 1.0759, + "step": 212 + }, + { + "epoch": 0.6241758241758242, + "grad_norm": 0.7502719607254296, + "learning_rate": 8.178944992699632e-06, + "loss": 1.0494, + "step": 213 + }, + { + "epoch": 0.6271062271062271, + "grad_norm": 0.980326365407937, + "learning_rate": 8.160312465429952e-06, + "loss": 1.0222, + "step": 214 + }, + { + "epoch": 0.63003663003663, + "grad_norm": 0.8323917117064283, + "learning_rate": 8.141606565589157e-06, + "loss": 1.0473, + "step": 215 + }, + { + "epoch": 0.6329670329670329, + "grad_norm": 0.711816328710808, + "learning_rate": 8.122827727469737e-06, + "loss": 1.0875, + "step": 216 + }, + { + "epoch": 0.6358974358974359, + "grad_norm": 0.7441109264008124, + "learning_rate": 8.103976387057579e-06, + "loss": 1.0823, + "step": 217 + }, + { + "epoch": 0.6388278388278388, + "grad_norm": 0.7262059806258652, + "learning_rate": 8.085052982021849e-06, + "loss": 1.0286, + "step": 218 + }, + { + "epoch": 0.6417582417582418, + "grad_norm": 0.7928850120911566, + "learning_rate": 8.066057951704821e-06, + "loss": 1.123, + "step": 219 + }, + { + "epoch": 0.6446886446886447, + "grad_norm": 0.7685198156864494, + "learning_rate": 8.046991737111696e-06, + "loss": 1.0818, + "step": 220 + }, + { + "epoch": 0.6476190476190476, + "grad_norm": 0.7198626738261616, + "learning_rate": 8.027854780900339e-06, + "loss": 1.0191, + "step": 221 + }, + { + "epoch": 0.6505494505494506, + "grad_norm": 0.7326930302530626, + "learning_rate": 8.008647527371022e-06, + "loss": 1.0473, + "step": 222 + }, + { + "epoch": 0.6534798534798535, + "grad_norm": 0.7625974111690114, + "learning_rate": 7.989370422456098e-06, + "loss": 1.0573, + "step": 223 + }, + { + "epoch": 0.6564102564102564, + "grad_norm": 0.7927397844502478, + "learning_rate": 7.970023913709652e-06, + "loss": 1.0741, + "step": 224 + }, + { + "epoch": 0.6593406593406593, + "grad_norm": 0.8105334583256553, + "learning_rate": 7.95060845029711e-06, + "loss": 1.0406, + "step": 225 + }, + { + "epoch": 0.6622710622710622, + "grad_norm": 0.819096758560569, + "learning_rate": 7.931124482984802e-06, + "loss": 1.0952, + "step": 226 + }, + { + "epoch": 0.6652014652014652, + "grad_norm": 0.7225216871665473, + "learning_rate": 7.911572464129515e-06, + "loss": 1.0514, + "step": 227 + }, + { + "epoch": 0.6681318681318681, + "grad_norm": 0.8288654068393345, + "learning_rate": 7.891952847667973e-06, + "loss": 1.1164, + "step": 228 + }, + { + "epoch": 0.671062271062271, + "grad_norm": 0.6759229558936937, + "learning_rate": 7.872266089106309e-06, + "loss": 1.0495, + "step": 229 + }, + { + "epoch": 0.673992673992674, + "grad_norm": 0.7292237419206736, + "learning_rate": 7.85251264550948e-06, + "loss": 1.0418, + "step": 230 + }, + { + "epoch": 0.676923076923077, + "grad_norm": 0.8129512318365157, + "learning_rate": 7.832692975490668e-06, + "loss": 1.0824, + "step": 231 + }, + { + "epoch": 0.6798534798534799, + "grad_norm": 0.7437869057766316, + "learning_rate": 7.812807539200622e-06, + "loss": 1.096, + "step": 232 + }, + { + "epoch": 0.6827838827838828, + "grad_norm": 0.7490823810391557, + "learning_rate": 7.792856798316975e-06, + "loss": 1.0763, + "step": 233 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 0.7898358276478571, + "learning_rate": 7.772841216033534e-06, + "loss": 1.0771, + "step": 234 + }, + { + "epoch": 0.6886446886446886, + "grad_norm": 0.6343598620923301, + "learning_rate": 7.752761257049517e-06, + "loss": 1.0322, + "step": 235 + }, + { + "epoch": 0.6915750915750916, + "grad_norm": 0.7433500237046824, + "learning_rate": 7.732617387558769e-06, + "loss": 1.0797, + "step": 236 + }, + { + "epoch": 0.6945054945054945, + "grad_norm": 0.748028902224525, + "learning_rate": 7.712410075238936e-06, + "loss": 1.0574, + "step": 237 + }, + { + "epoch": 0.6974358974358974, + "grad_norm": 0.7738403008694008, + "learning_rate": 7.692139789240611e-06, + "loss": 1.1001, + "step": 238 + }, + { + "epoch": 0.7003663003663003, + "grad_norm": 0.7228116763923162, + "learning_rate": 7.671807000176434e-06, + "loss": 1.0799, + "step": 239 + }, + { + "epoch": 0.7032967032967034, + "grad_norm": 0.7253527243733535, + "learning_rate": 7.651412180110176e-06, + "loss": 1.0707, + "step": 240 + }, + { + "epoch": 0.7062271062271063, + "grad_norm": 0.7348620682799656, + "learning_rate": 7.630955802545768e-06, + "loss": 1.0474, + "step": 241 + }, + { + "epoch": 0.7091575091575092, + "grad_norm": 0.6836835745144434, + "learning_rate": 7.61043834241632e-06, + "loss": 1.0812, + "step": 242 + }, + { + "epoch": 0.7120879120879121, + "grad_norm": 0.6837971336343311, + "learning_rate": 7.589860276073082e-06, + "loss": 1.0473, + "step": 243 + }, + { + "epoch": 0.715018315018315, + "grad_norm": 0.7230704080796915, + "learning_rate": 7.569222081274396e-06, + "loss": 1.1134, + "step": 244 + }, + { + "epoch": 0.717948717948718, + "grad_norm": 0.7127149830099806, + "learning_rate": 7.548524237174595e-06, + "loss": 1.064, + "step": 245 + }, + { + "epoch": 0.7208791208791209, + "grad_norm": 0.7393242144731675, + "learning_rate": 7.527767224312883e-06, + "loss": 1.055, + "step": 246 + }, + { + "epoch": 0.7238095238095238, + "grad_norm": 0.7070740978028522, + "learning_rate": 7.50695152460218e-06, + "loss": 1.0514, + "step": 247 + }, + { + "epoch": 0.7267399267399267, + "grad_norm": 0.6285827841364233, + "learning_rate": 7.4860776213179264e-06, + "loss": 1.0663, + "step": 248 + }, + { + "epoch": 0.7296703296703296, + "grad_norm": 0.722505271587716, + "learning_rate": 7.465145999086874e-06, + "loss": 1.0247, + "step": 249 + }, + { + "epoch": 0.7326007326007326, + "grad_norm": 0.7574691065276626, + "learning_rate": 7.44415714387582e-06, + "loss": 1.0806, + "step": 250 + }, + { + "epoch": 0.7355311355311356, + "grad_norm": 0.6671911844141701, + "learning_rate": 7.4231115429803405e-06, + "loss": 1.0718, + "step": 251 + }, + { + "epoch": 0.7384615384615385, + "grad_norm": 0.6739232630180421, + "learning_rate": 7.4020096850134635e-06, + "loss": 1.054, + "step": 252 + }, + { + "epoch": 0.7413919413919414, + "grad_norm": 0.6816160363185811, + "learning_rate": 7.380852059894331e-06, + "loss": 1.0135, + "step": 253 + }, + { + "epoch": 0.7443223443223443, + "grad_norm": 0.7255625077642197, + "learning_rate": 7.359639158836828e-06, + "loss": 1.0765, + "step": 254 + }, + { + "epoch": 0.7472527472527473, + "grad_norm": 0.7389412886246451, + "learning_rate": 7.3383714743381664e-06, + "loss": 1.0734, + "step": 255 + }, + { + "epoch": 0.7501831501831502, + "grad_norm": 0.6974028362998995, + "learning_rate": 7.317049500167466e-06, + "loss": 1.0225, + "step": 256 + }, + { + "epoch": 0.7531135531135531, + "grad_norm": 0.706462620691865, + "learning_rate": 7.295673731354278e-06, + "loss": 1.0433, + "step": 257 + }, + { + "epoch": 0.756043956043956, + "grad_norm": 0.6939088050227432, + "learning_rate": 7.2742446641770985e-06, + "loss": 1.0283, + "step": 258 + }, + { + "epoch": 0.7589743589743589, + "grad_norm": 0.6738265352397054, + "learning_rate": 7.252762796151843e-06, + "loss": 1.0338, + "step": 259 + }, + { + "epoch": 0.7619047619047619, + "grad_norm": 0.7509847899569063, + "learning_rate": 7.231228626020303e-06, + "loss": 1.0851, + "step": 260 + }, + { + "epoch": 0.7648351648351648, + "grad_norm": 0.7111929689141302, + "learning_rate": 7.209642653738555e-06, + "loss": 1.0467, + "step": 261 + }, + { + "epoch": 0.7677655677655678, + "grad_norm": 0.6700991516973885, + "learning_rate": 7.188005380465365e-06, + "loss": 1.0886, + "step": 262 + }, + { + "epoch": 0.7706959706959707, + "grad_norm": 0.7620776406324662, + "learning_rate": 7.166317308550541e-06, + "loss": 1.0914, + "step": 263 + }, + { + "epoch": 0.7736263736263737, + "grad_norm": 0.6438553939939631, + "learning_rate": 7.144578941523283e-06, + "loss": 1.0139, + "step": 264 + }, + { + "epoch": 0.7765567765567766, + "grad_norm": 0.6627461325502493, + "learning_rate": 7.122790784080482e-06, + "loss": 0.978, + "step": 265 + }, + { + "epoch": 0.7794871794871795, + "grad_norm": 0.7162460216190278, + "learning_rate": 7.10095334207501e-06, + "loss": 1.0949, + "step": 266 + }, + { + "epoch": 0.7824175824175824, + "grad_norm": 0.6889558669766283, + "learning_rate": 7.079067122503969e-06, + "loss": 1.0872, + "step": 267 + }, + { + "epoch": 0.7853479853479853, + "grad_norm": 0.6988411158347763, + "learning_rate": 7.057132633496924e-06, + "loss": 1.0609, + "step": 268 + }, + { + "epoch": 0.7882783882783883, + "grad_norm": 0.8465993660592789, + "learning_rate": 7.03515038430411e-06, + "loss": 1.0754, + "step": 269 + }, + { + "epoch": 0.7912087912087912, + "grad_norm": 0.7433786645831059, + "learning_rate": 7.013120885284599e-06, + "loss": 1.0521, + "step": 270 + }, + { + "epoch": 0.7941391941391941, + "grad_norm": 0.6471784629568724, + "learning_rate": 6.991044647894456e-06, + "loss": 1.0222, + "step": 271 + }, + { + "epoch": 0.7970695970695971, + "grad_norm": 0.7717525416061395, + "learning_rate": 6.968922184674868e-06, + "loss": 1.0598, + "step": 272 + }, + { + "epoch": 0.8, + "grad_norm": 0.7375028451227608, + "learning_rate": 6.946754009240241e-06, + "loss": 1.0879, + "step": 273 + }, + { + "epoch": 0.802930402930403, + "grad_norm": 0.7395816953663372, + "learning_rate": 6.924540636266272e-06, + "loss": 1.0495, + "step": 274 + }, + { + "epoch": 0.8058608058608059, + "grad_norm": 0.6534788778148224, + "learning_rate": 6.902282581478009e-06, + "loss": 1.0484, + "step": 275 + }, + { + "epoch": 0.8087912087912088, + "grad_norm": 0.6831894193923664, + "learning_rate": 6.879980361637865e-06, + "loss": 1.0709, + "step": 276 + }, + { + "epoch": 0.8117216117216117, + "grad_norm": 0.7986776852595957, + "learning_rate": 6.857634494533636e-06, + "loss": 1.0989, + "step": 277 + }, + { + "epoch": 0.8146520146520146, + "grad_norm": 0.7138986323229565, + "learning_rate": 6.835245498966461e-06, + "loss": 1.1026, + "step": 278 + }, + { + "epoch": 0.8175824175824176, + "grad_norm": 0.7650967662850501, + "learning_rate": 6.8128138947387966e-06, + "loss": 1.0906, + "step": 279 + }, + { + "epoch": 0.8205128205128205, + "grad_norm": 0.6705870456445562, + "learning_rate": 6.790340202642333e-06, + "loss": 1.0306, + "step": 280 + }, + { + "epoch": 0.8234432234432234, + "grad_norm": 0.6924029470535028, + "learning_rate": 6.76782494444591e-06, + "loss": 1.0682, + "step": 281 + }, + { + "epoch": 0.8263736263736263, + "grad_norm": 0.7640179483164876, + "learning_rate": 6.7452686428834045e-06, + "loss": 1.0251, + "step": 282 + }, + { + "epoch": 0.8293040293040294, + "grad_norm": 0.6821634834350029, + "learning_rate": 6.722671821641591e-06, + "loss": 1.0446, + "step": 283 + }, + { + "epoch": 0.8322344322344323, + "grad_norm": 0.7451959173472754, + "learning_rate": 6.700035005347983e-06, + "loss": 1.1235, + "step": 284 + }, + { + "epoch": 0.8351648351648352, + "grad_norm": 0.6649020603168196, + "learning_rate": 6.677358719558655e-06, + "loss": 0.9895, + "step": 285 + }, + { + "epoch": 0.8380952380952381, + "grad_norm": 0.710844339419014, + "learning_rate": 6.654643490746042e-06, + "loss": 1.0464, + "step": 286 + }, + { + "epoch": 0.841025641025641, + "grad_norm": 0.6558219608160121, + "learning_rate": 6.6318898462867086e-06, + "loss": 1.0789, + "step": 287 + }, + { + "epoch": 0.843956043956044, + "grad_norm": 0.7103315959032128, + "learning_rate": 6.609098314449116e-06, + "loss": 1.0693, + "step": 288 + }, + { + "epoch": 0.8468864468864469, + "grad_norm": 0.6642938199213403, + "learning_rate": 6.586269424381349e-06, + "loss": 1.0111, + "step": 289 + }, + { + "epoch": 0.8498168498168498, + "grad_norm": 0.6686652730007528, + "learning_rate": 6.563403706098833e-06, + "loss": 1.0493, + "step": 290 + }, + { + "epoch": 0.8527472527472527, + "grad_norm": 0.707080494234748, + "learning_rate": 6.540501690472032e-06, + "loss": 1.0933, + "step": 291 + }, + { + "epoch": 0.8556776556776556, + "grad_norm": 0.8070591206820166, + "learning_rate": 6.517563909214119e-06, + "loss": 1.0598, + "step": 292 + }, + { + "epoch": 0.8586080586080586, + "grad_norm": 0.6443876519255367, + "learning_rate": 6.4945908948686355e-06, + "loss": 1.0436, + "step": 293 + }, + { + "epoch": 0.8615384615384616, + "grad_norm": 0.6358308118659253, + "learning_rate": 6.471583180797121e-06, + "loss": 1.0205, + "step": 294 + }, + { + "epoch": 0.8644688644688645, + "grad_norm": 0.6980748843159623, + "learning_rate": 6.448541301166737e-06, + "loss": 1.0481, + "step": 295 + }, + { + "epoch": 0.8673992673992674, + "grad_norm": 0.721201397418447, + "learning_rate": 6.4254657909378615e-06, + "loss": 1.0533, + "step": 296 + }, + { + "epoch": 0.8703296703296703, + "grad_norm": 0.6902627729970224, + "learning_rate": 6.402357185851669e-06, + "loss": 1.0786, + "step": 297 + }, + { + "epoch": 0.8732600732600733, + "grad_norm": 0.7134804889991092, + "learning_rate": 6.379216022417695e-06, + "loss": 1.0654, + "step": 298 + }, + { + "epoch": 0.8761904761904762, + "grad_norm": 0.6409021568091943, + "learning_rate": 6.3560428379013795e-06, + "loss": 1.0456, + "step": 299 + }, + { + "epoch": 0.8791208791208791, + "grad_norm": 0.7159300653418184, + "learning_rate": 6.332838170311586e-06, + "loss": 1.0795, + "step": 300 + }, + { + "epoch": 0.882051282051282, + "grad_norm": 0.6634422162154384, + "learning_rate": 6.309602558388122e-06, + "loss": 1.0906, + "step": 301 + }, + { + "epoch": 0.884981684981685, + "grad_norm": 0.6748316902652748, + "learning_rate": 6.286336541589224e-06, + "loss": 1.0344, + "step": 302 + }, + { + "epoch": 0.8879120879120879, + "grad_norm": 0.762311805582648, + "learning_rate": 6.263040660079031e-06, + "loss": 1.0328, + "step": 303 + }, + { + "epoch": 0.8908424908424909, + "grad_norm": 0.6567073940057072, + "learning_rate": 6.239715454715054e-06, + "loss": 1.0326, + "step": 304 + }, + { + "epoch": 0.8937728937728938, + "grad_norm": 0.7360191370508475, + "learning_rate": 6.216361467035608e-06, + "loss": 1.0671, + "step": 305 + }, + { + "epoch": 0.8967032967032967, + "grad_norm": 0.6536751414819448, + "learning_rate": 6.192979239247243e-06, + "loss": 1.04, + "step": 306 + }, + { + "epoch": 0.8996336996336997, + "grad_norm": 0.688711476034779, + "learning_rate": 6.169569314212157e-06, + "loss": 1.0425, + "step": 307 + }, + { + "epoch": 0.9025641025641026, + "grad_norm": 0.7128048107980386, + "learning_rate": 6.146132235435591e-06, + "loss": 1.0479, + "step": 308 + }, + { + "epoch": 0.9054945054945055, + "grad_norm": 0.7027292353974949, + "learning_rate": 6.1226685470532125e-06, + "loss": 0.9922, + "step": 309 + }, + { + "epoch": 0.9084249084249084, + "grad_norm": 0.662332343110812, + "learning_rate": 6.099178793818479e-06, + "loss": 1.0564, + "step": 310 + }, + { + "epoch": 0.9113553113553113, + "grad_norm": 0.6578408725334101, + "learning_rate": 6.075663521089994e-06, + "loss": 1.0408, + "step": 311 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 0.7225530601139032, + "learning_rate": 6.0521232748188416e-06, + "loss": 1.0386, + "step": 312 + }, + { + "epoch": 0.9172161172161172, + "grad_norm": 0.6884615641249574, + "learning_rate": 6.028558601535915e-06, + "loss": 1.0422, + "step": 313 + }, + { + "epoch": 0.9201465201465201, + "grad_norm": 0.6237151400719672, + "learning_rate": 6.0049700483392256e-06, + "loss": 0.9923, + "step": 314 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 0.6729617533435394, + "learning_rate": 5.981358162881202e-06, + "loss": 1.0802, + "step": 315 + }, + { + "epoch": 0.926007326007326, + "grad_norm": 0.6283161670002942, + "learning_rate": 5.957723493355977e-06, + "loss": 1.0057, + "step": 316 + }, + { + "epoch": 0.928937728937729, + "grad_norm": 0.7106310162079014, + "learning_rate": 5.9340665884866535e-06, + "loss": 1.0452, + "step": 317 + }, + { + "epoch": 0.9318681318681319, + "grad_norm": 0.6307561472458387, + "learning_rate": 5.910387997512573e-06, + "loss": 1.0678, + "step": 318 + }, + { + "epoch": 0.9347985347985348, + "grad_norm": 0.6660079952439634, + "learning_rate": 5.8866882701765605e-06, + "loss": 1.011, + "step": 319 + }, + { + "epoch": 0.9377289377289377, + "grad_norm": 0.6621973288526832, + "learning_rate": 5.86296795671216e-06, + "loss": 1.0777, + "step": 320 + }, + { + "epoch": 0.9406593406593406, + "grad_norm": 0.6542364386949882, + "learning_rate": 5.839227607830862e-06, + "loss": 1.0337, + "step": 321 + }, + { + "epoch": 0.9435897435897436, + "grad_norm": 0.6839429524505124, + "learning_rate": 5.815467774709314e-06, + "loss": 1.0398, + "step": 322 + }, + { + "epoch": 0.9465201465201465, + "grad_norm": 0.7895999257535952, + "learning_rate": 5.791689008976531e-06, + "loss": 1.0965, + "step": 323 + }, + { + "epoch": 0.9494505494505494, + "grad_norm": 0.7076974699228543, + "learning_rate": 5.767891862701081e-06, + "loss": 1.0508, + "step": 324 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.6560722941227365, + "learning_rate": 5.744076888378272e-06, + "loss": 1.0347, + "step": 325 + }, + { + "epoch": 0.9553113553113554, + "grad_norm": 0.7153841740442476, + "learning_rate": 5.7202446389173225e-06, + "loss": 1.048, + "step": 326 + }, + { + "epoch": 0.9582417582417583, + "grad_norm": 0.6840752335867581, + "learning_rate": 5.696395667628526e-06, + "loss": 1.0706, + "step": 327 + }, + { + "epoch": 0.9611721611721612, + "grad_norm": 0.7367756826844876, + "learning_rate": 5.672530528210405e-06, + "loss": 1.0631, + "step": 328 + }, + { + "epoch": 0.9641025641025641, + "grad_norm": 0.739456837074914, + "learning_rate": 5.648649774736855e-06, + "loss": 1.0655, + "step": 329 + }, + { + "epoch": 0.967032967032967, + "grad_norm": 0.7489432381420208, + "learning_rate": 5.624753961644281e-06, + "loss": 1.0103, + "step": 330 + }, + { + "epoch": 0.96996336996337, + "grad_norm": 0.9214056170953451, + "learning_rate": 5.600843643718728e-06, + "loss": 1.0586, + "step": 331 + }, + { + "epoch": 0.9728937728937729, + "grad_norm": 0.6946485880358703, + "learning_rate": 5.57691937608299e-06, + "loss": 1.0673, + "step": 332 + }, + { + "epoch": 0.9758241758241758, + "grad_norm": 0.6814888711345364, + "learning_rate": 5.552981714183738e-06, + "loss": 1.0312, + "step": 333 + }, + { + "epoch": 0.9787545787545787, + "grad_norm": 0.7851305269555706, + "learning_rate": 5.529031213778615e-06, + "loss": 1.0929, + "step": 334 + }, + { + "epoch": 0.9816849816849816, + "grad_norm": 0.698622025095992, + "learning_rate": 5.505068430923327e-06, + "loss": 1.0721, + "step": 335 + }, + { + "epoch": 0.9846153846153847, + "grad_norm": 0.6534426116496003, + "learning_rate": 5.481093921958749e-06, + "loss": 1.0953, + "step": 336 + }, + { + "epoch": 0.9875457875457876, + "grad_norm": 0.6625287425989687, + "learning_rate": 5.457108243497993e-06, + "loss": 0.9991, + "step": 337 + }, + { + "epoch": 0.9904761904761905, + "grad_norm": 0.701327887950468, + "learning_rate": 5.433111952413496e-06, + "loss": 1.0658, + "step": 338 + }, + { + "epoch": 0.9934065934065934, + "grad_norm": 0.6716839301392191, + "learning_rate": 5.409105605824082e-06, + "loss": 1.0413, + "step": 339 + }, + { + "epoch": 0.9963369963369964, + "grad_norm": 0.7609181677574756, + "learning_rate": 5.385089761082039e-06, + "loss": 0.9849, + "step": 340 + }, + { + "epoch": 0.9992673992673993, + "grad_norm": 0.6928967498595602, + "learning_rate": 5.361064975760166e-06, + "loss": 1.025, + "step": 341 + }, + { + "epoch": 1.0021978021978022, + "grad_norm": 0.6340076640585811, + "learning_rate": 5.3370318076388405e-06, + "loss": 1.06, + "step": 342 + }, + { + "epoch": 1.005128205128205, + "grad_norm": 0.7147887444305453, + "learning_rate": 5.3129908146930565e-06, + "loss": 1.0402, + "step": 343 + }, + { + "epoch": 1.008058608058608, + "grad_norm": 0.7081726636264288, + "learning_rate": 5.288942555079479e-06, + "loss": 1.0214, + "step": 344 + }, + { + "epoch": 1.010989010989011, + "grad_norm": 0.6931248026006267, + "learning_rate": 5.264887587123483e-06, + "loss": 1.0355, + "step": 345 + }, + { + "epoch": 1.0139194139194139, + "grad_norm": 0.6614729802440827, + "learning_rate": 5.240826469306187e-06, + "loss": 1.0336, + "step": 346 + }, + { + "epoch": 1.0168498168498168, + "grad_norm": 0.72843052045215, + "learning_rate": 5.216759760251493e-06, + "loss": 1.0959, + "step": 347 + }, + { + "epoch": 1.002930402930403, + "grad_norm": 0.9856684936564257, + "learning_rate": 5.1926880187131134e-06, + "loss": 0.8665, + "step": 348 + }, + { + "epoch": 1.0058608058608058, + "grad_norm": 0.8924138774764413, + "learning_rate": 5.168611803561599e-06, + "loss": 0.8482, + "step": 349 + }, + { + "epoch": 1.0087912087912088, + "grad_norm": 0.80619364515724, + "learning_rate": 5.144531673771364e-06, + "loss": 0.904, + "step": 350 + }, + { + "epoch": 1.0117216117216117, + "grad_norm": 0.8307330430576558, + "learning_rate": 5.1204481884077075e-06, + "loss": 0.824, + "step": 351 + }, + { + "epoch": 1.0146520146520146, + "grad_norm": 0.8223666575265466, + "learning_rate": 5.096361906613836e-06, + "loss": 0.8194, + "step": 352 + }, + { + "epoch": 1.0175824175824175, + "grad_norm": 0.882567742233647, + "learning_rate": 5.072273387597877e-06, + "loss": 0.8007, + "step": 353 + }, + { + "epoch": 1.0205128205128204, + "grad_norm": 0.852155527533341, + "learning_rate": 5.048183190619904e-06, + "loss": 0.8299, + "step": 354 + }, + { + "epoch": 1.0234432234432234, + "grad_norm": 0.6698748000250594, + "learning_rate": 5.0240918749789455e-06, + "loss": 0.8609, + "step": 355 + }, + { + "epoch": 1.0263736263736263, + "grad_norm": 0.740093168594684, + "learning_rate": 5e-06, + "loss": 0.8179, + "step": 356 + }, + { + "epoch": 1.0293040293040292, + "grad_norm": 0.9241467975042712, + "learning_rate": 4.975908125021055e-06, + "loss": 0.8817, + "step": 357 + }, + { + "epoch": 1.0322344322344323, + "grad_norm": 0.8242101021976431, + "learning_rate": 4.951816809380098e-06, + "loss": 0.8461, + "step": 358 + }, + { + "epoch": 1.0351648351648353, + "grad_norm": 0.6778411446141664, + "learning_rate": 4.9277266124021245e-06, + "loss": 0.8251, + "step": 359 + }, + { + "epoch": 1.0380952380952382, + "grad_norm": 0.8162644593862449, + "learning_rate": 4.903638093386167e-06, + "loss": 0.8391, + "step": 360 + }, + { + "epoch": 1.041025641025641, + "grad_norm": 0.6831525068683001, + "learning_rate": 4.879551811592295e-06, + "loss": 0.8454, + "step": 361 + }, + { + "epoch": 1.043956043956044, + "grad_norm": 0.7366137812958817, + "learning_rate": 4.855468326228638e-06, + "loss": 0.8383, + "step": 362 + }, + { + "epoch": 1.046886446886447, + "grad_norm": 0.7091031619746667, + "learning_rate": 4.831388196438402e-06, + "loss": 0.826, + "step": 363 + }, + { + "epoch": 1.0498168498168499, + "grad_norm": 0.6708047011362692, + "learning_rate": 4.807311981286888e-06, + "loss": 0.7795, + "step": 364 + }, + { + "epoch": 1.0527472527472528, + "grad_norm": 0.7189951484271022, + "learning_rate": 4.783240239748509e-06, + "loss": 0.8212, + "step": 365 + }, + { + "epoch": 1.0556776556776557, + "grad_norm": 0.6929263617105769, + "learning_rate": 4.7591735306938144e-06, + "loss": 0.8404, + "step": 366 + }, + { + "epoch": 1.0586080586080586, + "grad_norm": 0.7090536517310597, + "learning_rate": 4.7351124128765185e-06, + "loss": 0.7743, + "step": 367 + }, + { + "epoch": 1.0615384615384615, + "grad_norm": 0.7200636917743535, + "learning_rate": 4.711057444920522e-06, + "loss": 0.8388, + "step": 368 + }, + { + "epoch": 1.0644688644688645, + "grad_norm": 0.710980769195996, + "learning_rate": 4.687009185306945e-06, + "loss": 0.8322, + "step": 369 + }, + { + "epoch": 1.0673992673992674, + "grad_norm": 0.6240248501038852, + "learning_rate": 4.662968192361161e-06, + "loss": 0.7604, + "step": 370 + }, + { + "epoch": 1.0703296703296703, + "grad_norm": 0.6841882570704744, + "learning_rate": 4.6389350242398354e-06, + "loss": 0.8292, + "step": 371 + }, + { + "epoch": 1.0732600732600732, + "grad_norm": 0.6356314258852013, + "learning_rate": 4.614910238917963e-06, + "loss": 0.8112, + "step": 372 + }, + { + "epoch": 1.0761904761904761, + "grad_norm": 0.6777660171190972, + "learning_rate": 4.59089439417592e-06, + "loss": 0.8422, + "step": 373 + }, + { + "epoch": 1.079120879120879, + "grad_norm": 0.7302941927547204, + "learning_rate": 4.5668880475865074e-06, + "loss": 0.844, + "step": 374 + }, + { + "epoch": 1.082051282051282, + "grad_norm": 0.7403000182698058, + "learning_rate": 4.542891756502008e-06, + "loss": 0.8252, + "step": 375 + }, + { + "epoch": 1.084981684981685, + "grad_norm": 0.6270811120486014, + "learning_rate": 4.518906078041252e-06, + "loss": 0.8351, + "step": 376 + }, + { + "epoch": 1.0879120879120878, + "grad_norm": 0.6128334495594291, + "learning_rate": 4.494931569076674e-06, + "loss": 0.7953, + "step": 377 + }, + { + "epoch": 1.0908424908424907, + "grad_norm": 0.6647669320211729, + "learning_rate": 4.4709687862213866e-06, + "loss": 0.8271, + "step": 378 + }, + { + "epoch": 1.0937728937728939, + "grad_norm": 0.6015242764545017, + "learning_rate": 4.447018285816263e-06, + "loss": 0.7774, + "step": 379 + }, + { + "epoch": 1.0967032967032968, + "grad_norm": 0.6850007287026474, + "learning_rate": 4.423080623917012e-06, + "loss": 0.8323, + "step": 380 + }, + { + "epoch": 1.0996336996336997, + "grad_norm": 0.7137348622498122, + "learning_rate": 4.399156356281274e-06, + "loss": 0.8385, + "step": 381 + }, + { + "epoch": 1.1025641025641026, + "grad_norm": 0.6268533396942314, + "learning_rate": 4.3752460383557195e-06, + "loss": 0.7795, + "step": 382 + }, + { + "epoch": 1.1054945054945056, + "grad_norm": 0.6553350909418485, + "learning_rate": 4.351350225263147e-06, + "loss": 0.7732, + "step": 383 + }, + { + "epoch": 1.1084249084249085, + "grad_norm": 0.7239076024187311, + "learning_rate": 4.327469471789597e-06, + "loss": 0.7988, + "step": 384 + }, + { + "epoch": 1.1113553113553114, + "grad_norm": 0.6603332573386838, + "learning_rate": 4.303604332371476e-06, + "loss": 0.8234, + "step": 385 + }, + { + "epoch": 1.1142857142857143, + "grad_norm": 0.6836658490120479, + "learning_rate": 4.27975536108268e-06, + "loss": 0.8262, + "step": 386 + }, + { + "epoch": 1.1172161172161172, + "grad_norm": 0.6782684413264028, + "learning_rate": 4.2559231116217305e-06, + "loss": 0.8224, + "step": 387 + }, + { + "epoch": 1.1201465201465202, + "grad_norm": 0.6700675127409191, + "learning_rate": 4.2321081372989195e-06, + "loss": 0.7931, + "step": 388 + }, + { + "epoch": 1.123076923076923, + "grad_norm": 0.6991274806941559, + "learning_rate": 4.208310991023469e-06, + "loss": 0.8165, + "step": 389 + }, + { + "epoch": 1.126007326007326, + "grad_norm": 0.6764322388131341, + "learning_rate": 4.184532225290687e-06, + "loss": 0.7884, + "step": 390 + }, + { + "epoch": 1.128937728937729, + "grad_norm": 0.6640121837001143, + "learning_rate": 4.16077239216914e-06, + "loss": 0.8233, + "step": 391 + }, + { + "epoch": 1.1318681318681318, + "grad_norm": 0.6910556702389824, + "learning_rate": 4.137032043287841e-06, + "loss": 0.8162, + "step": 392 + }, + { + "epoch": 1.1347985347985348, + "grad_norm": 0.6709419218301458, + "learning_rate": 4.11331172982344e-06, + "loss": 0.7851, + "step": 393 + }, + { + "epoch": 1.1377289377289377, + "grad_norm": 0.6928725094292173, + "learning_rate": 4.089612002487428e-06, + "loss": 0.7892, + "step": 394 + }, + { + "epoch": 1.1406593406593406, + "grad_norm": 0.6562007454380986, + "learning_rate": 4.065933411513349e-06, + "loss": 0.8029, + "step": 395 + }, + { + "epoch": 1.1435897435897435, + "grad_norm": 0.6548393048781599, + "learning_rate": 4.042276506644024e-06, + "loss": 0.8241, + "step": 396 + }, + { + "epoch": 1.1465201465201464, + "grad_norm": 0.6896835546766348, + "learning_rate": 4.018641837118799e-06, + "loss": 0.8635, + "step": 397 + }, + { + "epoch": 1.1494505494505494, + "grad_norm": 0.7120720872162148, + "learning_rate": 3.995029951660777e-06, + "loss": 0.8453, + "step": 398 + }, + { + "epoch": 1.1523809523809523, + "grad_norm": 0.675423559668541, + "learning_rate": 3.971441398464088e-06, + "loss": 0.8502, + "step": 399 + }, + { + "epoch": 1.1553113553113552, + "grad_norm": 0.6666783325846857, + "learning_rate": 3.94787672518116e-06, + "loss": 0.764, + "step": 400 + }, + { + "epoch": 1.1582417582417581, + "grad_norm": 0.6792452894265647, + "learning_rate": 3.924336478910007e-06, + "loss": 0.837, + "step": 401 + }, + { + "epoch": 1.1611721611721613, + "grad_norm": 0.6695779092340519, + "learning_rate": 3.900821206181521e-06, + "loss": 0.8553, + "step": 402 + }, + { + "epoch": 1.1641025641025642, + "grad_norm": 0.6245571473231076, + "learning_rate": 3.8773314529467875e-06, + "loss": 0.8628, + "step": 403 + }, + { + "epoch": 1.167032967032967, + "grad_norm": 0.6270158605449955, + "learning_rate": 3.853867764564409e-06, + "loss": 0.7977, + "step": 404 + }, + { + "epoch": 1.16996336996337, + "grad_norm": 0.6407892922208382, + "learning_rate": 3.830430685787844e-06, + "loss": 0.7771, + "step": 405 + }, + { + "epoch": 1.172893772893773, + "grad_norm": 0.6223015359803294, + "learning_rate": 3.8070207607527587e-06, + "loss": 0.8103, + "step": 406 + }, + { + "epoch": 1.1758241758241759, + "grad_norm": 0.7174693913851876, + "learning_rate": 3.7836385329643933e-06, + "loss": 0.8235, + "step": 407 + }, + { + "epoch": 1.1787545787545788, + "grad_norm": 0.6872440597677548, + "learning_rate": 3.760284545284947e-06, + "loss": 0.8005, + "step": 408 + }, + { + "epoch": 1.1816849816849817, + "grad_norm": 0.653437969039063, + "learning_rate": 3.7369593399209704e-06, + "loss": 0.8539, + "step": 409 + }, + { + "epoch": 1.1846153846153846, + "grad_norm": 0.6473384600035247, + "learning_rate": 3.7136634584107787e-06, + "loss": 0.862, + "step": 410 + }, + { + "epoch": 1.1875457875457875, + "grad_norm": 0.6751113614880868, + "learning_rate": 3.6903974416118786e-06, + "loss": 0.8373, + "step": 411 + }, + { + "epoch": 1.1904761904761905, + "grad_norm": 0.717978123237241, + "learning_rate": 3.6671618296884147e-06, + "loss": 0.8654, + "step": 412 + }, + { + "epoch": 1.1934065934065934, + "grad_norm": 0.6512581688639196, + "learning_rate": 3.6439571620986213e-06, + "loss": 0.7892, + "step": 413 + }, + { + "epoch": 1.1963369963369963, + "grad_norm": 0.6777361292192612, + "learning_rate": 3.620783977582305e-06, + "loss": 0.8695, + "step": 414 + }, + { + "epoch": 1.1992673992673992, + "grad_norm": 0.7516797430775108, + "learning_rate": 3.5976428141483326e-06, + "loss": 0.7964, + "step": 415 + }, + { + "epoch": 1.2021978021978021, + "grad_norm": 0.6227714905254692, + "learning_rate": 3.5745342090621406e-06, + "loss": 0.8005, + "step": 416 + }, + { + "epoch": 1.205128205128205, + "grad_norm": 0.7119874654244905, + "learning_rate": 3.551458698833265e-06, + "loss": 0.8489, + "step": 417 + }, + { + "epoch": 1.208058608058608, + "grad_norm": 0.6742791369516543, + "learning_rate": 3.528416819202881e-06, + "loss": 0.8181, + "step": 418 + }, + { + "epoch": 1.210989010989011, + "grad_norm": 0.6685215351425255, + "learning_rate": 3.5054091051313666e-06, + "loss": 0.8546, + "step": 419 + }, + { + "epoch": 1.2139194139194138, + "grad_norm": 0.6446298190079693, + "learning_rate": 3.4824360907858824e-06, + "loss": 0.8238, + "step": 420 + }, + { + "epoch": 1.216849816849817, + "grad_norm": 0.6752602907676645, + "learning_rate": 3.45949830952797e-06, + "loss": 0.8587, + "step": 421 + }, + { + "epoch": 1.2197802197802199, + "grad_norm": 0.6536486616817935, + "learning_rate": 3.43659629390117e-06, + "loss": 0.8482, + "step": 422 + }, + { + "epoch": 1.2227106227106228, + "grad_norm": 0.6662928213764988, + "learning_rate": 3.4137305756186543e-06, + "loss": 0.8215, + "step": 423 + }, + { + "epoch": 1.2256410256410257, + "grad_norm": 0.7247368921308519, + "learning_rate": 3.390901685550887e-06, + "loss": 0.8503, + "step": 424 + }, + { + "epoch": 1.2285714285714286, + "grad_norm": 0.7081609071329151, + "learning_rate": 3.3681101537132914e-06, + "loss": 0.7612, + "step": 425 + }, + { + "epoch": 1.2315018315018316, + "grad_norm": 0.6894290798631326, + "learning_rate": 3.3453565092539586e-06, + "loss": 0.8197, + "step": 426 + }, + { + "epoch": 1.2344322344322345, + "grad_norm": 0.6314938401797467, + "learning_rate": 3.3226412804413444e-06, + "loss": 0.8906, + "step": 427 + }, + { + "epoch": 1.2373626373626374, + "grad_norm": 0.700020304527246, + "learning_rate": 3.299964994652017e-06, + "loss": 0.8305, + "step": 428 + }, + { + "epoch": 1.2402930402930403, + "grad_norm": 0.6672421569303719, + "learning_rate": 3.2773281783584104e-06, + "loss": 0.7996, + "step": 429 + }, + { + "epoch": 1.2432234432234432, + "grad_norm": 0.6523896330990022, + "learning_rate": 3.254731357116597e-06, + "loss": 0.8177, + "step": 430 + }, + { + "epoch": 1.2461538461538462, + "grad_norm": 0.7438977310052812, + "learning_rate": 3.232175055554091e-06, + "loss": 0.7981, + "step": 431 + }, + { + "epoch": 1.249084249084249, + "grad_norm": 0.6616594726095909, + "learning_rate": 3.209659797357669e-06, + "loss": 0.858, + "step": 432 + }, + { + "epoch": 1.252014652014652, + "grad_norm": 0.6357225071174895, + "learning_rate": 3.187186105261205e-06, + "loss": 0.7919, + "step": 433 + }, + { + "epoch": 1.254945054945055, + "grad_norm": 0.6645514669695376, + "learning_rate": 3.16475450103354e-06, + "loss": 0.8445, + "step": 434 + }, + { + "epoch": 1.2578754578754578, + "grad_norm": 0.580411587537406, + "learning_rate": 3.1423655054663665e-06, + "loss": 0.8118, + "step": 435 + }, + { + "epoch": 1.2608058608058608, + "grad_norm": 0.6502200181839426, + "learning_rate": 3.1200196383621363e-06, + "loss": 0.8128, + "step": 436 + }, + { + "epoch": 1.2637362637362637, + "grad_norm": 0.7118079489594058, + "learning_rate": 3.0977174185219937e-06, + "loss": 0.8457, + "step": 437 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 0.6399422284684246, + "learning_rate": 3.0754593637337276e-06, + "loss": 0.8285, + "step": 438 + }, + { + "epoch": 1.2695970695970695, + "grad_norm": 0.6476474872969088, + "learning_rate": 3.05324599075976e-06, + "loss": 0.8328, + "step": 439 + }, + { + "epoch": 1.2725274725274724, + "grad_norm": 0.6527054020963707, + "learning_rate": 3.0310778153251325e-06, + "loss": 0.8183, + "step": 440 + }, + { + "epoch": 1.2754578754578754, + "grad_norm": 0.6892021510002114, + "learning_rate": 3.0089553521055455e-06, + "loss": 0.8117, + "step": 441 + }, + { + "epoch": 1.2783882783882783, + "grad_norm": 0.6938622596098429, + "learning_rate": 2.986879114715403e-06, + "loss": 0.7997, + "step": 442 + }, + { + "epoch": 1.2813186813186812, + "grad_norm": 0.6760167477421419, + "learning_rate": 2.964849615695892e-06, + "loss": 0.78, + "step": 443 + }, + { + "epoch": 1.2842490842490841, + "grad_norm": 0.6630159721868387, + "learning_rate": 2.9428673665030772e-06, + "loss": 0.7888, + "step": 444 + }, + { + "epoch": 1.287179487179487, + "grad_norm": 0.6422717314512306, + "learning_rate": 2.9209328774960327e-06, + "loss": 0.7959, + "step": 445 + }, + { + "epoch": 1.2901098901098902, + "grad_norm": 0.6049124816690329, + "learning_rate": 2.899046657924992e-06, + "loss": 0.8261, + "step": 446 + }, + { + "epoch": 1.293040293040293, + "grad_norm": 0.6538410790869666, + "learning_rate": 2.8772092159195193e-06, + "loss": 0.7815, + "step": 447 + }, + { + "epoch": 1.295970695970696, + "grad_norm": 0.6674785244689053, + "learning_rate": 2.8554210584767188e-06, + "loss": 0.844, + "step": 448 + }, + { + "epoch": 1.298901098901099, + "grad_norm": 0.634998782888407, + "learning_rate": 2.8336826914494607e-06, + "loss": 0.8069, + "step": 449 + }, + { + "epoch": 1.3018315018315019, + "grad_norm": 0.6837741896832135, + "learning_rate": 2.8119946195346375e-06, + "loss": 0.8087, + "step": 450 + }, + { + "epoch": 1.3047619047619048, + "grad_norm": 0.622038243341648, + "learning_rate": 2.790357346261445e-06, + "loss": 0.8064, + "step": 451 + }, + { + "epoch": 1.3076923076923077, + "grad_norm": 0.632121822028228, + "learning_rate": 2.7687713739796972e-06, + "loss": 0.8385, + "step": 452 + }, + { + "epoch": 1.3106227106227106, + "grad_norm": 0.660621163031169, + "learning_rate": 2.7472372038481575e-06, + "loss": 0.8076, + "step": 453 + }, + { + "epoch": 1.3135531135531135, + "grad_norm": 0.6456025313112773, + "learning_rate": 2.725755335822903e-06, + "loss": 0.8417, + "step": 454 + }, + { + "epoch": 1.3164835164835165, + "grad_norm": 0.6339639780226668, + "learning_rate": 2.704326268645724e-06, + "loss": 0.7822, + "step": 455 + }, + { + "epoch": 1.3194139194139194, + "grad_norm": 0.679968992896686, + "learning_rate": 2.6829504998325352e-06, + "loss": 0.8068, + "step": 456 + }, + { + "epoch": 1.3223443223443223, + "grad_norm": 0.6865250171224553, + "learning_rate": 2.661628525661835e-06, + "loss": 0.8148, + "step": 457 + }, + { + "epoch": 1.3252747252747252, + "grad_norm": 0.6684828897934624, + "learning_rate": 2.6403608411631744e-06, + "loss": 0.8542, + "step": 458 + }, + { + "epoch": 1.3282051282051281, + "grad_norm": 0.6758098656078194, + "learning_rate": 2.61914794010567e-06, + "loss": 0.7891, + "step": 459 + }, + { + "epoch": 1.331135531135531, + "grad_norm": 0.6302841257314736, + "learning_rate": 2.5979903149865386e-06, + "loss": 0.8028, + "step": 460 + }, + { + "epoch": 1.334065934065934, + "grad_norm": 0.6195066900214882, + "learning_rate": 2.5768884570196615e-06, + "loss": 0.8105, + "step": 461 + }, + { + "epoch": 1.3369963369963371, + "grad_norm": 0.6527285262735081, + "learning_rate": 2.555842856124182e-06, + "loss": 0.8416, + "step": 462 + }, + { + "epoch": 1.33992673992674, + "grad_norm": 0.6844711291773308, + "learning_rate": 2.5348540009131283e-06, + "loss": 0.8731, + "step": 463 + }, + { + "epoch": 1.342857142857143, + "grad_norm": 0.6917219880994996, + "learning_rate": 2.513922378682075e-06, + "loss": 0.8617, + "step": 464 + }, + { + "epoch": 1.345787545787546, + "grad_norm": 0.7505357241697591, + "learning_rate": 2.4930484753978224e-06, + "loss": 0.8734, + "step": 465 + }, + { + "epoch": 1.3487179487179488, + "grad_norm": 0.6280358853585012, + "learning_rate": 2.472232775687119e-06, + "loss": 0.8073, + "step": 466 + }, + { + "epoch": 1.3516483516483517, + "grad_norm": 0.6357985057852017, + "learning_rate": 2.4514757628254075e-06, + "loss": 0.8643, + "step": 467 + }, + { + "epoch": 1.3545787545787547, + "grad_norm": 0.618985879452095, + "learning_rate": 2.4307779187256064e-06, + "loss": 0.8588, + "step": 468 + }, + { + "epoch": 1.3575091575091576, + "grad_norm": 0.6816901320558723, + "learning_rate": 2.4101397239269202e-06, + "loss": 0.8974, + "step": 469 + }, + { + "epoch": 1.3604395604395605, + "grad_norm": 0.6569693608830611, + "learning_rate": 2.389561657583681e-06, + "loss": 0.847, + "step": 470 + }, + { + "epoch": 1.3633699633699634, + "grad_norm": 0.6722649284724962, + "learning_rate": 2.3690441974542322e-06, + "loss": 0.8382, + "step": 471 + }, + { + "epoch": 1.3663003663003663, + "grad_norm": 0.6738703659901211, + "learning_rate": 2.3485878198898253e-06, + "loss": 0.8096, + "step": 472 + }, + { + "epoch": 1.3692307692307693, + "grad_norm": 0.6199220534522856, + "learning_rate": 2.3281929998235664e-06, + "loss": 0.8546, + "step": 473 + }, + { + "epoch": 1.3721611721611722, + "grad_norm": 0.637774103707959, + "learning_rate": 2.30786021075939e-06, + "loss": 0.8496, + "step": 474 + }, + { + "epoch": 1.375091575091575, + "grad_norm": 0.6745414429909864, + "learning_rate": 2.287589924761065e-06, + "loss": 0.8613, + "step": 475 + }, + { + "epoch": 1.378021978021978, + "grad_norm": 0.6720281480914894, + "learning_rate": 2.2673826124412314e-06, + "loss": 0.7859, + "step": 476 + }, + { + "epoch": 1.380952380952381, + "grad_norm": 0.7515360163059726, + "learning_rate": 2.247238742950483e-06, + "loss": 0.8675, + "step": 477 + }, + { + "epoch": 1.3838827838827839, + "grad_norm": 0.6457379923431958, + "learning_rate": 2.2271587839664673e-06, + "loss": 0.7631, + "step": 478 + }, + { + "epoch": 1.3868131868131868, + "grad_norm": 0.6854082468341819, + "learning_rate": 2.2071432016830257e-06, + "loss": 0.8032, + "step": 479 + }, + { + "epoch": 1.3897435897435897, + "grad_norm": 0.6599311122434474, + "learning_rate": 2.18719246079938e-06, + "loss": 0.8608, + "step": 480 + }, + { + "epoch": 1.3926739926739926, + "grad_norm": 0.6303175902931171, + "learning_rate": 2.1673070245093324e-06, + "loss": 0.7762, + "step": 481 + }, + { + "epoch": 1.3956043956043955, + "grad_norm": 0.6570693148213644, + "learning_rate": 2.1474873544905204e-06, + "loss": 0.8644, + "step": 482 + }, + { + "epoch": 1.3985347985347985, + "grad_norm": 0.6856939265944957, + "learning_rate": 2.127733910893693e-06, + "loss": 0.8534, + "step": 483 + }, + { + "epoch": 1.4014652014652014, + "grad_norm": 0.6239443590408076, + "learning_rate": 2.108047152332028e-06, + "loss": 0.8024, + "step": 484 + }, + { + "epoch": 1.4043956043956043, + "grad_norm": 0.7125717109839642, + "learning_rate": 2.0884275358704863e-06, + "loss": 0.8313, + "step": 485 + }, + { + "epoch": 1.4073260073260072, + "grad_norm": 0.6724094211474902, + "learning_rate": 2.0688755170152e-06, + "loss": 0.8008, + "step": 486 + }, + { + "epoch": 1.4102564102564101, + "grad_norm": 0.6349895866455858, + "learning_rate": 2.0493915497028933e-06, + "loss": 0.7776, + "step": 487 + }, + { + "epoch": 1.413186813186813, + "grad_norm": 0.638362138133294, + "learning_rate": 2.029976086290347e-06, + "loss": 0.833, + "step": 488 + }, + { + "epoch": 1.4161172161172162, + "grad_norm": 0.654564338267121, + "learning_rate": 2.0106295775439018e-06, + "loss": 0.8209, + "step": 489 + }, + { + "epoch": 1.4190476190476191, + "grad_norm": 0.6357815141984277, + "learning_rate": 1.9913524726289784e-06, + "loss": 0.8488, + "step": 490 + }, + { + "epoch": 1.421978021978022, + "grad_norm": 0.684936203470749, + "learning_rate": 1.972145219099662e-06, + "loss": 0.8553, + "step": 491 + }, + { + "epoch": 1.424908424908425, + "grad_norm": 0.7598638244867787, + "learning_rate": 1.9530082628883058e-06, + "loss": 0.792, + "step": 492 + }, + { + "epoch": 1.4278388278388279, + "grad_norm": 0.6233735788941451, + "learning_rate": 1.9339420482951794e-06, + "loss": 0.8084, + "step": 493 + }, + { + "epoch": 1.4307692307692308, + "grad_norm": 0.7179491144851546, + "learning_rate": 1.914947017978153e-06, + "loss": 0.8926, + "step": 494 + }, + { + "epoch": 1.4336996336996337, + "grad_norm": 0.6555466049636046, + "learning_rate": 1.896023612942422e-06, + "loss": 0.8343, + "step": 495 + }, + { + "epoch": 1.4366300366300366, + "grad_norm": 0.645826989084998, + "learning_rate": 1.8771722725302644e-06, + "loss": 0.7947, + "step": 496 + }, + { + "epoch": 1.4395604395604396, + "grad_norm": 0.6482726366595238, + "learning_rate": 1.8583934344108446e-06, + "loss": 0.7992, + "step": 497 + }, + { + "epoch": 1.4424908424908425, + "grad_norm": 0.6376596791410226, + "learning_rate": 1.8396875345700498e-06, + "loss": 0.8553, + "step": 498 + }, + { + "epoch": 1.4454212454212454, + "grad_norm": 0.6903116633272584, + "learning_rate": 1.8210550073003701e-06, + "loss": 0.8495, + "step": 499 + }, + { + "epoch": 1.4483516483516483, + "grad_norm": 0.64069962019644, + "learning_rate": 1.8024962851908106e-06, + "loss": 0.8217, + "step": 500 + }, + { + "epoch": 1.4512820512820512, + "grad_norm": 0.6558252227810359, + "learning_rate": 1.7840117991168566e-06, + "loss": 0.8505, + "step": 501 + }, + { + "epoch": 1.4542124542124542, + "grad_norm": 0.7079418419069864, + "learning_rate": 1.7656019782304602e-06, + "loss": 0.8472, + "step": 502 + }, + { + "epoch": 1.457142857142857, + "grad_norm": 0.705103453747584, + "learning_rate": 1.7472672499500837e-06, + "loss": 0.8183, + "step": 503 + }, + { + "epoch": 1.46007326007326, + "grad_norm": 0.6971164190084713, + "learning_rate": 1.729008039950772e-06, + "loss": 0.8294, + "step": 504 + }, + { + "epoch": 1.4630036630036631, + "grad_norm": 0.6219980479611457, + "learning_rate": 1.7108247721542725e-06, + "loss": 0.8172, + "step": 505 + }, + { + "epoch": 1.465934065934066, + "grad_norm": 0.6616829143182322, + "learning_rate": 1.6927178687191953e-06, + "loss": 0.8351, + "step": 506 + }, + { + "epoch": 1.468864468864469, + "grad_norm": 0.6102810407037182, + "learning_rate": 1.6746877500312054e-06, + "loss": 0.8267, + "step": 507 + }, + { + "epoch": 1.471794871794872, + "grad_norm": 0.6186344377928403, + "learning_rate": 1.656734834693266e-06, + "loss": 0.8209, + "step": 508 + }, + { + "epoch": 1.4747252747252748, + "grad_norm": 0.6028243081632925, + "learning_rate": 1.6388595395159207e-06, + "loss": 0.8012, + "step": 509 + }, + { + "epoch": 1.4776556776556777, + "grad_norm": 0.6358843664209652, + "learning_rate": 1.621062279507617e-06, + "loss": 0.8406, + "step": 510 + }, + { + "epoch": 1.4805860805860807, + "grad_norm": 0.6192894548917643, + "learning_rate": 1.6033434678650684e-06, + "loss": 0.8013, + "step": 511 + }, + { + "epoch": 1.4835164835164836, + "grad_norm": 0.6578313717018371, + "learning_rate": 1.5857035159636625e-06, + "loss": 0.7908, + "step": 512 + }, + { + "epoch": 1.4864468864468865, + "grad_norm": 0.6865547442963875, + "learning_rate": 1.5681428333479104e-06, + "loss": 0.8147, + "step": 513 + }, + { + "epoch": 1.4893772893772894, + "grad_norm": 0.6391710767212113, + "learning_rate": 1.550661827721941e-06, + "loss": 0.8002, + "step": 514 + }, + { + "epoch": 1.4923076923076923, + "grad_norm": 0.6697114193966748, + "learning_rate": 1.5332609049400298e-06, + "loss": 0.8738, + "step": 515 + }, + { + "epoch": 1.4952380952380953, + "grad_norm": 0.6484578925753415, + "learning_rate": 1.5159404689971797e-06, + "loss": 0.8248, + "step": 516 + }, + { + "epoch": 1.4981684981684982, + "grad_norm": 0.6727152397518047, + "learning_rate": 1.4987009220197406e-06, + "loss": 0.8182, + "step": 517 + }, + { + "epoch": 1.501098901098901, + "grad_norm": 0.6222439914425902, + "learning_rate": 1.4815426642560753e-06, + "loss": 0.7844, + "step": 518 + }, + { + "epoch": 1.504029304029304, + "grad_norm": 0.5928698513150097, + "learning_rate": 1.4644660940672628e-06, + "loss": 0.827, + "step": 519 + }, + { + "epoch": 1.506959706959707, + "grad_norm": 0.628709550563957, + "learning_rate": 1.4474716079178541e-06, + "loss": 0.8124, + "step": 520 + }, + { + "epoch": 1.5098901098901099, + "grad_norm": 0.6926451262517697, + "learning_rate": 1.430559600366665e-06, + "loss": 0.801, + "step": 521 + }, + { + "epoch": 1.5128205128205128, + "grad_norm": 0.6320651537323465, + "learning_rate": 1.4137304640576161e-06, + "loss": 0.7778, + "step": 522 + }, + { + "epoch": 1.5157509157509157, + "grad_norm": 0.6302945508979699, + "learning_rate": 1.3969845897106166e-06, + "loss": 0.8078, + "step": 523 + }, + { + "epoch": 1.5186813186813186, + "grad_norm": 0.6487308175519874, + "learning_rate": 1.3803223661124938e-06, + "loss": 0.8335, + "step": 524 + }, + { + "epoch": 1.5216117216117215, + "grad_norm": 0.5828653402715643, + "learning_rate": 1.3637441801079666e-06, + "loss": 0.7563, + "step": 525 + }, + { + "epoch": 1.5245421245421245, + "grad_norm": 0.6433886325941865, + "learning_rate": 1.3472504165906614e-06, + "loss": 0.8079, + "step": 526 + }, + { + "epoch": 1.5274725274725274, + "grad_norm": 0.7453574785666953, + "learning_rate": 1.330841458494183e-06, + "loss": 0.8451, + "step": 527 + }, + { + "epoch": 1.5304029304029303, + "grad_norm": 0.6816287097521287, + "learning_rate": 1.3145176867832165e-06, + "loss": 0.8464, + "step": 528 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.6036938583818248, + "learning_rate": 1.2982794804446858e-06, + "loss": 0.7685, + "step": 529 + }, + { + "epoch": 1.5362637362637361, + "grad_norm": 0.6463872641656683, + "learning_rate": 1.2821272164789544e-06, + "loss": 0.8205, + "step": 530 + }, + { + "epoch": 1.539194139194139, + "grad_norm": 0.6230426878687141, + "learning_rate": 1.266061269891073e-06, + "loss": 0.8272, + "step": 531 + }, + { + "epoch": 1.542124542124542, + "grad_norm": 0.6178970771612478, + "learning_rate": 1.2500820136820735e-06, + "loss": 0.7795, + "step": 532 + }, + { + "epoch": 1.545054945054945, + "grad_norm": 0.6002590996506842, + "learning_rate": 1.2341898188403068e-06, + "loss": 0.7897, + "step": 533 + }, + { + "epoch": 1.547985347985348, + "grad_norm": 0.6434409409742198, + "learning_rate": 1.2183850543328313e-06, + "loss": 0.7969, + "step": 534 + }, + { + "epoch": 1.550915750915751, + "grad_norm": 0.6417718520851279, + "learning_rate": 1.2026680870968494e-06, + "loss": 0.812, + "step": 535 + }, + { + "epoch": 1.5538461538461539, + "grad_norm": 0.6425763487232372, + "learning_rate": 1.187039282031182e-06, + "loss": 0.8454, + "step": 536 + }, + { + "epoch": 1.5567765567765568, + "grad_norm": 0.6049459246679286, + "learning_rate": 1.171499001987802e-06, + "loss": 0.805, + "step": 537 + }, + { + "epoch": 1.5597069597069597, + "grad_norm": 0.6857584110796993, + "learning_rate": 1.156047607763407e-06, + "loss": 0.8451, + "step": 538 + }, + { + "epoch": 1.5626373626373626, + "grad_norm": 0.6802912313005144, + "learning_rate": 1.1406854580910426e-06, + "loss": 0.8573, + "step": 539 + }, + { + "epoch": 1.5655677655677656, + "grad_norm": 0.6283637449982267, + "learning_rate": 1.1254129096317807e-06, + "loss": 0.8468, + "step": 540 + }, + { + "epoch": 1.5684981684981685, + "grad_norm": 0.6999745340643583, + "learning_rate": 1.110230316966427e-06, + "loss": 0.8222, + "step": 541 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.6683547293175401, + "learning_rate": 1.095138032587298e-06, + "loss": 0.8226, + "step": 542 + }, + { + "epoch": 1.5743589743589743, + "grad_norm": 0.6395602646701126, + "learning_rate": 1.0801364068900334e-06, + "loss": 0.7993, + "step": 543 + }, + { + "epoch": 1.5772893772893772, + "grad_norm": 0.666258880747308, + "learning_rate": 1.0652257881654625e-06, + "loss": 0.8272, + "step": 544 + }, + { + "epoch": 1.5802197802197804, + "grad_norm": 0.6081578536350295, + "learning_rate": 1.0504065225915166e-06, + "loss": 0.7984, + "step": 545 + }, + { + "epoch": 1.5831501831501833, + "grad_norm": 0.6964583691213134, + "learning_rate": 1.0356789542251939e-06, + "loss": 0.834, + "step": 546 + }, + { + "epoch": 1.5860805860805862, + "grad_norm": 0.6364703392936907, + "learning_rate": 1.0210434249945677e-06, + "loss": 0.8448, + "step": 547 + }, + { + "epoch": 1.5890109890109891, + "grad_norm": 0.6616419125303836, + "learning_rate": 1.0065002746908532e-06, + "loss": 0.7907, + "step": 548 + }, + { + "epoch": 1.591941391941392, + "grad_norm": 0.6089769062029402, + "learning_rate": 9.92049840960514e-07, + "loss": 0.8168, + "step": 549 + }, + { + "epoch": 1.594871794871795, + "grad_norm": 0.6496974721223905, + "learning_rate": 9.776924592974257e-07, + "loss": 0.7637, + "step": 550 + }, + { + "epoch": 1.597802197802198, + "grad_norm": 0.59655757686596, + "learning_rate": 9.63428463035085e-07, + "loss": 0.7466, + "step": 551 + }, + { + "epoch": 1.6007326007326008, + "grad_norm": 0.6797373377804693, + "learning_rate": 9.492581833388736e-07, + "loss": 0.8493, + "step": 552 + }, + { + "epoch": 1.6036630036630037, + "grad_norm": 0.6540093484659664, + "learning_rate": 9.351819491983671e-07, + "loss": 0.8414, + "step": 553 + }, + { + "epoch": 1.6065934065934067, + "grad_norm": 0.6192544935560744, + "learning_rate": 9.212000874196953e-07, + "loss": 0.7949, + "step": 554 + }, + { + "epoch": 1.6095238095238096, + "grad_norm": 0.6413798335452731, + "learning_rate": 9.073129226179589e-07, + "loss": 0.8367, + "step": 555 + }, + { + "epoch": 1.6124542124542125, + "grad_norm": 0.6793141292931023, + "learning_rate": 8.935207772096904e-07, + "loss": 0.8514, + "step": 556 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 0.6134133075817998, + "learning_rate": 8.798239714053692e-07, + "loss": 0.7971, + "step": 557 + }, + { + "epoch": 1.6183150183150183, + "grad_norm": 0.6483216170375673, + "learning_rate": 8.662228232019876e-07, + "loss": 0.805, + "step": 558 + }, + { + "epoch": 1.6212454212454213, + "grad_norm": 0.6121384170993905, + "learning_rate": 8.527176483756671e-07, + "loss": 0.8169, + "step": 559 + }, + { + "epoch": 1.6241758241758242, + "grad_norm": 0.6068175290937734, + "learning_rate": 8.393087604743283e-07, + "loss": 0.789, + "step": 560 + }, + { + "epoch": 1.627106227106227, + "grad_norm": 0.669787004717848, + "learning_rate": 8.259964708104091e-07, + "loss": 0.8504, + "step": 561 + }, + { + "epoch": 1.63003663003663, + "grad_norm": 0.6283516029134004, + "learning_rate": 8.127810884536402e-07, + "loss": 0.7839, + "step": 562 + }, + { + "epoch": 1.632967032967033, + "grad_norm": 0.6356496138544873, + "learning_rate": 7.996629202238654e-07, + "loss": 0.8213, + "step": 563 + }, + { + "epoch": 1.6358974358974359, + "grad_norm": 0.6409340166886885, + "learning_rate": 7.866422706839239e-07, + "loss": 0.8164, + "step": 564 + }, + { + "epoch": 1.6388278388278388, + "grad_norm": 0.6102325495945577, + "learning_rate": 7.737194421325739e-07, + "loss": 0.8133, + "step": 565 + }, + { + "epoch": 1.6417582417582417, + "grad_norm": 0.6263579578543954, + "learning_rate": 7.60894734597476e-07, + "loss": 0.8469, + "step": 566 + }, + { + "epoch": 1.6446886446886446, + "grad_norm": 0.6110697107228826, + "learning_rate": 7.481684458282273e-07, + "loss": 0.78, + "step": 567 + }, + { + "epoch": 1.6476190476190475, + "grad_norm": 0.623376405697899, + "learning_rate": 7.355408712894508e-07, + "loss": 0.7951, + "step": 568 + }, + { + "epoch": 1.6505494505494505, + "grad_norm": 0.6305238108003742, + "learning_rate": 7.23012304153931e-07, + "loss": 0.8202, + "step": 569 + }, + { + "epoch": 1.6534798534798534, + "grad_norm": 0.6780370401869151, + "learning_rate": 7.105830352958143e-07, + "loss": 0.8482, + "step": 570 + }, + { + "epoch": 1.6564102564102563, + "grad_norm": 0.618188299040753, + "learning_rate": 6.982533532838481e-07, + "loss": 0.8142, + "step": 571 + }, + { + "epoch": 1.6593406593406592, + "grad_norm": 0.6162603773878373, + "learning_rate": 6.86023544374686e-07, + "loss": 0.8058, + "step": 572 + }, + { + "epoch": 1.6622710622710621, + "grad_norm": 0.5988741632305926, + "learning_rate": 6.738938925062399e-07, + "loss": 0.7909, + "step": 573 + }, + { + "epoch": 1.665201465201465, + "grad_norm": 0.6352751552587808, + "learning_rate": 6.618646792910893e-07, + "loss": 0.8238, + "step": 574 + }, + { + "epoch": 1.668131868131868, + "grad_norm": 0.653372698236446, + "learning_rate": 6.499361840099421e-07, + "loss": 0.8317, + "step": 575 + }, + { + "epoch": 1.671062271062271, + "grad_norm": 0.6882980816759717, + "learning_rate": 6.381086836051498e-07, + "loss": 0.8632, + "step": 576 + }, + { + "epoch": 1.673992673992674, + "grad_norm": 0.7343311172190679, + "learning_rate": 6.263824526742807e-07, + "loss": 0.7804, + "step": 577 + }, + { + "epoch": 1.676923076923077, + "grad_norm": 0.6758598163365385, + "learning_rate": 6.147577634637413e-07, + "loss": 0.8298, + "step": 578 + }, + { + "epoch": 1.6798534798534799, + "grad_norm": 0.6132872813883782, + "learning_rate": 6.03234885862457e-07, + "loss": 0.8346, + "step": 579 + }, + { + "epoch": 1.6827838827838828, + "grad_norm": 0.6370171751728234, + "learning_rate": 5.918140873956063e-07, + "loss": 0.8204, + "step": 580 + }, + { + "epoch": 1.6857142857142857, + "grad_norm": 0.6033052646359772, + "learning_rate": 5.804956332184092e-07, + "loss": 0.8057, + "step": 581 + }, + { + "epoch": 1.6886446886446886, + "grad_norm": 0.608486749709341, + "learning_rate": 5.692797861099719e-07, + "loss": 0.802, + "step": 582 + }, + { + "epoch": 1.6915750915750916, + "grad_norm": 0.6205330824144688, + "learning_rate": 5.581668064671847e-07, + "loss": 0.788, + "step": 583 + }, + { + "epoch": 1.6945054945054945, + "grad_norm": 0.6139277202253846, + "learning_rate": 5.471569522986775e-07, + "loss": 0.808, + "step": 584 + }, + { + "epoch": 1.6974358974358974, + "grad_norm": 0.6107771472698981, + "learning_rate": 5.3625047921883e-07, + "loss": 0.7571, + "step": 585 + }, + { + "epoch": 1.7003663003663003, + "grad_norm": 0.63711314046744, + "learning_rate": 5.254476404418341e-07, + "loss": 0.8345, + "step": 586 + }, + { + "epoch": 1.7032967032967035, + "grad_norm": 0.6420503886385798, + "learning_rate": 5.147486867758201e-07, + "loss": 0.7937, + "step": 587 + }, + { + "epoch": 1.7062271062271064, + "grad_norm": 0.6623827998358632, + "learning_rate": 5.041538666170282e-07, + "loss": 0.8063, + "step": 588 + }, + { + "epoch": 1.7091575091575093, + "grad_norm": 0.6311887431693253, + "learning_rate": 4.93663425944047e-07, + "loss": 0.813, + "step": 589 + }, + { + "epoch": 1.7120879120879122, + "grad_norm": 0.6894109166940522, + "learning_rate": 4.832776083120983e-07, + "loss": 0.8284, + "step": 590 + }, + { + "epoch": 1.7150183150183151, + "grad_norm": 0.6344417770301126, + "learning_rate": 4.729966548473841e-07, + "loss": 0.8173, + "step": 591 + }, + { + "epoch": 1.717948717948718, + "grad_norm": 0.6423843551998889, + "learning_rate": 4.628208042414889e-07, + "loss": 0.7984, + "step": 592 + }, + { + "epoch": 1.720879120879121, + "grad_norm": 0.6306008472792638, + "learning_rate": 4.5275029274583715e-07, + "loss": 0.7507, + "step": 593 + }, + { + "epoch": 1.723809523809524, + "grad_norm": 0.6593797867797567, + "learning_rate": 4.4278535416620914e-07, + "loss": 0.828, + "step": 594 + }, + { + "epoch": 1.7267399267399268, + "grad_norm": 0.6859435430823481, + "learning_rate": 4.329262198573109e-07, + "loss": 0.845, + "step": 595 + }, + { + "epoch": 1.7296703296703297, + "grad_norm": 0.668055176901726, + "learning_rate": 4.231731187174065e-07, + "loss": 0.8841, + "step": 596 + }, + { + "epoch": 1.7326007326007327, + "grad_norm": 0.65219754190525, + "learning_rate": 4.1352627718299953e-07, + "loss": 0.7924, + "step": 597 + }, + { + "epoch": 1.7355311355311356, + "grad_norm": 0.6187950860853839, + "learning_rate": 4.0398591922357787e-07, + "loss": 0.7908, + "step": 598 + }, + { + "epoch": 1.7384615384615385, + "grad_norm": 0.6126484385304161, + "learning_rate": 3.945522663364154e-07, + "loss": 0.817, + "step": 599 + }, + { + "epoch": 1.7413919413919414, + "grad_norm": 0.6954213656596391, + "learning_rate": 3.852255375414271e-07, + "loss": 0.8631, + "step": 600 + }, + { + "epoch": 1.7443223443223443, + "grad_norm": 0.5983672679179807, + "learning_rate": 3.760059493760848e-07, + "loss": 0.8439, + "step": 601 + }, + { + "epoch": 1.7472527472527473, + "grad_norm": 0.6702815534109421, + "learning_rate": 3.6689371589039013e-07, + "loss": 0.81, + "step": 602 + }, + { + "epoch": 1.7501831501831502, + "grad_norm": 0.6558071370818314, + "learning_rate": 3.5788904864190346e-07, + "loss": 0.844, + "step": 603 + }, + { + "epoch": 1.753113553113553, + "grad_norm": 0.5830796594516852, + "learning_rate": 3.489921566908372e-07, + "loss": 0.7941, + "step": 604 + }, + { + "epoch": 1.756043956043956, + "grad_norm": 0.5804982345948221, + "learning_rate": 3.4020324659519487e-07, + "loss": 0.8018, + "step": 605 + }, + { + "epoch": 1.758974358974359, + "grad_norm": 0.6178976275532727, + "learning_rate": 3.315225224059809e-07, + "loss": 0.788, + "step": 606 + }, + { + "epoch": 1.7619047619047619, + "grad_norm": 0.6341256961341182, + "learning_rate": 3.2295018566246007e-07, + "loss": 0.7849, + "step": 607 + }, + { + "epoch": 1.7648351648351648, + "grad_norm": 0.6068231207313219, + "learning_rate": 3.1448643538748045e-07, + "loss": 0.802, + "step": 608 + }, + { + "epoch": 1.7677655677655677, + "grad_norm": 0.6249082935288197, + "learning_rate": 3.06131468082852e-07, + "loss": 0.8624, + "step": 609 + }, + { + "epoch": 1.7706959706959706, + "grad_norm": 0.6609124694758736, + "learning_rate": 2.9788547772478416e-07, + "loss": 0.8283, + "step": 610 + }, + { + "epoch": 1.7736263736263735, + "grad_norm": 0.6404899031591438, + "learning_rate": 2.897486557593826e-07, + "loss": 0.8682, + "step": 611 + }, + { + "epoch": 1.7765567765567765, + "grad_norm": 0.6406019381281695, + "learning_rate": 2.817211910982037e-07, + "loss": 0.7961, + "step": 612 + }, + { + "epoch": 1.7794871794871794, + "grad_norm": 0.6050989783512516, + "learning_rate": 2.7380327011387096e-07, + "loss": 0.7676, + "step": 613 + }, + { + "epoch": 1.7824175824175823, + "grad_norm": 0.6309104795298323, + "learning_rate": 2.6599507663574387e-07, + "loss": 0.8164, + "step": 614 + }, + { + "epoch": 1.7853479853479852, + "grad_norm": 0.6893839353883793, + "learning_rate": 2.582967919456547e-07, + "loss": 0.8334, + "step": 615 + }, + { + "epoch": 1.7882783882783881, + "grad_norm": 0.6679193801960764, + "learning_rate": 2.5070859477369645e-07, + "loss": 0.8355, + "step": 616 + }, + { + "epoch": 1.791208791208791, + "grad_norm": 0.597814457069523, + "learning_rate": 2.432306612940738e-07, + "loss": 0.7758, + "step": 617 + }, + { + "epoch": 1.794139194139194, + "grad_norm": 0.673036845863369, + "learning_rate": 2.3586316512101416e-07, + "loss": 0.815, + "step": 618 + }, + { + "epoch": 1.7970695970695971, + "grad_norm": 0.6829081733608965, + "learning_rate": 2.286062773047354e-07, + "loss": 0.7963, + "step": 619 + }, + { + "epoch": 1.8, + "grad_norm": 0.6663649626280688, + "learning_rate": 2.2146016632747624e-07, + "loss": 0.7936, + "step": 620 + }, + { + "epoch": 1.802930402930403, + "grad_norm": 0.6168148435228321, + "learning_rate": 2.1442499809958273e-07, + "loss": 0.8128, + "step": 621 + }, + { + "epoch": 1.8058608058608059, + "grad_norm": 0.6628611198238746, + "learning_rate": 2.0750093595565735e-07, + "loss": 0.8198, + "step": 622 + }, + { + "epoch": 1.8087912087912088, + "grad_norm": 0.6581682221674444, + "learning_rate": 2.006881406507677e-07, + "loss": 0.8204, + "step": 623 + }, + { + "epoch": 1.8117216117216117, + "grad_norm": 0.6698411962701205, + "learning_rate": 1.9398677035671222e-07, + "loss": 0.8394, + "step": 624 + }, + { + "epoch": 1.8146520146520146, + "grad_norm": 0.60519664432508, + "learning_rate": 1.8739698065834977e-07, + "loss": 0.819, + "step": 625 + }, + { + "epoch": 1.8175824175824176, + "grad_norm": 0.6606157797404275, + "learning_rate": 1.8091892454998595e-07, + "loss": 0.7976, + "step": 626 + }, + { + "epoch": 1.8205128205128205, + "grad_norm": 0.599099515272182, + "learning_rate": 1.7455275243182413e-07, + "loss": 0.7949, + "step": 627 + }, + { + "epoch": 1.8234432234432234, + "grad_norm": 0.6403542722056276, + "learning_rate": 1.682986121064689e-07, + "loss": 0.8309, + "step": 628 + }, + { + "epoch": 1.8263736263736263, + "grad_norm": 0.625773880009336, + "learning_rate": 1.6215664877549774e-07, + "loss": 0.8118, + "step": 629 + }, + { + "epoch": 1.8293040293040295, + "grad_norm": 0.6008354758535722, + "learning_rate": 1.561270050360897e-07, + "loss": 0.7888, + "step": 630 + }, + { + "epoch": 1.8322344322344324, + "grad_norm": 0.6697858393941617, + "learning_rate": 1.5020982087771362e-07, + "loss": 0.8638, + "step": 631 + }, + { + "epoch": 1.8351648351648353, + "grad_norm": 0.6349708657677041, + "learning_rate": 1.444052336788787e-07, + "loss": 0.8366, + "step": 632 + }, + { + "epoch": 1.8380952380952382, + "grad_norm": 0.654164591563027, + "learning_rate": 1.3871337820394404e-07, + "loss": 0.8657, + "step": 633 + }, + { + "epoch": 1.8410256410256411, + "grad_norm": 0.6797327121552983, + "learning_rate": 1.33134386599994e-07, + "loss": 0.7898, + "step": 634 + }, + { + "epoch": 1.843956043956044, + "grad_norm": 0.646590540827072, + "learning_rate": 1.2766838839376294e-07, + "loss": 0.7954, + "step": 635 + }, + { + "epoch": 1.846886446886447, + "grad_norm": 0.6614293793168924, + "learning_rate": 1.2231551048863421e-07, + "loss": 0.8019, + "step": 636 + }, + { + "epoch": 1.84981684981685, + "grad_norm": 0.5837576704664322, + "learning_rate": 1.1707587716169033e-07, + "loss": 0.7905, + "step": 637 + }, + { + "epoch": 1.8527472527472528, + "grad_norm": 0.613725137243765, + "learning_rate": 1.1194961006082972e-07, + "loss": 0.8264, + "step": 638 + }, + { + "epoch": 1.8556776556776557, + "grad_norm": 0.5981158519945132, + "learning_rate": 1.0693682820194062e-07, + "loss": 0.8222, + "step": 639 + }, + { + "epoch": 1.8586080586080587, + "grad_norm": 0.5939810574693122, + "learning_rate": 1.0203764796614057e-07, + "loss": 0.7919, + "step": 640 + }, + { + "epoch": 1.8615384615384616, + "grad_norm": 0.6380868259972026, + "learning_rate": 9.725218309707129e-08, + "loss": 0.7641, + "step": 641 + }, + { + "epoch": 1.8644688644688645, + "grad_norm": 0.6206777770699217, + "learning_rate": 9.258054469825972e-08, + "loss": 0.8119, + "step": 642 + }, + { + "epoch": 1.8673992673992674, + "grad_norm": 0.6299527540389706, + "learning_rate": 8.802284123053894e-08, + "loss": 0.8057, + "step": 643 + }, + { + "epoch": 1.8703296703296703, + "grad_norm": 0.608418876698341, + "learning_rate": 8.357917850952802e-08, + "loss": 0.8132, + "step": 644 + }, + { + "epoch": 1.8732600732600733, + "grad_norm": 0.621128299966682, + "learning_rate": 7.924965970317722e-08, + "loss": 0.8163, + "step": 645 + }, + { + "epoch": 1.8761904761904762, + "grad_norm": 0.655957461443404, + "learning_rate": 7.503438532937169e-08, + "loss": 0.8134, + "step": 646 + }, + { + "epoch": 1.879120879120879, + "grad_norm": 0.6306340219082865, + "learning_rate": 7.093345325359935e-08, + "loss": 0.8143, + "step": 647 + }, + { + "epoch": 1.882051282051282, + "grad_norm": 0.6451879681808641, + "learning_rate": 6.694695868667556e-08, + "loss": 0.8332, + "step": 648 + }, + { + "epoch": 1.884981684981685, + "grad_norm": 0.6742284480796318, + "learning_rate": 6.307499418253705e-08, + "loss": 0.8388, + "step": 649 + }, + { + "epoch": 1.8879120879120879, + "grad_norm": 0.6205039778686892, + "learning_rate": 5.9317649636088656e-08, + "loss": 0.8091, + "step": 650 + }, + { + "epoch": 1.8908424908424908, + "grad_norm": 0.6580483136532429, + "learning_rate": 5.5675012281119486e-08, + "loss": 0.8195, + "step": 651 + }, + { + "epoch": 1.8937728937728937, + "grad_norm": 0.6485138656930131, + "learning_rate": 5.214716668827558e-08, + "loss": 0.8119, + "step": 652 + }, + { + "epoch": 1.8967032967032966, + "grad_norm": 0.6013349089531645, + "learning_rate": 4.8734194763098706e-08, + "loss": 0.8124, + "step": 653 + }, + { + "epoch": 1.8996336996336995, + "grad_norm": 0.6442373339607835, + "learning_rate": 4.543617574412185e-08, + "loss": 0.7937, + "step": 654 + }, + { + "epoch": 1.9025641025641025, + "grad_norm": 0.6410125740001468, + "learning_rate": 4.2253186201031135e-08, + "loss": 0.8208, + "step": 655 + }, + { + "epoch": 1.9054945054945054, + "grad_norm": 0.6874802512487508, + "learning_rate": 3.9185300032889005e-08, + "loss": 0.7817, + "step": 656 + }, + { + "epoch": 1.9084249084249083, + "grad_norm": 0.6174107918080415, + "learning_rate": 3.6232588466417195e-08, + "loss": 0.8081, + "step": 657 + }, + { + "epoch": 1.9113553113553112, + "grad_norm": 0.6069693810447614, + "learning_rate": 3.339512005434309e-08, + "loss": 0.8471, + "step": 658 + }, + { + "epoch": 1.9142857142857141, + "grad_norm": 0.6571252565902649, + "learning_rate": 3.0672960673808205e-08, + "loss": 0.7734, + "step": 659 + }, + { + "epoch": 1.917216117216117, + "grad_norm": 0.6346303136090871, + "learning_rate": 2.8066173524839978e-08, + "loss": 0.8176, + "step": 660 + }, + { + "epoch": 1.92014652014652, + "grad_norm": 0.6209889706057522, + "learning_rate": 2.5574819128882933e-08, + "loss": 0.8267, + "step": 661 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 0.7020907999054834, + "learning_rate": 2.319895532739369e-08, + "loss": 0.8058, + "step": 662 + }, + { + "epoch": 1.926007326007326, + "grad_norm": 0.6758173840664298, + "learning_rate": 2.093863728049872e-08, + "loss": 0.8283, + "step": 663 + }, + { + "epoch": 1.928937728937729, + "grad_norm": 0.6385882790006528, + "learning_rate": 1.8793917465713686e-08, + "loss": 0.8188, + "step": 664 + }, + { + "epoch": 1.9318681318681319, + "grad_norm": 0.6438874910701458, + "learning_rate": 1.676484567672554e-08, + "loss": 0.8532, + "step": 665 + }, + { + "epoch": 1.9347985347985348, + "grad_norm": 0.7048203364520762, + "learning_rate": 1.4851469022234e-08, + "loss": 0.822, + "step": 666 + }, + { + "epoch": 1.9377289377289377, + "grad_norm": 0.6322493105177235, + "learning_rate": 1.3053831924861315e-08, + "loss": 0.8076, + "step": 667 + }, + { + "epoch": 1.9406593406593406, + "grad_norm": 0.6610364468359425, + "learning_rate": 1.137197612011809e-08, + "loss": 0.8645, + "step": 668 + }, + { + "epoch": 1.9435897435897436, + "grad_norm": 0.6200915881326821, + "learning_rate": 9.805940655436274e-09, + "loss": 0.7929, + "step": 669 + }, + { + "epoch": 1.9465201465201465, + "grad_norm": 0.6560677264326054, + "learning_rate": 8.35576188926046e-09, + "loss": 0.8775, + "step": 670 + }, + { + "epoch": 1.9494505494505494, + "grad_norm": 0.6631827694048288, + "learning_rate": 7.0214734902074314e-09, + "loss": 0.7636, + "step": 671 + }, + { + "epoch": 1.9523809523809523, + "grad_norm": 0.7712830976554526, + "learning_rate": 5.803106436279571e-09, + "loss": 0.8476, + "step": 672 + }, + { + "epoch": 1.9553113553113555, + "grad_norm": 0.6031190635724369, + "learning_rate": 4.700689014149329e-09, + "loss": 0.7796, + "step": 673 + }, + { + "epoch": 1.9582417582417584, + "grad_norm": 0.5840130736388364, + "learning_rate": 3.71424681850141e-09, + "loss": 0.7594, + "step": 674 + }, + { + "epoch": 1.9611721611721613, + "grad_norm": 0.652441280672971, + "learning_rate": 2.8438027514382517e-09, + "loss": 0.8257, + "step": 675 + }, + { + "epoch": 1.9641025641025642, + "grad_norm": 0.6572849884958872, + "learning_rate": 2.0893770219493347e-09, + "loss": 0.8316, + "step": 676 + }, + { + "epoch": 1.9670329670329672, + "grad_norm": 0.6233858205449643, + "learning_rate": 1.450987145439342e-09, + "loss": 0.8204, + "step": 677 + }, + { + "epoch": 1.96996336996337, + "grad_norm": 0.6331674610122936, + "learning_rate": 9.286479433257e-10, + "loss": 0.7992, + "step": 678 + }, + { + "epoch": 1.972893772893773, + "grad_norm": 0.7160803129297684, + "learning_rate": 5.2237154268997e-10, + "loss": 0.8252, + "step": 679 + }, + { + "epoch": 1.975824175824176, + "grad_norm": 0.7332845117682134, + "learning_rate": 2.321673760002918e-10, + "loss": 0.8221, + "step": 680 + }, + { + "epoch": 1.9787545787545788, + "grad_norm": 0.5902309209793818, + "learning_rate": 5.804218088933944e-11, + "loss": 0.8015, + "step": 681 + }, + { + "epoch": 1.9816849816849818, + "grad_norm": 0.6257835194161055, + "learning_rate": 0.0, + "loss": 0.8461, + "step": 682 + } + ], + "logging_steps": 1, + "max_steps": 682, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 171, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1064222230708224.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}