{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9973915878708834, "eval_steps": 500, "global_step": 2452, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016302575806977503, "grad_norm": 0.7529258131980896, "learning_rate": 0.0, "loss": 0.4678, "step": 1 }, { "epoch": 0.0032605151613955006, "grad_norm": 0.2298814058303833, "learning_rate": 2.1533827903669654e-05, "loss": 0.417, "step": 2 }, { "epoch": 0.00489077274209325, "grad_norm": 0.20245032012462616, "learning_rate": 3.413030972429927e-05, "loss": 0.3776, "step": 3 }, { "epoch": 0.006521030322791001, "grad_norm": 0.1889941245317459, "learning_rate": 4.306765580733931e-05, "loss": 0.3834, "step": 4 }, { "epoch": 0.008151287903488751, "grad_norm": 0.16519580781459808, "learning_rate": 5e-05, "loss": 0.4055, "step": 5 }, { "epoch": 0.0097815454841865, "grad_norm": 0.14732758700847626, "learning_rate": 5.5664137627968925e-05, "loss": 0.3698, "step": 6 }, { "epoch": 0.011411803064884252, "grad_norm": 0.1339130848646164, "learning_rate": 6.0453097756108376e-05, "loss": 0.348, "step": 7 }, { "epoch": 0.013042060645582002, "grad_norm": 0.1276892125606537, "learning_rate": 6.460148371100896e-05, "loss": 0.3469, "step": 8 }, { "epoch": 0.014672318226279752, "grad_norm": 0.13217830657958984, "learning_rate": 6.826061944859854e-05, "loss": 0.351, "step": 9 }, { "epoch": 0.016302575806977502, "grad_norm": 0.13900792598724365, "learning_rate": 7.153382790366967e-05, "loss": 0.33, "step": 10 }, { "epoch": 0.01793283338767525, "grad_norm": 0.1303739994764328, "learning_rate": 7.449480512024892e-05, "loss": 0.3493, "step": 11 }, { "epoch": 0.019563090968373, "grad_norm": 0.14218388497829437, "learning_rate": 7.719796553163858e-05, "loss": 0.3731, "step": 12 }, { "epoch": 0.021193348549070755, "grad_norm": 0.11100596189498901, "learning_rate": 7.968463205835412e-05, "loss": 0.3385, "step": 13 }, { "epoch": 0.022823606129768505, "grad_norm": 0.11745980381965637, "learning_rate": 8.198692565977803e-05, "loss": 0.3337, "step": 14 }, { "epoch": 0.024453863710466255, "grad_norm": 0.11711365729570389, "learning_rate": 8.413030972429928e-05, "loss": 0.3117, "step": 15 }, { "epoch": 0.026084121291164004, "grad_norm": 0.15356451272964478, "learning_rate": 8.613531161467861e-05, "loss": 0.3327, "step": 16 }, { "epoch": 0.027714378871861754, "grad_norm": 0.14109951257705688, "learning_rate": 8.80187213861294e-05, "loss": 0.2942, "step": 17 }, { "epoch": 0.029344636452559504, "grad_norm": 0.25882235169410706, "learning_rate": 8.979444735226819e-05, "loss": 0.316, "step": 18 }, { "epoch": 0.030974894033257254, "grad_norm": 0.157696932554245, "learning_rate": 9.147414002175752e-05, "loss": 0.3284, "step": 19 }, { "epoch": 0.032605151613955004, "grad_norm": 0.13975587487220764, "learning_rate": 9.306765580733931e-05, "loss": 0.3245, "step": 20 }, { "epoch": 0.034235409194652754, "grad_norm": 0.8783023953437805, "learning_rate": 9.458340748040766e-05, "loss": 0.3181, "step": 21 }, { "epoch": 0.0358656667753505, "grad_norm": 0.11985931545495987, "learning_rate": 9.602863302391859e-05, "loss": 0.3098, "step": 22 }, { "epoch": 0.03749592435604825, "grad_norm": 0.14401742815971375, "learning_rate": 9.740960467331899e-05, "loss": 0.3132, "step": 23 }, { "epoch": 0.039126181936746, "grad_norm": 0.17440500855445862, "learning_rate": 9.873179343530825e-05, "loss": 0.3074, "step": 24 }, { "epoch": 0.04075643951744375, "grad_norm": 0.12916715443134308, "learning_rate": 0.0001, "loss": 0.2955, "step": 25 }, { "epoch": 0.04238669709814151, "grad_norm": 0.5290112495422363, "learning_rate": 0.0001, "loss": 0.3143, "step": 26 }, { "epoch": 0.04401695467883926, "grad_norm": 0.1266779601573944, "learning_rate": 0.0001, "loss": 0.2958, "step": 27 }, { "epoch": 0.04564721225953701, "grad_norm": 0.13447235524654388, "learning_rate": 0.0001, "loss": 0.3211, "step": 28 }, { "epoch": 0.04727746984023476, "grad_norm": 0.12066753953695297, "learning_rate": 0.0001, "loss": 0.2894, "step": 29 }, { "epoch": 0.04890772742093251, "grad_norm": 0.11640455573797226, "learning_rate": 0.0001, "loss": 0.2891, "step": 30 }, { "epoch": 0.05053798500163026, "grad_norm": 0.1315324753522873, "learning_rate": 0.0001, "loss": 0.292, "step": 31 }, { "epoch": 0.05216824258232801, "grad_norm": 0.09173787385225296, "learning_rate": 0.0001, "loss": 0.2786, "step": 32 }, { "epoch": 0.05379850016302576, "grad_norm": 0.14253424108028412, "learning_rate": 0.0001, "loss": 0.2924, "step": 33 }, { "epoch": 0.05542875774372351, "grad_norm": 0.1317639797925949, "learning_rate": 0.0001, "loss": 0.29, "step": 34 }, { "epoch": 0.05705901532442126, "grad_norm": 0.11155499517917633, "learning_rate": 0.0001, "loss": 0.2575, "step": 35 }, { "epoch": 0.05868927290511901, "grad_norm": 0.10907690972089767, "learning_rate": 0.0001, "loss": 0.2745, "step": 36 }, { "epoch": 0.06031953048581676, "grad_norm": 0.11807228624820709, "learning_rate": 0.0001, "loss": 0.2713, "step": 37 }, { "epoch": 0.06194978806651451, "grad_norm": 0.1130286455154419, "learning_rate": 0.0001, "loss": 0.2869, "step": 38 }, { "epoch": 0.06358004564721226, "grad_norm": 0.1267092525959015, "learning_rate": 0.0001, "loss": 0.287, "step": 39 }, { "epoch": 0.06521030322791001, "grad_norm": 0.13711895048618317, "learning_rate": 0.0001, "loss": 0.2662, "step": 40 }, { "epoch": 0.06684056080860776, "grad_norm": 0.1149134635925293, "learning_rate": 0.0001, "loss": 0.2743, "step": 41 }, { "epoch": 0.06847081838930551, "grad_norm": 0.1379522979259491, "learning_rate": 0.0001, "loss": 0.2915, "step": 42 }, { "epoch": 0.07010107597000326, "grad_norm": 0.126117542386055, "learning_rate": 0.0001, "loss": 0.2527, "step": 43 }, { "epoch": 0.071731333550701, "grad_norm": 0.13249348104000092, "learning_rate": 0.0001, "loss": 0.2951, "step": 44 }, { "epoch": 0.07336159113139876, "grad_norm": 0.13240788877010345, "learning_rate": 0.0001, "loss": 0.2828, "step": 45 }, { "epoch": 0.0749918487120965, "grad_norm": 0.1182844415307045, "learning_rate": 0.0001, "loss": 0.2564, "step": 46 }, { "epoch": 0.07662210629279426, "grad_norm": 0.13393035531044006, "learning_rate": 0.0001, "loss": 0.2742, "step": 47 }, { "epoch": 0.078252363873492, "grad_norm": 0.11272592842578888, "learning_rate": 0.0001, "loss": 0.279, "step": 48 }, { "epoch": 0.07988262145418976, "grad_norm": 0.12920530140399933, "learning_rate": 0.0001, "loss": 0.2733, "step": 49 }, { "epoch": 0.0815128790348875, "grad_norm": 0.13335202634334564, "learning_rate": 0.0001, "loss": 0.2647, "step": 50 }, { "epoch": 0.08314313661558526, "grad_norm": 0.13447648286819458, "learning_rate": 0.0001, "loss": 0.2674, "step": 51 }, { "epoch": 0.08477339419628302, "grad_norm": 0.13474304974079132, "learning_rate": 0.0001, "loss": 0.274, "step": 52 }, { "epoch": 0.08640365177698077, "grad_norm": 0.15840347111225128, "learning_rate": 0.0001, "loss": 0.2871, "step": 53 }, { "epoch": 0.08803390935767852, "grad_norm": 0.1323927342891693, "learning_rate": 0.0001, "loss": 0.2797, "step": 54 }, { "epoch": 0.08966416693837627, "grad_norm": 0.15678195655345917, "learning_rate": 0.0001, "loss": 0.2742, "step": 55 }, { "epoch": 0.09129442451907402, "grad_norm": 0.1214122325181961, "learning_rate": 0.0001, "loss": 0.258, "step": 56 }, { "epoch": 0.09292468209977177, "grad_norm": 0.15963681042194366, "learning_rate": 0.0001, "loss": 0.2767, "step": 57 }, { "epoch": 0.09455493968046952, "grad_norm": 0.14005902409553528, "learning_rate": 0.0001, "loss": 0.2786, "step": 58 }, { "epoch": 0.09618519726116727, "grad_norm": 0.14053602516651154, "learning_rate": 0.0001, "loss": 0.2673, "step": 59 }, { "epoch": 0.09781545484186502, "grad_norm": 0.1037927120923996, "learning_rate": 0.0001, "loss": 0.2532, "step": 60 }, { "epoch": 0.09944571242256277, "grad_norm": 0.16954319179058075, "learning_rate": 0.0001, "loss": 0.257, "step": 61 }, { "epoch": 0.10107597000326052, "grad_norm": 0.13848423957824707, "learning_rate": 0.0001, "loss": 0.2633, "step": 62 }, { "epoch": 0.10270622758395827, "grad_norm": 0.1805400550365448, "learning_rate": 0.0001, "loss": 0.263, "step": 63 }, { "epoch": 0.10433648516465602, "grad_norm": 0.17179027199745178, "learning_rate": 0.0001, "loss": 0.2681, "step": 64 }, { "epoch": 0.10596674274535377, "grad_norm": 0.1394517719745636, "learning_rate": 0.0001, "loss": 0.27, "step": 65 }, { "epoch": 0.10759700032605152, "grad_norm": 0.1399444192647934, "learning_rate": 0.0001, "loss": 0.262, "step": 66 }, { "epoch": 0.10922725790674927, "grad_norm": 0.1266821324825287, "learning_rate": 0.0001, "loss": 0.2599, "step": 67 }, { "epoch": 0.11085751548744702, "grad_norm": 0.13088282942771912, "learning_rate": 0.0001, "loss": 0.2636, "step": 68 }, { "epoch": 0.11248777306814477, "grad_norm": 0.1161791980266571, "learning_rate": 0.0001, "loss": 0.2739, "step": 69 }, { "epoch": 0.11411803064884252, "grad_norm": 0.1133728101849556, "learning_rate": 0.0001, "loss": 0.2369, "step": 70 }, { "epoch": 0.11574828822954027, "grad_norm": 0.140262633562088, "learning_rate": 0.0001, "loss": 0.2752, "step": 71 }, { "epoch": 0.11737854581023802, "grad_norm": 0.12292502820491791, "learning_rate": 0.0001, "loss": 0.2541, "step": 72 }, { "epoch": 0.11900880339093577, "grad_norm": 0.14118324220180511, "learning_rate": 0.0001, "loss": 0.2611, "step": 73 }, { "epoch": 0.12063906097163352, "grad_norm": 0.12177952378988266, "learning_rate": 0.0001, "loss": 0.2605, "step": 74 }, { "epoch": 0.12226931855233127, "grad_norm": 0.13002616167068481, "learning_rate": 0.0001, "loss": 0.2609, "step": 75 }, { "epoch": 0.12389957613302902, "grad_norm": 0.11555729806423187, "learning_rate": 0.0001, "loss": 0.2404, "step": 76 }, { "epoch": 0.12552983371372678, "grad_norm": 0.1303972601890564, "learning_rate": 0.0001, "loss": 0.251, "step": 77 }, { "epoch": 0.12716009129442452, "grad_norm": 0.1312289834022522, "learning_rate": 0.0001, "loss": 0.2635, "step": 78 }, { "epoch": 0.12879034887512228, "grad_norm": 0.12450554966926575, "learning_rate": 0.0001, "loss": 0.2545, "step": 79 }, { "epoch": 0.13042060645582002, "grad_norm": 0.1366516351699829, "learning_rate": 0.0001, "loss": 0.2566, "step": 80 }, { "epoch": 0.13205086403651778, "grad_norm": 0.13325075805187225, "learning_rate": 0.0001, "loss": 0.2557, "step": 81 }, { "epoch": 0.13368112161721551, "grad_norm": 0.15881724655628204, "learning_rate": 0.0001, "loss": 0.2467, "step": 82 }, { "epoch": 0.13531137919791328, "grad_norm": 0.12714703381061554, "learning_rate": 0.0001, "loss": 0.259, "step": 83 }, { "epoch": 0.13694163677861101, "grad_norm": 0.13727723062038422, "learning_rate": 0.0001, "loss": 0.2737, "step": 84 }, { "epoch": 0.13857189435930878, "grad_norm": 0.12891346216201782, "learning_rate": 0.0001, "loss": 0.2747, "step": 85 }, { "epoch": 0.14020215194000651, "grad_norm": 0.10840713977813721, "learning_rate": 0.0001, "loss": 0.2481, "step": 86 }, { "epoch": 0.14183240952070428, "grad_norm": 0.12393542379140854, "learning_rate": 0.0001, "loss": 0.2547, "step": 87 }, { "epoch": 0.143462667101402, "grad_norm": 0.16842670738697052, "learning_rate": 0.0001, "loss": 0.28, "step": 88 }, { "epoch": 0.14509292468209978, "grad_norm": 0.13346168398857117, "learning_rate": 0.0001, "loss": 0.2359, "step": 89 }, { "epoch": 0.1467231822627975, "grad_norm": 0.14012813568115234, "learning_rate": 0.0001, "loss": 0.242, "step": 90 }, { "epoch": 0.14835343984349528, "grad_norm": 0.1721285581588745, "learning_rate": 0.0001, "loss": 0.2644, "step": 91 }, { "epoch": 0.149983697424193, "grad_norm": 0.14937177300453186, "learning_rate": 0.0001, "loss": 0.2541, "step": 92 }, { "epoch": 0.15161395500489078, "grad_norm": 0.13868822157382965, "learning_rate": 0.0001, "loss": 0.2609, "step": 93 }, { "epoch": 0.1532442125855885, "grad_norm": 0.14626921713352203, "learning_rate": 0.0001, "loss": 0.2442, "step": 94 }, { "epoch": 0.15487447016628628, "grad_norm": 0.13605758547782898, "learning_rate": 0.0001, "loss": 0.2524, "step": 95 }, { "epoch": 0.156504727746984, "grad_norm": 0.1354619562625885, "learning_rate": 0.0001, "loss": 0.2504, "step": 96 }, { "epoch": 0.15813498532768178, "grad_norm": 0.1597517877817154, "learning_rate": 0.0001, "loss": 0.2477, "step": 97 }, { "epoch": 0.1597652429083795, "grad_norm": 0.13010838627815247, "learning_rate": 0.0001, "loss": 0.271, "step": 98 }, { "epoch": 0.16139550048907728, "grad_norm": 0.13291318714618683, "learning_rate": 0.0001, "loss": 0.2447, "step": 99 }, { "epoch": 0.163025758069775, "grad_norm": 0.16925360262393951, "learning_rate": 0.0001, "loss": 0.2617, "step": 100 }, { "epoch": 0.16465601565047278, "grad_norm": 0.13861025869846344, "learning_rate": 0.0001, "loss": 0.2503, "step": 101 }, { "epoch": 0.1662862732311705, "grad_norm": 0.16513262689113617, "learning_rate": 0.0001, "loss": 0.2513, "step": 102 }, { "epoch": 0.16791653081186828, "grad_norm": 0.14727436006069183, "learning_rate": 0.0001, "loss": 0.2769, "step": 103 }, { "epoch": 0.16954678839256604, "grad_norm": 0.14519990980625153, "learning_rate": 0.0001, "loss": 0.2506, "step": 104 }, { "epoch": 0.17117704597326378, "grad_norm": 0.14038777351379395, "learning_rate": 0.0001, "loss": 0.2619, "step": 105 }, { "epoch": 0.17280730355396154, "grad_norm": 0.13004878163337708, "learning_rate": 0.0001, "loss": 0.2466, "step": 106 }, { "epoch": 0.17443756113465927, "grad_norm": 0.11581247299909592, "learning_rate": 0.0001, "loss": 0.2365, "step": 107 }, { "epoch": 0.17606781871535704, "grad_norm": 0.14593157172203064, "learning_rate": 0.0001, "loss": 0.2446, "step": 108 }, { "epoch": 0.17769807629605477, "grad_norm": 0.15348902344703674, "learning_rate": 0.0001, "loss": 0.2613, "step": 109 }, { "epoch": 0.17932833387675254, "grad_norm": 0.11216390132904053, "learning_rate": 0.0001, "loss": 0.2328, "step": 110 }, { "epoch": 0.18095859145745027, "grad_norm": 0.17392517626285553, "learning_rate": 0.0001, "loss": 0.2449, "step": 111 }, { "epoch": 0.18258884903814804, "grad_norm": 0.13363705575466156, "learning_rate": 0.0001, "loss": 0.2577, "step": 112 }, { "epoch": 0.18421910661884577, "grad_norm": 0.12435399740934372, "learning_rate": 0.0001, "loss": 0.2478, "step": 113 }, { "epoch": 0.18584936419954354, "grad_norm": 0.15440337359905243, "learning_rate": 0.0001, "loss": 0.2655, "step": 114 }, { "epoch": 0.18747962178024127, "grad_norm": 0.12815740704536438, "learning_rate": 0.0001, "loss": 0.2642, "step": 115 }, { "epoch": 0.18910987936093904, "grad_norm": 0.11212265491485596, "learning_rate": 0.0001, "loss": 0.2496, "step": 116 }, { "epoch": 0.19074013694163677, "grad_norm": 0.13762663304805756, "learning_rate": 0.0001, "loss": 0.2382, "step": 117 }, { "epoch": 0.19237039452233454, "grad_norm": 0.11899245530366898, "learning_rate": 0.0001, "loss": 0.2473, "step": 118 }, { "epoch": 0.19400065210303227, "grad_norm": 0.12855815887451172, "learning_rate": 0.0001, "loss": 0.2478, "step": 119 }, { "epoch": 0.19563090968373004, "grad_norm": 0.116991326212883, "learning_rate": 0.0001, "loss": 0.268, "step": 120 }, { "epoch": 0.19726116726442777, "grad_norm": 0.11379344016313553, "learning_rate": 0.0001, "loss": 0.2391, "step": 121 }, { "epoch": 0.19889142484512554, "grad_norm": 0.12145145982503891, "learning_rate": 0.0001, "loss": 0.2433, "step": 122 }, { "epoch": 0.20052168242582327, "grad_norm": 0.12102972716093063, "learning_rate": 0.0001, "loss": 0.2384, "step": 123 }, { "epoch": 0.20215194000652104, "grad_norm": 0.13323700428009033, "learning_rate": 0.0001, "loss": 0.2393, "step": 124 }, { "epoch": 0.20378219758721877, "grad_norm": 0.13560782372951508, "learning_rate": 0.0001, "loss": 0.2434, "step": 125 }, { "epoch": 0.20541245516791654, "grad_norm": 0.14215120673179626, "learning_rate": 0.0001, "loss": 0.2478, "step": 126 }, { "epoch": 0.20704271274861427, "grad_norm": 0.11859458684921265, "learning_rate": 0.0001, "loss": 0.2274, "step": 127 }, { "epoch": 0.20867297032931204, "grad_norm": 0.12831608951091766, "learning_rate": 0.0001, "loss": 0.2625, "step": 128 }, { "epoch": 0.21030322791000977, "grad_norm": 0.15069334208965302, "learning_rate": 0.0001, "loss": 0.2542, "step": 129 }, { "epoch": 0.21193348549070753, "grad_norm": 0.13063862919807434, "learning_rate": 0.0001, "loss": 0.2428, "step": 130 }, { "epoch": 0.21356374307140527, "grad_norm": 0.12685725092887878, "learning_rate": 0.0001, "loss": 0.2312, "step": 131 }, { "epoch": 0.21519400065210303, "grad_norm": 0.13156022131443024, "learning_rate": 0.0001, "loss": 0.2548, "step": 132 }, { "epoch": 0.21682425823280077, "grad_norm": 0.12375540286302567, "learning_rate": 0.0001, "loss": 0.241, "step": 133 }, { "epoch": 0.21845451581349853, "grad_norm": 0.12112291157245636, "learning_rate": 0.0001, "loss": 0.2483, "step": 134 }, { "epoch": 0.22008477339419627, "grad_norm": 0.13035452365875244, "learning_rate": 0.0001, "loss": 0.2418, "step": 135 }, { "epoch": 0.22171503097489403, "grad_norm": 0.14445112645626068, "learning_rate": 0.0001, "loss": 0.2613, "step": 136 }, { "epoch": 0.22334528855559177, "grad_norm": 0.11515045166015625, "learning_rate": 0.0001, "loss": 0.2423, "step": 137 }, { "epoch": 0.22497554613628953, "grad_norm": 0.1339021772146225, "learning_rate": 0.0001, "loss": 0.2462, "step": 138 }, { "epoch": 0.2266058037169873, "grad_norm": 0.14942044019699097, "learning_rate": 0.0001, "loss": 0.2289, "step": 139 }, { "epoch": 0.22823606129768503, "grad_norm": 0.15505343675613403, "learning_rate": 0.0001, "loss": 0.2507, "step": 140 }, { "epoch": 0.2298663188783828, "grad_norm": 0.15206627547740936, "learning_rate": 0.0001, "loss": 0.2466, "step": 141 }, { "epoch": 0.23149657645908053, "grad_norm": 0.21628950536251068, "learning_rate": 0.0001, "loss": 0.2506, "step": 142 }, { "epoch": 0.2331268340397783, "grad_norm": 0.146853506565094, "learning_rate": 0.0001, "loss": 0.2378, "step": 143 }, { "epoch": 0.23475709162047603, "grad_norm": 0.13807877898216248, "learning_rate": 0.0001, "loss": 0.2304, "step": 144 }, { "epoch": 0.2363873492011738, "grad_norm": 0.1247410997748375, "learning_rate": 0.0001, "loss": 0.2154, "step": 145 }, { "epoch": 0.23801760678187153, "grad_norm": 0.11460768431425095, "learning_rate": 0.0001, "loss": 0.2539, "step": 146 }, { "epoch": 0.2396478643625693, "grad_norm": 0.13893291354179382, "learning_rate": 0.0001, "loss": 0.2265, "step": 147 }, { "epoch": 0.24127812194326703, "grad_norm": 0.13536906242370605, "learning_rate": 0.0001, "loss": 0.2267, "step": 148 }, { "epoch": 0.2429083795239648, "grad_norm": 0.14529214799404144, "learning_rate": 0.0001, "loss": 0.2559, "step": 149 }, { "epoch": 0.24453863710466253, "grad_norm": 0.13840830326080322, "learning_rate": 0.0001, "loss": 0.2353, "step": 150 }, { "epoch": 0.2461688946853603, "grad_norm": 0.15911462903022766, "learning_rate": 0.0001, "loss": 0.2527, "step": 151 }, { "epoch": 0.24779915226605803, "grad_norm": 0.11530350893735886, "learning_rate": 0.0001, "loss": 0.2568, "step": 152 }, { "epoch": 0.2494294098467558, "grad_norm": 0.14135918021202087, "learning_rate": 0.0001, "loss": 0.2266, "step": 153 }, { "epoch": 0.25105966742745356, "grad_norm": 0.18591678142547607, "learning_rate": 0.0001, "loss": 0.2593, "step": 154 }, { "epoch": 0.2526899250081513, "grad_norm": 0.13683144748210907, "learning_rate": 0.0001, "loss": 0.2469, "step": 155 }, { "epoch": 0.25432018258884903, "grad_norm": 0.17030082643032074, "learning_rate": 0.0001, "loss": 0.2413, "step": 156 }, { "epoch": 0.25595044016954677, "grad_norm": 0.1563073843717575, "learning_rate": 0.0001, "loss": 0.2435, "step": 157 }, { "epoch": 0.25758069775024456, "grad_norm": 0.129858136177063, "learning_rate": 0.0001, "loss": 0.2515, "step": 158 }, { "epoch": 0.2592109553309423, "grad_norm": 0.13964857161045074, "learning_rate": 0.0001, "loss": 0.2271, "step": 159 }, { "epoch": 0.26084121291164003, "grad_norm": 0.12167726457118988, "learning_rate": 0.0001, "loss": 0.2092, "step": 160 }, { "epoch": 0.26247147049233777, "grad_norm": 0.12050545960664749, "learning_rate": 0.0001, "loss": 0.2321, "step": 161 }, { "epoch": 0.26410172807303556, "grad_norm": 0.14696434140205383, "learning_rate": 0.0001, "loss": 0.2545, "step": 162 }, { "epoch": 0.2657319856537333, "grad_norm": 0.10441072285175323, "learning_rate": 0.0001, "loss": 0.2347, "step": 163 }, { "epoch": 0.26736224323443103, "grad_norm": 0.11823071539402008, "learning_rate": 0.0001, "loss": 0.2242, "step": 164 }, { "epoch": 0.26899250081512877, "grad_norm": 0.1426367461681366, "learning_rate": 0.0001, "loss": 0.2614, "step": 165 }, { "epoch": 0.27062275839582656, "grad_norm": 0.11903452128171921, "learning_rate": 0.0001, "loss": 0.2469, "step": 166 }, { "epoch": 0.2722530159765243, "grad_norm": 0.12051168829202652, "learning_rate": 0.0001, "loss": 0.246, "step": 167 }, { "epoch": 0.27388327355722203, "grad_norm": 0.1146332174539566, "learning_rate": 0.0001, "loss": 0.2485, "step": 168 }, { "epoch": 0.27551353113791976, "grad_norm": 0.1366330087184906, "learning_rate": 0.0001, "loss": 0.2364, "step": 169 }, { "epoch": 0.27714378871861756, "grad_norm": 0.19316819310188293, "learning_rate": 0.0001, "loss": 0.2373, "step": 170 }, { "epoch": 0.2787740462993153, "grad_norm": 0.13915376365184784, "learning_rate": 0.0001, "loss": 0.2354, "step": 171 }, { "epoch": 0.28040430388001303, "grad_norm": 0.16246309876441956, "learning_rate": 0.0001, "loss": 0.2452, "step": 172 }, { "epoch": 0.2820345614607108, "grad_norm": 0.11810585111379623, "learning_rate": 0.0001, "loss": 0.2312, "step": 173 }, { "epoch": 0.28366481904140856, "grad_norm": 0.15839870274066925, "learning_rate": 0.0001, "loss": 0.2464, "step": 174 }, { "epoch": 0.2852950766221063, "grad_norm": 0.1045420840382576, "learning_rate": 0.0001, "loss": 0.2257, "step": 175 }, { "epoch": 0.286925334202804, "grad_norm": 0.14196939766407013, "learning_rate": 0.0001, "loss": 0.2523, "step": 176 }, { "epoch": 0.2885555917835018, "grad_norm": 0.14301368594169617, "learning_rate": 0.0001, "loss": 0.2387, "step": 177 }, { "epoch": 0.29018584936419956, "grad_norm": 0.13265396654605865, "learning_rate": 0.0001, "loss": 0.2428, "step": 178 }, { "epoch": 0.2918161069448973, "grad_norm": 0.126626119017601, "learning_rate": 0.0001, "loss": 0.2494, "step": 179 }, { "epoch": 0.293446364525595, "grad_norm": 0.13440294563770294, "learning_rate": 0.0001, "loss": 0.2457, "step": 180 }, { "epoch": 0.2950766221062928, "grad_norm": 0.1397274285554886, "learning_rate": 0.0001, "loss": 0.2274, "step": 181 }, { "epoch": 0.29670687968699055, "grad_norm": 0.09933532774448395, "learning_rate": 0.0001, "loss": 0.225, "step": 182 }, { "epoch": 0.2983371372676883, "grad_norm": 0.16503044962882996, "learning_rate": 0.0001, "loss": 0.2293, "step": 183 }, { "epoch": 0.299967394848386, "grad_norm": 0.12690463662147522, "learning_rate": 0.0001, "loss": 0.2447, "step": 184 }, { "epoch": 0.3015976524290838, "grad_norm": 0.15822118520736694, "learning_rate": 0.0001, "loss": 0.2492, "step": 185 }, { "epoch": 0.30322791000978155, "grad_norm": 0.10334572941064835, "learning_rate": 0.0001, "loss": 0.2334, "step": 186 }, { "epoch": 0.3048581675904793, "grad_norm": 0.12315747141838074, "learning_rate": 0.0001, "loss": 0.2335, "step": 187 }, { "epoch": 0.306488425171177, "grad_norm": 0.13603512942790985, "learning_rate": 0.0001, "loss": 0.228, "step": 188 }, { "epoch": 0.3081186827518748, "grad_norm": 0.1011793464422226, "learning_rate": 0.0001, "loss": 0.2224, "step": 189 }, { "epoch": 0.30974894033257255, "grad_norm": 0.10281293839216232, "learning_rate": 0.0001, "loss": 0.2277, "step": 190 }, { "epoch": 0.3113791979132703, "grad_norm": 0.1417594999074936, "learning_rate": 0.0001, "loss": 0.2516, "step": 191 }, { "epoch": 0.313009455493968, "grad_norm": 0.11304276436567307, "learning_rate": 0.0001, "loss": 0.2323, "step": 192 }, { "epoch": 0.3146397130746658, "grad_norm": 0.15146027505397797, "learning_rate": 0.0001, "loss": 0.2446, "step": 193 }, { "epoch": 0.31626997065536355, "grad_norm": 0.11567169427871704, "learning_rate": 0.0001, "loss": 0.2018, "step": 194 }, { "epoch": 0.3179002282360613, "grad_norm": 0.12554962933063507, "learning_rate": 0.0001, "loss": 0.2317, "step": 195 }, { "epoch": 0.319530485816759, "grad_norm": 0.15784506499767303, "learning_rate": 0.0001, "loss": 0.2372, "step": 196 }, { "epoch": 0.3211607433974568, "grad_norm": 0.13723263144493103, "learning_rate": 0.0001, "loss": 0.2395, "step": 197 }, { "epoch": 0.32279100097815455, "grad_norm": 0.16114649176597595, "learning_rate": 0.0001, "loss": 0.244, "step": 198 }, { "epoch": 0.3244212585588523, "grad_norm": 0.11904025822877884, "learning_rate": 0.0001, "loss": 0.2366, "step": 199 }, { "epoch": 0.32605151613955, "grad_norm": 0.1232055053114891, "learning_rate": 0.0001, "loss": 0.2266, "step": 200 }, { "epoch": 0.3276817737202478, "grad_norm": 0.1447722166776657, "learning_rate": 0.0001, "loss": 0.2451, "step": 201 }, { "epoch": 0.32931203130094555, "grad_norm": 0.10923551023006439, "learning_rate": 0.0001, "loss": 0.2242, "step": 202 }, { "epoch": 0.3309422888816433, "grad_norm": 0.13391102850437164, "learning_rate": 0.0001, "loss": 0.2342, "step": 203 }, { "epoch": 0.332572546462341, "grad_norm": 0.14067484438419342, "learning_rate": 0.0001, "loss": 0.2334, "step": 204 }, { "epoch": 0.3342028040430388, "grad_norm": 0.13190071284770966, "learning_rate": 0.0001, "loss": 0.2398, "step": 205 }, { "epoch": 0.33583306162373655, "grad_norm": 0.12358011305332184, "learning_rate": 0.0001, "loss": 0.2218, "step": 206 }, { "epoch": 0.3374633192044343, "grad_norm": 0.11492815613746643, "learning_rate": 0.0001, "loss": 0.2284, "step": 207 }, { "epoch": 0.3390935767851321, "grad_norm": 0.11512556672096252, "learning_rate": 0.0001, "loss": 0.2265, "step": 208 }, { "epoch": 0.3407238343658298, "grad_norm": 0.11229848861694336, "learning_rate": 0.0001, "loss": 0.2253, "step": 209 }, { "epoch": 0.34235409194652755, "grad_norm": 0.11124417930841446, "learning_rate": 0.0001, "loss": 0.2449, "step": 210 }, { "epoch": 0.3439843495272253, "grad_norm": 0.11999543011188507, "learning_rate": 0.0001, "loss": 0.2255, "step": 211 }, { "epoch": 0.3456146071079231, "grad_norm": 0.10648955404758453, "learning_rate": 0.0001, "loss": 0.2104, "step": 212 }, { "epoch": 0.3472448646886208, "grad_norm": 0.12528495490550995, "learning_rate": 0.0001, "loss": 0.2459, "step": 213 }, { "epoch": 0.34887512226931855, "grad_norm": 0.12371502816677094, "learning_rate": 0.0001, "loss": 0.2483, "step": 214 }, { "epoch": 0.3505053798500163, "grad_norm": 0.15203551948070526, "learning_rate": 0.0001, "loss": 0.23, "step": 215 }, { "epoch": 0.3521356374307141, "grad_norm": 0.12010787427425385, "learning_rate": 0.0001, "loss": 0.2221, "step": 216 }, { "epoch": 0.3537658950114118, "grad_norm": 0.1346224844455719, "learning_rate": 0.0001, "loss": 0.2268, "step": 217 }, { "epoch": 0.35539615259210955, "grad_norm": 0.15142722427845, "learning_rate": 0.0001, "loss": 0.2338, "step": 218 }, { "epoch": 0.3570264101728073, "grad_norm": 0.129653200507164, "learning_rate": 0.0001, "loss": 0.2325, "step": 219 }, { "epoch": 0.3586566677535051, "grad_norm": 0.13703054189682007, "learning_rate": 0.0001, "loss": 0.2278, "step": 220 }, { "epoch": 0.3602869253342028, "grad_norm": 0.1323288083076477, "learning_rate": 0.0001, "loss": 0.2331, "step": 221 }, { "epoch": 0.36191718291490055, "grad_norm": 0.14150388538837433, "learning_rate": 0.0001, "loss": 0.2408, "step": 222 }, { "epoch": 0.3635474404955983, "grad_norm": 0.16232647001743317, "learning_rate": 0.0001, "loss": 0.2228, "step": 223 }, { "epoch": 0.3651776980762961, "grad_norm": 0.13750429451465607, "learning_rate": 0.0001, "loss": 0.2578, "step": 224 }, { "epoch": 0.3668079556569938, "grad_norm": 0.15641643106937408, "learning_rate": 0.0001, "loss": 0.2202, "step": 225 }, { "epoch": 0.36843821323769155, "grad_norm": 0.14466796815395355, "learning_rate": 0.0001, "loss": 0.2148, "step": 226 }, { "epoch": 0.3700684708183893, "grad_norm": 0.11243683099746704, "learning_rate": 0.0001, "loss": 0.2353, "step": 227 }, { "epoch": 0.3716987283990871, "grad_norm": 0.12383094429969788, "learning_rate": 0.0001, "loss": 0.2254, "step": 228 }, { "epoch": 0.3733289859797848, "grad_norm": 0.11095986515283585, "learning_rate": 0.0001, "loss": 0.225, "step": 229 }, { "epoch": 0.37495924356048255, "grad_norm": 0.12428440898656845, "learning_rate": 0.0001, "loss": 0.233, "step": 230 }, { "epoch": 0.3765895011411803, "grad_norm": 0.14329205453395844, "learning_rate": 0.0001, "loss": 0.2307, "step": 231 }, { "epoch": 0.3782197587218781, "grad_norm": 0.14026683568954468, "learning_rate": 0.0001, "loss": 0.2139, "step": 232 }, { "epoch": 0.3798500163025758, "grad_norm": 0.11123739928007126, "learning_rate": 0.0001, "loss": 0.2191, "step": 233 }, { "epoch": 0.38148027388327355, "grad_norm": 0.1463911086320877, "learning_rate": 0.0001, "loss": 0.2388, "step": 234 }, { "epoch": 0.3831105314639713, "grad_norm": 0.13069649040699005, "learning_rate": 0.0001, "loss": 0.2185, "step": 235 }, { "epoch": 0.3847407890446691, "grad_norm": 0.14931800961494446, "learning_rate": 0.0001, "loss": 0.2416, "step": 236 }, { "epoch": 0.3863710466253668, "grad_norm": 0.16097313165664673, "learning_rate": 0.0001, "loss": 0.2347, "step": 237 }, { "epoch": 0.38800130420606455, "grad_norm": 0.15226350724697113, "learning_rate": 0.0001, "loss": 0.2258, "step": 238 }, { "epoch": 0.3896315617867623, "grad_norm": 0.13384918868541718, "learning_rate": 0.0001, "loss": 0.2265, "step": 239 }, { "epoch": 0.3912618193674601, "grad_norm": 0.11753430217504501, "learning_rate": 0.0001, "loss": 0.2106, "step": 240 }, { "epoch": 0.3928920769481578, "grad_norm": 0.1108139306306839, "learning_rate": 0.0001, "loss": 0.2293, "step": 241 }, { "epoch": 0.39452233452885554, "grad_norm": 0.10754958540201187, "learning_rate": 0.0001, "loss": 0.2251, "step": 242 }, { "epoch": 0.39615259210955334, "grad_norm": 0.11824717372655869, "learning_rate": 0.0001, "loss": 0.2507, "step": 243 }, { "epoch": 0.3977828496902511, "grad_norm": 0.14322006702423096, "learning_rate": 0.0001, "loss": 0.2348, "step": 244 }, { "epoch": 0.3994131072709488, "grad_norm": 0.10835988074541092, "learning_rate": 0.0001, "loss": 0.2141, "step": 245 }, { "epoch": 0.40104336485164654, "grad_norm": 0.11391481757164001, "learning_rate": 0.0001, "loss": 0.2139, "step": 246 }, { "epoch": 0.40267362243234434, "grad_norm": 0.11263515055179596, "learning_rate": 0.0001, "loss": 0.2354, "step": 247 }, { "epoch": 0.40430388001304207, "grad_norm": 0.12588317692279816, "learning_rate": 0.0001, "loss": 0.2354, "step": 248 }, { "epoch": 0.4059341375937398, "grad_norm": 0.12664109468460083, "learning_rate": 0.0001, "loss": 0.2371, "step": 249 }, { "epoch": 0.40756439517443754, "grad_norm": 0.12330986559391022, "learning_rate": 0.0001, "loss": 0.2338, "step": 250 }, { "epoch": 0.40919465275513534, "grad_norm": 0.11349525302648544, "learning_rate": 0.0001, "loss": 0.2315, "step": 251 }, { "epoch": 0.41082491033583307, "grad_norm": 0.11793797463178635, "learning_rate": 0.0001, "loss": 0.2309, "step": 252 }, { "epoch": 0.4124551679165308, "grad_norm": 0.14245276153087616, "learning_rate": 0.0001, "loss": 0.2469, "step": 253 }, { "epoch": 0.41408542549722854, "grad_norm": 0.11353014409542084, "learning_rate": 0.0001, "loss": 0.2238, "step": 254 }, { "epoch": 0.41571568307792633, "grad_norm": 0.13224811851978302, "learning_rate": 0.0001, "loss": 0.2419, "step": 255 }, { "epoch": 0.41734594065862407, "grad_norm": 0.10972931236028671, "learning_rate": 0.0001, "loss": 0.2139, "step": 256 }, { "epoch": 0.4189761982393218, "grad_norm": 0.17038947343826294, "learning_rate": 0.0001, "loss": 0.2233, "step": 257 }, { "epoch": 0.42060645582001954, "grad_norm": 0.1390901654958725, "learning_rate": 0.0001, "loss": 0.2347, "step": 258 }, { "epoch": 0.42223671340071733, "grad_norm": 0.10498136281967163, "learning_rate": 0.0001, "loss": 0.2185, "step": 259 }, { "epoch": 0.42386697098141507, "grad_norm": 0.11392710357904434, "learning_rate": 0.0001, "loss": 0.2317, "step": 260 }, { "epoch": 0.4254972285621128, "grad_norm": 0.12423884123563766, "learning_rate": 0.0001, "loss": 0.238, "step": 261 }, { "epoch": 0.42712748614281054, "grad_norm": 0.12265395373106003, "learning_rate": 0.0001, "loss": 0.2151, "step": 262 }, { "epoch": 0.42875774372350833, "grad_norm": 0.1066151112318039, "learning_rate": 0.0001, "loss": 0.2188, "step": 263 }, { "epoch": 0.43038800130420607, "grad_norm": 0.10593073070049286, "learning_rate": 0.0001, "loss": 0.2179, "step": 264 }, { "epoch": 0.4320182588849038, "grad_norm": 0.12529350817203522, "learning_rate": 0.0001, "loss": 0.2366, "step": 265 }, { "epoch": 0.43364851646560154, "grad_norm": 0.13577479124069214, "learning_rate": 0.0001, "loss": 0.2304, "step": 266 }, { "epoch": 0.43527877404629933, "grad_norm": 0.15030768513679504, "learning_rate": 0.0001, "loss": 0.2421, "step": 267 }, { "epoch": 0.43690903162699707, "grad_norm": 0.12566924095153809, "learning_rate": 0.0001, "loss": 0.2366, "step": 268 }, { "epoch": 0.4385392892076948, "grad_norm": 0.1277414709329605, "learning_rate": 0.0001, "loss": 0.2413, "step": 269 }, { "epoch": 0.44016954678839254, "grad_norm": 0.12140754610300064, "learning_rate": 0.0001, "loss": 0.2072, "step": 270 }, { "epoch": 0.44179980436909033, "grad_norm": 0.16660185158252716, "learning_rate": 0.0001, "loss": 0.2513, "step": 271 }, { "epoch": 0.44343006194978807, "grad_norm": 0.11619671434164047, "learning_rate": 0.0001, "loss": 0.2158, "step": 272 }, { "epoch": 0.4450603195304858, "grad_norm": 0.16923661530017853, "learning_rate": 0.0001, "loss": 0.2479, "step": 273 }, { "epoch": 0.44669057711118354, "grad_norm": 0.14053957164287567, "learning_rate": 0.0001, "loss": 0.2383, "step": 274 }, { "epoch": 0.44832083469188133, "grad_norm": 0.1332424134016037, "learning_rate": 0.0001, "loss": 0.2202, "step": 275 }, { "epoch": 0.44995109227257907, "grad_norm": 0.10877622663974762, "learning_rate": 0.0001, "loss": 0.234, "step": 276 }, { "epoch": 0.4515813498532768, "grad_norm": 0.11536753922700882, "learning_rate": 0.0001, "loss": 0.2199, "step": 277 }, { "epoch": 0.4532116074339746, "grad_norm": 0.12691466510295868, "learning_rate": 0.0001, "loss": 0.2391, "step": 278 }, { "epoch": 0.45484186501467233, "grad_norm": 0.12029401957988739, "learning_rate": 0.0001, "loss": 0.2258, "step": 279 }, { "epoch": 0.45647212259537007, "grad_norm": 0.14181189239025116, "learning_rate": 0.0001, "loss": 0.2202, "step": 280 }, { "epoch": 0.4581023801760678, "grad_norm": 0.1265992522239685, "learning_rate": 0.0001, "loss": 0.2297, "step": 281 }, { "epoch": 0.4597326377567656, "grad_norm": 0.1276220828294754, "learning_rate": 0.0001, "loss": 0.2282, "step": 282 }, { "epoch": 0.46136289533746333, "grad_norm": 0.15334482491016388, "learning_rate": 0.0001, "loss": 0.2213, "step": 283 }, { "epoch": 0.46299315291816107, "grad_norm": 0.14251714944839478, "learning_rate": 0.0001, "loss": 0.2244, "step": 284 }, { "epoch": 0.4646234104988588, "grad_norm": 0.1373746544122696, "learning_rate": 0.0001, "loss": 0.2258, "step": 285 }, { "epoch": 0.4662536680795566, "grad_norm": 0.14110830426216125, "learning_rate": 0.0001, "loss": 0.2192, "step": 286 }, { "epoch": 0.46788392566025433, "grad_norm": 0.12478266656398773, "learning_rate": 0.0001, "loss": 0.2256, "step": 287 }, { "epoch": 0.46951418324095207, "grad_norm": 0.13609923422336578, "learning_rate": 0.0001, "loss": 0.2273, "step": 288 }, { "epoch": 0.4711444408216498, "grad_norm": 0.15859074890613556, "learning_rate": 0.0001, "loss": 0.2314, "step": 289 }, { "epoch": 0.4727746984023476, "grad_norm": 0.1267993301153183, "learning_rate": 0.0001, "loss": 0.2431, "step": 290 }, { "epoch": 0.47440495598304533, "grad_norm": 0.1309819370508194, "learning_rate": 0.0001, "loss": 0.2036, "step": 291 }, { "epoch": 0.47603521356374306, "grad_norm": 0.14452795684337616, "learning_rate": 0.0001, "loss": 0.2113, "step": 292 }, { "epoch": 0.4776654711444408, "grad_norm": 0.10817385464906693, "learning_rate": 0.0001, "loss": 0.2198, "step": 293 }, { "epoch": 0.4792957287251386, "grad_norm": 0.10400962829589844, "learning_rate": 0.0001, "loss": 0.2107, "step": 294 }, { "epoch": 0.48092598630583633, "grad_norm": 0.14228041470050812, "learning_rate": 0.0001, "loss": 0.2355, "step": 295 }, { "epoch": 0.48255624388653406, "grad_norm": 0.144633486866951, "learning_rate": 0.0001, "loss": 0.237, "step": 296 }, { "epoch": 0.4841865014672318, "grad_norm": 0.14238397777080536, "learning_rate": 0.0001, "loss": 0.2299, "step": 297 }, { "epoch": 0.4858167590479296, "grad_norm": 0.12710346281528473, "learning_rate": 0.0001, "loss": 0.2267, "step": 298 }, { "epoch": 0.4874470166286273, "grad_norm": 0.12529858946800232, "learning_rate": 0.0001, "loss": 0.2301, "step": 299 }, { "epoch": 0.48907727420932506, "grad_norm": 0.12333487719297409, "learning_rate": 0.0001, "loss": 0.2457, "step": 300 }, { "epoch": 0.4907075317900228, "grad_norm": 0.10910045355558395, "learning_rate": 0.0001, "loss": 0.2179, "step": 301 }, { "epoch": 0.4923377893707206, "grad_norm": 0.13991764187812805, "learning_rate": 0.0001, "loss": 0.2361, "step": 302 }, { "epoch": 0.4939680469514183, "grad_norm": 0.1286964863538742, "learning_rate": 0.0001, "loss": 0.2172, "step": 303 }, { "epoch": 0.49559830453211606, "grad_norm": 0.13012994825839996, "learning_rate": 0.0001, "loss": 0.199, "step": 304 }, { "epoch": 0.4972285621128138, "grad_norm": 0.12035807967185974, "learning_rate": 0.0001, "loss": 0.232, "step": 305 }, { "epoch": 0.4988588196935116, "grad_norm": 0.1246248111128807, "learning_rate": 0.0001, "loss": 0.2271, "step": 306 }, { "epoch": 0.5004890772742093, "grad_norm": 0.13514195382595062, "learning_rate": 0.0001, "loss": 0.2089, "step": 307 }, { "epoch": 0.5021193348549071, "grad_norm": 0.1276530623435974, "learning_rate": 0.0001, "loss": 0.2282, "step": 308 }, { "epoch": 0.5037495924356048, "grad_norm": 0.12651173770427704, "learning_rate": 0.0001, "loss": 0.2235, "step": 309 }, { "epoch": 0.5053798500163026, "grad_norm": 0.12165319919586182, "learning_rate": 0.0001, "loss": 0.2283, "step": 310 }, { "epoch": 0.5070101075970004, "grad_norm": 0.10687411576509476, "learning_rate": 0.0001, "loss": 0.2285, "step": 311 }, { "epoch": 0.5086403651776981, "grad_norm": 0.10170154273509979, "learning_rate": 0.0001, "loss": 0.2165, "step": 312 }, { "epoch": 0.5102706227583959, "grad_norm": 0.130455881357193, "learning_rate": 0.0001, "loss": 0.2236, "step": 313 }, { "epoch": 0.5119008803390935, "grad_norm": 0.12910182774066925, "learning_rate": 0.0001, "loss": 0.2339, "step": 314 }, { "epoch": 0.5135311379197913, "grad_norm": 0.12536931037902832, "learning_rate": 0.0001, "loss": 0.225, "step": 315 }, { "epoch": 0.5151613955004891, "grad_norm": 0.11117815226316452, "learning_rate": 0.0001, "loss": 0.2167, "step": 316 }, { "epoch": 0.5167916530811868, "grad_norm": 0.10664685070514679, "learning_rate": 0.0001, "loss": 0.2132, "step": 317 }, { "epoch": 0.5184219106618846, "grad_norm": 0.11488650739192963, "learning_rate": 0.0001, "loss": 0.2032, "step": 318 }, { "epoch": 0.5200521682425824, "grad_norm": 0.1376049518585205, "learning_rate": 0.0001, "loss": 0.2198, "step": 319 }, { "epoch": 0.5216824258232801, "grad_norm": 0.11661184579133987, "learning_rate": 0.0001, "loss": 0.2225, "step": 320 }, { "epoch": 0.5233126834039779, "grad_norm": 0.15519613027572632, "learning_rate": 0.0001, "loss": 0.2195, "step": 321 }, { "epoch": 0.5249429409846755, "grad_norm": 0.13077083230018616, "learning_rate": 0.0001, "loss": 0.2413, "step": 322 }, { "epoch": 0.5265731985653733, "grad_norm": 0.13975407183170319, "learning_rate": 0.0001, "loss": 0.2307, "step": 323 }, { "epoch": 0.5282034561460711, "grad_norm": 0.12319333851337433, "learning_rate": 0.0001, "loss": 0.2122, "step": 324 }, { "epoch": 0.5298337137267688, "grad_norm": 0.12614178657531738, "learning_rate": 0.0001, "loss": 0.2126, "step": 325 }, { "epoch": 0.5314639713074666, "grad_norm": 0.12005390971899033, "learning_rate": 0.0001, "loss": 0.2261, "step": 326 }, { "epoch": 0.5330942288881644, "grad_norm": 0.14331591129302979, "learning_rate": 0.0001, "loss": 0.2204, "step": 327 }, { "epoch": 0.5347244864688621, "grad_norm": 0.1153935119509697, "learning_rate": 0.0001, "loss": 0.2221, "step": 328 }, { "epoch": 0.5363547440495599, "grad_norm": 0.14517177641391754, "learning_rate": 0.0001, "loss": 0.2324, "step": 329 }, { "epoch": 0.5379850016302575, "grad_norm": 0.1347828060388565, "learning_rate": 0.0001, "loss": 0.2157, "step": 330 }, { "epoch": 0.5396152592109553, "grad_norm": 0.12938876450061798, "learning_rate": 0.0001, "loss": 0.221, "step": 331 }, { "epoch": 0.5412455167916531, "grad_norm": 0.12898674607276917, "learning_rate": 0.0001, "loss": 0.2243, "step": 332 }, { "epoch": 0.5428757743723508, "grad_norm": 0.1460563838481903, "learning_rate": 0.0001, "loss": 0.2091, "step": 333 }, { "epoch": 0.5445060319530486, "grad_norm": 0.12404835224151611, "learning_rate": 0.0001, "loss": 0.2299, "step": 334 }, { "epoch": 0.5461362895337464, "grad_norm": 0.10986137390136719, "learning_rate": 0.0001, "loss": 0.2119, "step": 335 }, { "epoch": 0.5477665471144441, "grad_norm": 0.10479142516851425, "learning_rate": 0.0001, "loss": 0.2132, "step": 336 }, { "epoch": 0.5493968046951418, "grad_norm": 0.14422518014907837, "learning_rate": 0.0001, "loss": 0.2379, "step": 337 }, { "epoch": 0.5510270622758395, "grad_norm": 0.13756605982780457, "learning_rate": 0.0001, "loss": 0.2205, "step": 338 }, { "epoch": 0.5526573198565373, "grad_norm": 0.15542298555374146, "learning_rate": 0.0001, "loss": 0.227, "step": 339 }, { "epoch": 0.5542875774372351, "grad_norm": 0.11967030167579651, "learning_rate": 0.0001, "loss": 0.2336, "step": 340 }, { "epoch": 0.5559178350179328, "grad_norm": 0.11930400133132935, "learning_rate": 0.0001, "loss": 0.2273, "step": 341 }, { "epoch": 0.5575480925986306, "grad_norm": 0.13011619448661804, "learning_rate": 0.0001, "loss": 0.2386, "step": 342 }, { "epoch": 0.5591783501793284, "grad_norm": 0.1742897927761078, "learning_rate": 0.0001, "loss": 0.2248, "step": 343 }, { "epoch": 0.5608086077600261, "grad_norm": 0.09916642308235168, "learning_rate": 0.0001, "loss": 0.2169, "step": 344 }, { "epoch": 0.5624388653407238, "grad_norm": 0.11124306917190552, "learning_rate": 0.0001, "loss": 0.2173, "step": 345 }, { "epoch": 0.5640691229214216, "grad_norm": 0.13426317274570465, "learning_rate": 0.0001, "loss": 0.2235, "step": 346 }, { "epoch": 0.5656993805021193, "grad_norm": 0.12277089059352875, "learning_rate": 0.0001, "loss": 0.2234, "step": 347 }, { "epoch": 0.5673296380828171, "grad_norm": 0.1190582811832428, "learning_rate": 0.0001, "loss": 0.2173, "step": 348 }, { "epoch": 0.5689598956635148, "grad_norm": 0.10108431428670883, "learning_rate": 0.0001, "loss": 0.2133, "step": 349 }, { "epoch": 0.5705901532442126, "grad_norm": 0.10684805363416672, "learning_rate": 0.0001, "loss": 0.2311, "step": 350 }, { "epoch": 0.5722204108249104, "grad_norm": 0.10276418924331665, "learning_rate": 0.0001, "loss": 0.2137, "step": 351 }, { "epoch": 0.573850668405608, "grad_norm": 0.10777677595615387, "learning_rate": 0.0001, "loss": 0.2083, "step": 352 }, { "epoch": 0.5754809259863058, "grad_norm": 0.12655171751976013, "learning_rate": 0.0001, "loss": 0.2269, "step": 353 }, { "epoch": 0.5771111835670036, "grad_norm": 0.11766703426837921, "learning_rate": 0.0001, "loss": 0.2185, "step": 354 }, { "epoch": 0.5787414411477013, "grad_norm": 0.10404328256845474, "learning_rate": 0.0001, "loss": 0.2027, "step": 355 }, { "epoch": 0.5803716987283991, "grad_norm": 0.10919814556837082, "learning_rate": 0.0001, "loss": 0.2025, "step": 356 }, { "epoch": 0.5820019563090968, "grad_norm": 0.12742376327514648, "learning_rate": 0.0001, "loss": 0.2129, "step": 357 }, { "epoch": 0.5836322138897946, "grad_norm": 0.133440762758255, "learning_rate": 0.0001, "loss": 0.2142, "step": 358 }, { "epoch": 0.5852624714704924, "grad_norm": 0.11150208115577698, "learning_rate": 0.0001, "loss": 0.216, "step": 359 }, { "epoch": 0.58689272905119, "grad_norm": 0.12375590205192566, "learning_rate": 0.0001, "loss": 0.2148, "step": 360 }, { "epoch": 0.5885229866318878, "grad_norm": 0.13547080755233765, "learning_rate": 0.0001, "loss": 0.2105, "step": 361 }, { "epoch": 0.5901532442125856, "grad_norm": 0.1400783509016037, "learning_rate": 0.0001, "loss": 0.2314, "step": 362 }, { "epoch": 0.5917835017932833, "grad_norm": 0.11818625032901764, "learning_rate": 0.0001, "loss": 0.2098, "step": 363 }, { "epoch": 0.5934137593739811, "grad_norm": 0.11656329035758972, "learning_rate": 0.0001, "loss": 0.2124, "step": 364 }, { "epoch": 0.5950440169546788, "grad_norm": 0.17707152664661407, "learning_rate": 0.0001, "loss": 0.2004, "step": 365 }, { "epoch": 0.5966742745353766, "grad_norm": 0.15075908601284027, "learning_rate": 0.0001, "loss": 0.2043, "step": 366 }, { "epoch": 0.5983045321160744, "grad_norm": 0.14985023438930511, "learning_rate": 0.0001, "loss": 0.2274, "step": 367 }, { "epoch": 0.599934789696772, "grad_norm": 0.13255703449249268, "learning_rate": 0.0001, "loss": 0.2174, "step": 368 }, { "epoch": 0.6015650472774698, "grad_norm": 0.11377043277025223, "learning_rate": 0.0001, "loss": 0.2285, "step": 369 }, { "epoch": 0.6031953048581676, "grad_norm": 0.10583726316690445, "learning_rate": 0.0001, "loss": 0.2111, "step": 370 }, { "epoch": 0.6048255624388653, "grad_norm": 0.10989916324615479, "learning_rate": 0.0001, "loss": 0.2277, "step": 371 }, { "epoch": 0.6064558200195631, "grad_norm": 0.13273194432258606, "learning_rate": 0.0001, "loss": 0.2269, "step": 372 }, { "epoch": 0.6080860776002608, "grad_norm": 0.12810608744621277, "learning_rate": 0.0001, "loss": 0.2369, "step": 373 }, { "epoch": 0.6097163351809586, "grad_norm": 0.11391692608594894, "learning_rate": 0.0001, "loss": 0.2178, "step": 374 }, { "epoch": 0.6113465927616564, "grad_norm": 0.18068620562553406, "learning_rate": 0.0001, "loss": 0.2257, "step": 375 }, { "epoch": 0.612976850342354, "grad_norm": 0.11202447861433029, "learning_rate": 0.0001, "loss": 0.1979, "step": 376 }, { "epoch": 0.6146071079230518, "grad_norm": 0.13879020512104034, "learning_rate": 0.0001, "loss": 0.2235, "step": 377 }, { "epoch": 0.6162373655037496, "grad_norm": 0.11466659605503082, "learning_rate": 0.0001, "loss": 0.2146, "step": 378 }, { "epoch": 0.6178676230844473, "grad_norm": 0.13301534950733185, "learning_rate": 0.0001, "loss": 0.2374, "step": 379 }, { "epoch": 0.6194978806651451, "grad_norm": 0.10728432238101959, "learning_rate": 0.0001, "loss": 0.2002, "step": 380 }, { "epoch": 0.6211281382458429, "grad_norm": 0.11212770640850067, "learning_rate": 0.0001, "loss": 0.2232, "step": 381 }, { "epoch": 0.6227583958265406, "grad_norm": 0.12755908071994781, "learning_rate": 0.0001, "loss": 0.2284, "step": 382 }, { "epoch": 0.6243886534072384, "grad_norm": 0.11284497380256653, "learning_rate": 0.0001, "loss": 0.2235, "step": 383 }, { "epoch": 0.626018910987936, "grad_norm": 0.13458143174648285, "learning_rate": 0.0001, "loss": 0.2183, "step": 384 }, { "epoch": 0.6276491685686338, "grad_norm": 0.13338471949100494, "learning_rate": 0.0001, "loss": 0.2222, "step": 385 }, { "epoch": 0.6292794261493316, "grad_norm": 0.11197499930858612, "learning_rate": 0.0001, "loss": 0.2184, "step": 386 }, { "epoch": 0.6309096837300293, "grad_norm": 0.11694587767124176, "learning_rate": 0.0001, "loss": 0.2231, "step": 387 }, { "epoch": 0.6325399413107271, "grad_norm": 0.12493859976530075, "learning_rate": 0.0001, "loss": 0.2138, "step": 388 }, { "epoch": 0.6341701988914249, "grad_norm": 0.12846441566944122, "learning_rate": 0.0001, "loss": 0.2244, "step": 389 }, { "epoch": 0.6358004564721226, "grad_norm": 0.11621380597352982, "learning_rate": 0.0001, "loss": 0.2232, "step": 390 }, { "epoch": 0.6374307140528204, "grad_norm": 0.12881864607334137, "learning_rate": 0.0001, "loss": 0.2076, "step": 391 }, { "epoch": 0.639060971633518, "grad_norm": 0.14580029249191284, "learning_rate": 0.0001, "loss": 0.222, "step": 392 }, { "epoch": 0.6406912292142158, "grad_norm": 0.11677811294794083, "learning_rate": 0.0001, "loss": 0.2239, "step": 393 }, { "epoch": 0.6423214867949136, "grad_norm": 0.13098081946372986, "learning_rate": 0.0001, "loss": 0.2217, "step": 394 }, { "epoch": 0.6439517443756113, "grad_norm": 0.13351522386074066, "learning_rate": 0.0001, "loss": 0.2129, "step": 395 }, { "epoch": 0.6455820019563091, "grad_norm": 0.10511767864227295, "learning_rate": 0.0001, "loss": 0.1971, "step": 396 }, { "epoch": 0.6472122595370069, "grad_norm": 0.14180706441402435, "learning_rate": 0.0001, "loss": 0.2195, "step": 397 }, { "epoch": 0.6488425171177046, "grad_norm": 0.1285708099603653, "learning_rate": 0.0001, "loss": 0.2371, "step": 398 }, { "epoch": 0.6504727746984024, "grad_norm": 0.11256030201911926, "learning_rate": 0.0001, "loss": 0.2181, "step": 399 }, { "epoch": 0.6521030322791, "grad_norm": 0.13401979207992554, "learning_rate": 0.0001, "loss": 0.2226, "step": 400 }, { "epoch": 0.6537332898597978, "grad_norm": 0.1150553822517395, "learning_rate": 0.0001, "loss": 0.2307, "step": 401 }, { "epoch": 0.6553635474404956, "grad_norm": 0.12015953660011292, "learning_rate": 0.0001, "loss": 0.2107, "step": 402 }, { "epoch": 0.6569938050211933, "grad_norm": 0.10118319094181061, "learning_rate": 0.0001, "loss": 0.2045, "step": 403 }, { "epoch": 0.6586240626018911, "grad_norm": 0.12295553833246231, "learning_rate": 0.0001, "loss": 0.2206, "step": 404 }, { "epoch": 0.6602543201825889, "grad_norm": 0.11807361245155334, "learning_rate": 0.0001, "loss": 0.2197, "step": 405 }, { "epoch": 0.6618845777632866, "grad_norm": 0.12740248441696167, "learning_rate": 0.0001, "loss": 0.2172, "step": 406 }, { "epoch": 0.6635148353439844, "grad_norm": 0.12042457610368729, "learning_rate": 0.0001, "loss": 0.2225, "step": 407 }, { "epoch": 0.665145092924682, "grad_norm": 0.17038275301456451, "learning_rate": 0.0001, "loss": 0.233, "step": 408 }, { "epoch": 0.6667753505053798, "grad_norm": 0.11151735484600067, "learning_rate": 0.0001, "loss": 0.2013, "step": 409 }, { "epoch": 0.6684056080860776, "grad_norm": 0.12992088496685028, "learning_rate": 0.0001, "loss": 0.2128, "step": 410 }, { "epoch": 0.6700358656667753, "grad_norm": 0.10836778581142426, "learning_rate": 0.0001, "loss": 0.2187, "step": 411 }, { "epoch": 0.6716661232474731, "grad_norm": 0.128456249833107, "learning_rate": 0.0001, "loss": 0.2078, "step": 412 }, { "epoch": 0.6732963808281709, "grad_norm": 0.12796828150749207, "learning_rate": 0.0001, "loss": 0.2222, "step": 413 }, { "epoch": 0.6749266384088686, "grad_norm": 0.11841002106666565, "learning_rate": 0.0001, "loss": 0.2127, "step": 414 }, { "epoch": 0.6765568959895664, "grad_norm": 0.11400352418422699, "learning_rate": 0.0001, "loss": 0.2004, "step": 415 }, { "epoch": 0.6781871535702642, "grad_norm": 0.1325376331806183, "learning_rate": 0.0001, "loss": 0.218, "step": 416 }, { "epoch": 0.6798174111509618, "grad_norm": 0.128141388297081, "learning_rate": 0.0001, "loss": 0.2095, "step": 417 }, { "epoch": 0.6814476687316596, "grad_norm": 0.1291760355234146, "learning_rate": 0.0001, "loss": 0.2231, "step": 418 }, { "epoch": 0.6830779263123573, "grad_norm": 0.12524424493312836, "learning_rate": 0.0001, "loss": 0.2143, "step": 419 }, { "epoch": 0.6847081838930551, "grad_norm": 0.128330260515213, "learning_rate": 0.0001, "loss": 0.199, "step": 420 }, { "epoch": 0.6863384414737529, "grad_norm": 0.12733139097690582, "learning_rate": 0.0001, "loss": 0.2145, "step": 421 }, { "epoch": 0.6879686990544506, "grad_norm": 0.10170670598745346, "learning_rate": 0.0001, "loss": 0.2053, "step": 422 }, { "epoch": 0.6895989566351484, "grad_norm": 0.11686375737190247, "learning_rate": 0.0001, "loss": 0.2193, "step": 423 }, { "epoch": 0.6912292142158462, "grad_norm": 0.1116994172334671, "learning_rate": 0.0001, "loss": 0.22, "step": 424 }, { "epoch": 0.6928594717965438, "grad_norm": 0.10491323471069336, "learning_rate": 0.0001, "loss": 0.2127, "step": 425 }, { "epoch": 0.6944897293772416, "grad_norm": 0.10770369321107864, "learning_rate": 0.0001, "loss": 0.2046, "step": 426 }, { "epoch": 0.6961199869579393, "grad_norm": 0.12084191292524338, "learning_rate": 0.0001, "loss": 0.2174, "step": 427 }, { "epoch": 0.6977502445386371, "grad_norm": 0.12580984830856323, "learning_rate": 0.0001, "loss": 0.2173, "step": 428 }, { "epoch": 0.6993805021193349, "grad_norm": 0.1020190566778183, "learning_rate": 0.0001, "loss": 0.213, "step": 429 }, { "epoch": 0.7010107597000326, "grad_norm": 0.10437814146280289, "learning_rate": 0.0001, "loss": 0.2187, "step": 430 }, { "epoch": 0.7026410172807304, "grad_norm": 0.13456971943378448, "learning_rate": 0.0001, "loss": 0.2153, "step": 431 }, { "epoch": 0.7042712748614282, "grad_norm": 0.12021128088235855, "learning_rate": 0.0001, "loss": 0.2146, "step": 432 }, { "epoch": 0.7059015324421258, "grad_norm": 0.15099307894706726, "learning_rate": 0.0001, "loss": 0.2154, "step": 433 }, { "epoch": 0.7075317900228236, "grad_norm": 0.11958514899015427, "learning_rate": 0.0001, "loss": 0.216, "step": 434 }, { "epoch": 0.7091620476035213, "grad_norm": 0.11153008043766022, "learning_rate": 0.0001, "loss": 0.2034, "step": 435 }, { "epoch": 0.7107923051842191, "grad_norm": 0.14314942061901093, "learning_rate": 0.0001, "loss": 0.2005, "step": 436 }, { "epoch": 0.7124225627649169, "grad_norm": 0.10860710591077805, "learning_rate": 0.0001, "loss": 0.21, "step": 437 }, { "epoch": 0.7140528203456146, "grad_norm": 0.1300295740365982, "learning_rate": 0.0001, "loss": 0.2148, "step": 438 }, { "epoch": 0.7156830779263124, "grad_norm": 0.11765281856060028, "learning_rate": 0.0001, "loss": 0.2144, "step": 439 }, { "epoch": 0.7173133355070102, "grad_norm": 0.1349002867937088, "learning_rate": 0.0001, "loss": 0.2079, "step": 440 }, { "epoch": 0.7189435930877078, "grad_norm": 0.15499438345432281, "learning_rate": 0.0001, "loss": 0.2223, "step": 441 }, { "epoch": 0.7205738506684056, "grad_norm": 0.15518735349178314, "learning_rate": 0.0001, "loss": 0.2317, "step": 442 }, { "epoch": 0.7222041082491033, "grad_norm": 0.16259780526161194, "learning_rate": 0.0001, "loss": 0.2194, "step": 443 }, { "epoch": 0.7238343658298011, "grad_norm": 0.13394394516944885, "learning_rate": 0.0001, "loss": 0.2175, "step": 444 }, { "epoch": 0.7254646234104989, "grad_norm": 0.1212131530046463, "learning_rate": 0.0001, "loss": 0.2172, "step": 445 }, { "epoch": 0.7270948809911966, "grad_norm": 0.1333845555782318, "learning_rate": 0.0001, "loss": 0.2125, "step": 446 }, { "epoch": 0.7287251385718944, "grad_norm": 0.1470658779144287, "learning_rate": 0.0001, "loss": 0.2263, "step": 447 }, { "epoch": 0.7303553961525922, "grad_norm": 0.11653272062540054, "learning_rate": 0.0001, "loss": 0.2203, "step": 448 }, { "epoch": 0.7319856537332898, "grad_norm": 0.11255240440368652, "learning_rate": 0.0001, "loss": 0.2063, "step": 449 }, { "epoch": 0.7336159113139876, "grad_norm": 0.1200360506772995, "learning_rate": 0.0001, "loss": 0.217, "step": 450 }, { "epoch": 0.7352461688946854, "grad_norm": 0.12475258111953735, "learning_rate": 0.0001, "loss": 0.2232, "step": 451 }, { "epoch": 0.7368764264753831, "grad_norm": 0.11277184635400772, "learning_rate": 0.0001, "loss": 0.2314, "step": 452 }, { "epoch": 0.7385066840560809, "grad_norm": 0.11270298063755035, "learning_rate": 0.0001, "loss": 0.2112, "step": 453 }, { "epoch": 0.7401369416367786, "grad_norm": 0.12822334468364716, "learning_rate": 0.0001, "loss": 0.2233, "step": 454 }, { "epoch": 0.7417671992174764, "grad_norm": 0.11773821711540222, "learning_rate": 0.0001, "loss": 0.2213, "step": 455 }, { "epoch": 0.7433974567981741, "grad_norm": 0.12635211646556854, "learning_rate": 0.0001, "loss": 0.2001, "step": 456 }, { "epoch": 0.7450277143788718, "grad_norm": 0.15887145698070526, "learning_rate": 0.0001, "loss": 0.2276, "step": 457 }, { "epoch": 0.7466579719595696, "grad_norm": 0.12271532416343689, "learning_rate": 0.0001, "loss": 0.219, "step": 458 }, { "epoch": 0.7482882295402674, "grad_norm": 0.14966937899589539, "learning_rate": 0.0001, "loss": 0.2119, "step": 459 }, { "epoch": 0.7499184871209651, "grad_norm": 0.15153633058071136, "learning_rate": 0.0001, "loss": 0.2316, "step": 460 }, { "epoch": 0.7515487447016629, "grad_norm": 0.14428603649139404, "learning_rate": 0.0001, "loss": 0.2197, "step": 461 }, { "epoch": 0.7531790022823606, "grad_norm": 0.12772101163864136, "learning_rate": 0.0001, "loss": 0.2159, "step": 462 }, { "epoch": 0.7548092598630584, "grad_norm": 0.12549546360969543, "learning_rate": 0.0001, "loss": 0.2176, "step": 463 }, { "epoch": 0.7564395174437561, "grad_norm": 0.13936737179756165, "learning_rate": 0.0001, "loss": 0.2187, "step": 464 }, { "epoch": 0.7580697750244538, "grad_norm": 0.11733359098434448, "learning_rate": 0.0001, "loss": 0.2013, "step": 465 }, { "epoch": 0.7597000326051516, "grad_norm": 0.14219063520431519, "learning_rate": 0.0001, "loss": 0.2263, "step": 466 }, { "epoch": 0.7613302901858494, "grad_norm": 0.11690539866685867, "learning_rate": 0.0001, "loss": 0.2108, "step": 467 }, { "epoch": 0.7629605477665471, "grad_norm": 0.13221612572669983, "learning_rate": 0.0001, "loss": 0.2051, "step": 468 }, { "epoch": 0.7645908053472449, "grad_norm": 0.14684675633907318, "learning_rate": 0.0001, "loss": 0.2168, "step": 469 }, { "epoch": 0.7662210629279426, "grad_norm": 0.1189989224076271, "learning_rate": 0.0001, "loss": 0.2204, "step": 470 }, { "epoch": 0.7678513205086404, "grad_norm": 0.12807010114192963, "learning_rate": 0.0001, "loss": 0.2285, "step": 471 }, { "epoch": 0.7694815780893381, "grad_norm": 0.13275974988937378, "learning_rate": 0.0001, "loss": 0.219, "step": 472 }, { "epoch": 0.7711118356700358, "grad_norm": 0.1249246746301651, "learning_rate": 0.0001, "loss": 0.2218, "step": 473 }, { "epoch": 0.7727420932507336, "grad_norm": 0.13339447975158691, "learning_rate": 0.0001, "loss": 0.2068, "step": 474 }, { "epoch": 0.7743723508314314, "grad_norm": 0.12550900876522064, "learning_rate": 0.0001, "loss": 0.2103, "step": 475 }, { "epoch": 0.7760026084121291, "grad_norm": 0.11490708589553833, "learning_rate": 0.0001, "loss": 0.2033, "step": 476 }, { "epoch": 0.7776328659928269, "grad_norm": 0.12597325444221497, "learning_rate": 0.0001, "loss": 0.2135, "step": 477 }, { "epoch": 0.7792631235735246, "grad_norm": 0.15443576872348785, "learning_rate": 0.0001, "loss": 0.2133, "step": 478 }, { "epoch": 0.7808933811542224, "grad_norm": 0.19461177289485931, "learning_rate": 0.0001, "loss": 0.1955, "step": 479 }, { "epoch": 0.7825236387349201, "grad_norm": 0.1330460160970688, "learning_rate": 0.0001, "loss": 0.1979, "step": 480 }, { "epoch": 0.7841538963156178, "grad_norm": 0.11665869504213333, "learning_rate": 0.0001, "loss": 0.2149, "step": 481 }, { "epoch": 0.7857841538963156, "grad_norm": 0.09942878782749176, "learning_rate": 0.0001, "loss": 0.2009, "step": 482 }, { "epoch": 0.7874144114770134, "grad_norm": 0.11635838449001312, "learning_rate": 0.0001, "loss": 0.2046, "step": 483 }, { "epoch": 0.7890446690577111, "grad_norm": 0.11658889055252075, "learning_rate": 0.0001, "loss": 0.2154, "step": 484 }, { "epoch": 0.7906749266384089, "grad_norm": 0.122380830347538, "learning_rate": 0.0001, "loss": 0.2003, "step": 485 }, { "epoch": 0.7923051842191067, "grad_norm": 0.11796704679727554, "learning_rate": 0.0001, "loss": 0.1966, "step": 486 }, { "epoch": 0.7939354417998044, "grad_norm": 0.10663303732872009, "learning_rate": 0.0001, "loss": 0.2166, "step": 487 }, { "epoch": 0.7955656993805021, "grad_norm": 0.11099519580602646, "learning_rate": 0.0001, "loss": 0.209, "step": 488 }, { "epoch": 0.7971959569611998, "grad_norm": 0.14217236638069153, "learning_rate": 0.0001, "loss": 0.1955, "step": 489 }, { "epoch": 0.7988262145418976, "grad_norm": 0.10185975581407547, "learning_rate": 0.0001, "loss": 0.194, "step": 490 }, { "epoch": 0.8004564721225954, "grad_norm": 0.11912322789430618, "learning_rate": 0.0001, "loss": 0.1987, "step": 491 }, { "epoch": 0.8020867297032931, "grad_norm": 0.11036073416471481, "learning_rate": 0.0001, "loss": 0.199, "step": 492 }, { "epoch": 0.8037169872839909, "grad_norm": 0.13446538150310516, "learning_rate": 0.0001, "loss": 0.214, "step": 493 }, { "epoch": 0.8053472448646887, "grad_norm": 0.12443588674068451, "learning_rate": 0.0001, "loss": 0.2067, "step": 494 }, { "epoch": 0.8069775024453864, "grad_norm": 0.12884607911109924, "learning_rate": 0.0001, "loss": 0.2093, "step": 495 }, { "epoch": 0.8086077600260841, "grad_norm": 0.1282474249601364, "learning_rate": 0.0001, "loss": 0.1896, "step": 496 }, { "epoch": 0.8102380176067818, "grad_norm": 0.15667201578617096, "learning_rate": 0.0001, "loss": 0.2275, "step": 497 }, { "epoch": 0.8118682751874796, "grad_norm": 0.12083553522825241, "learning_rate": 0.0001, "loss": 0.2061, "step": 498 }, { "epoch": 0.8134985327681774, "grad_norm": 0.1412494033575058, "learning_rate": 0.0001, "loss": 0.2288, "step": 499 }, { "epoch": 0.8151287903488751, "grad_norm": 0.11393098533153534, "learning_rate": 0.0001, "loss": 0.1828, "step": 500 }, { "epoch": 0.8151287903488751, "eval_loss": 0.21177859604358673, "eval_runtime": 2800.3476, "eval_samples_per_second": 0.674, "eval_steps_per_second": 0.169, "step": 500 }, { "epoch": 0.8167590479295729, "grad_norm": 0.14163638651371002, "learning_rate": 0.0001, "loss": 0.2251, "step": 501 }, { "epoch": 0.8183893055102707, "grad_norm": 0.14434993267059326, "learning_rate": 0.0001, "loss": 0.2147, "step": 502 }, { "epoch": 0.8200195630909684, "grad_norm": 0.11927150934934616, "learning_rate": 0.0001, "loss": 0.1887, "step": 503 }, { "epoch": 0.8216498206716661, "grad_norm": 0.12366633117198944, "learning_rate": 0.0001, "loss": 0.1949, "step": 504 }, { "epoch": 0.8232800782523638, "grad_norm": 0.1395760029554367, "learning_rate": 0.0001, "loss": 0.1998, "step": 505 }, { "epoch": 0.8249103358330616, "grad_norm": 0.1172211617231369, "learning_rate": 0.0001, "loss": 0.196, "step": 506 }, { "epoch": 0.8265405934137594, "grad_norm": 0.14511209726333618, "learning_rate": 0.0001, "loss": 0.2301, "step": 507 }, { "epoch": 0.8281708509944571, "grad_norm": 0.17221054434776306, "learning_rate": 0.0001, "loss": 0.2243, "step": 508 }, { "epoch": 0.8298011085751549, "grad_norm": 0.1269037127494812, "learning_rate": 0.0001, "loss": 0.2086, "step": 509 }, { "epoch": 0.8314313661558527, "grad_norm": 0.13336747884750366, "learning_rate": 0.0001, "loss": 0.2085, "step": 510 }, { "epoch": 0.8330616237365503, "grad_norm": 0.13246330618858337, "learning_rate": 0.0001, "loss": 0.2264, "step": 511 }, { "epoch": 0.8346918813172481, "grad_norm": 0.11336122453212738, "learning_rate": 0.0001, "loss": 0.204, "step": 512 }, { "epoch": 0.8363221388979458, "grad_norm": 0.12755842506885529, "learning_rate": 0.0001, "loss": 0.2088, "step": 513 }, { "epoch": 0.8379523964786436, "grad_norm": 0.11611583828926086, "learning_rate": 0.0001, "loss": 0.2037, "step": 514 }, { "epoch": 0.8395826540593414, "grad_norm": 0.09402882307767868, "learning_rate": 0.0001, "loss": 0.1939, "step": 515 }, { "epoch": 0.8412129116400391, "grad_norm": 0.12100663781166077, "learning_rate": 0.0001, "loss": 0.2104, "step": 516 }, { "epoch": 0.8428431692207369, "grad_norm": 0.14289285242557526, "learning_rate": 0.0001, "loss": 0.2216, "step": 517 }, { "epoch": 0.8444734268014347, "grad_norm": 0.10300295054912567, "learning_rate": 0.0001, "loss": 0.1796, "step": 518 }, { "epoch": 0.8461036843821323, "grad_norm": 0.12143992632627487, "learning_rate": 0.0001, "loss": 0.2177, "step": 519 }, { "epoch": 0.8477339419628301, "grad_norm": 0.12010245770215988, "learning_rate": 0.0001, "loss": 0.2016, "step": 520 }, { "epoch": 0.8493641995435279, "grad_norm": 0.1474478542804718, "learning_rate": 0.0001, "loss": 0.2062, "step": 521 }, { "epoch": 0.8509944571242256, "grad_norm": 0.12784548103809357, "learning_rate": 0.0001, "loss": 0.2317, "step": 522 }, { "epoch": 0.8526247147049234, "grad_norm": 0.11985955387353897, "learning_rate": 0.0001, "loss": 0.2196, "step": 523 }, { "epoch": 0.8542549722856211, "grad_norm": 0.11471915990114212, "learning_rate": 0.0001, "loss": 0.2082, "step": 524 }, { "epoch": 0.8558852298663189, "grad_norm": 0.135053813457489, "learning_rate": 0.0001, "loss": 0.215, "step": 525 }, { "epoch": 0.8575154874470167, "grad_norm": 0.10875121504068375, "learning_rate": 0.0001, "loss": 0.2143, "step": 526 }, { "epoch": 0.8591457450277143, "grad_norm": 0.11824619024991989, "learning_rate": 0.0001, "loss": 0.2276, "step": 527 }, { "epoch": 0.8607760026084121, "grad_norm": 0.10967472940683365, "learning_rate": 0.0001, "loss": 0.1933, "step": 528 }, { "epoch": 0.8624062601891099, "grad_norm": 0.09989364445209503, "learning_rate": 0.0001, "loss": 0.2196, "step": 529 }, { "epoch": 0.8640365177698076, "grad_norm": 0.10137049108743668, "learning_rate": 0.0001, "loss": 0.1993, "step": 530 }, { "epoch": 0.8656667753505054, "grad_norm": 0.13142383098602295, "learning_rate": 0.0001, "loss": 0.2221, "step": 531 }, { "epoch": 0.8672970329312031, "grad_norm": 0.1282247006893158, "learning_rate": 0.0001, "loss": 0.2167, "step": 532 }, { "epoch": 0.8689272905119009, "grad_norm": 0.10532195121049881, "learning_rate": 0.0001, "loss": 0.2143, "step": 533 }, { "epoch": 0.8705575480925987, "grad_norm": 0.10713239759206772, "learning_rate": 0.0001, "loss": 0.2045, "step": 534 }, { "epoch": 0.8721878056732963, "grad_norm": 0.12712952494621277, "learning_rate": 0.0001, "loss": 0.2264, "step": 535 }, { "epoch": 0.8738180632539941, "grad_norm": 0.12850171327590942, "learning_rate": 0.0001, "loss": 0.2284, "step": 536 }, { "epoch": 0.8754483208346919, "grad_norm": 0.12360873818397522, "learning_rate": 0.0001, "loss": 0.197, "step": 537 }, { "epoch": 0.8770785784153896, "grad_norm": 0.12052212655544281, "learning_rate": 0.0001, "loss": 0.1906, "step": 538 }, { "epoch": 0.8787088359960874, "grad_norm": 0.14144307374954224, "learning_rate": 0.0001, "loss": 0.2131, "step": 539 }, { "epoch": 0.8803390935767851, "grad_norm": 0.11279332637786865, "learning_rate": 0.0001, "loss": 0.2145, "step": 540 }, { "epoch": 0.8819693511574829, "grad_norm": 0.161105215549469, "learning_rate": 0.0001, "loss": 0.2212, "step": 541 }, { "epoch": 0.8835996087381807, "grad_norm": 0.11746472120285034, "learning_rate": 0.0001, "loss": 0.213, "step": 542 }, { "epoch": 0.8852298663188783, "grad_norm": 0.1244741752743721, "learning_rate": 0.0001, "loss": 0.2065, "step": 543 }, { "epoch": 0.8868601238995761, "grad_norm": 0.1004570797085762, "learning_rate": 0.0001, "loss": 0.2086, "step": 544 }, { "epoch": 0.8884903814802739, "grad_norm": 0.10560411214828491, "learning_rate": 0.0001, "loss": 0.204, "step": 545 }, { "epoch": 0.8901206390609716, "grad_norm": 0.11466600000858307, "learning_rate": 0.0001, "loss": 0.2012, "step": 546 }, { "epoch": 0.8917508966416694, "grad_norm": 0.11946078389883041, "learning_rate": 0.0001, "loss": 0.21, "step": 547 }, { "epoch": 0.8933811542223671, "grad_norm": 0.13734155893325806, "learning_rate": 0.0001, "loss": 0.2166, "step": 548 }, { "epoch": 0.8950114118030649, "grad_norm": 0.11657248437404633, "learning_rate": 0.0001, "loss": 0.2106, "step": 549 }, { "epoch": 0.8966416693837627, "grad_norm": 0.1141151636838913, "learning_rate": 0.0001, "loss": 0.1827, "step": 550 }, { "epoch": 0.8982719269644603, "grad_norm": 0.1335282176733017, "learning_rate": 0.0001, "loss": 0.2069, "step": 551 }, { "epoch": 0.8999021845451581, "grad_norm": 0.12621258199214935, "learning_rate": 0.0001, "loss": 0.1985, "step": 552 }, { "epoch": 0.9015324421258559, "grad_norm": 0.13729962706565857, "learning_rate": 0.0001, "loss": 0.2068, "step": 553 }, { "epoch": 0.9031626997065536, "grad_norm": 0.11198980361223221, "learning_rate": 0.0001, "loss": 0.2078, "step": 554 }, { "epoch": 0.9047929572872514, "grad_norm": 0.12503336369991302, "learning_rate": 0.0001, "loss": 0.2077, "step": 555 }, { "epoch": 0.9064232148679492, "grad_norm": 0.14169982075691223, "learning_rate": 0.0001, "loss": 0.2052, "step": 556 }, { "epoch": 0.9080534724486469, "grad_norm": 0.10698267072439194, "learning_rate": 0.0001, "loss": 0.1958, "step": 557 }, { "epoch": 0.9096837300293447, "grad_norm": 0.1422925740480423, "learning_rate": 0.0001, "loss": 0.1995, "step": 558 }, { "epoch": 0.9113139876100423, "grad_norm": 0.11455567926168442, "learning_rate": 0.0001, "loss": 0.1998, "step": 559 }, { "epoch": 0.9129442451907401, "grad_norm": 0.1599031537771225, "learning_rate": 0.0001, "loss": 0.2113, "step": 560 }, { "epoch": 0.9145745027714379, "grad_norm": 0.13150307536125183, "learning_rate": 0.0001, "loss": 0.217, "step": 561 }, { "epoch": 0.9162047603521356, "grad_norm": 0.10883089154958725, "learning_rate": 0.0001, "loss": 0.1914, "step": 562 }, { "epoch": 0.9178350179328334, "grad_norm": 0.12056294828653336, "learning_rate": 0.0001, "loss": 0.2008, "step": 563 }, { "epoch": 0.9194652755135312, "grad_norm": 0.13235759735107422, "learning_rate": 0.0001, "loss": 0.2019, "step": 564 }, { "epoch": 0.9210955330942289, "grad_norm": 0.11871761828660965, "learning_rate": 0.0001, "loss": 0.2075, "step": 565 }, { "epoch": 0.9227257906749267, "grad_norm": 0.13773424923419952, "learning_rate": 0.0001, "loss": 0.2157, "step": 566 }, { "epoch": 0.9243560482556243, "grad_norm": 0.12058678269386292, "learning_rate": 0.0001, "loss": 0.1872, "step": 567 }, { "epoch": 0.9259863058363221, "grad_norm": 0.12389074265956879, "learning_rate": 0.0001, "loss": 0.2092, "step": 568 }, { "epoch": 0.9276165634170199, "grad_norm": 0.13207301497459412, "learning_rate": 0.0001, "loss": 0.2111, "step": 569 }, { "epoch": 0.9292468209977176, "grad_norm": 0.12088489532470703, "learning_rate": 0.0001, "loss": 0.1886, "step": 570 }, { "epoch": 0.9308770785784154, "grad_norm": 0.1485392153263092, "learning_rate": 0.0001, "loss": 0.2148, "step": 571 }, { "epoch": 0.9325073361591132, "grad_norm": 0.15602104365825653, "learning_rate": 0.0001, "loss": 0.2115, "step": 572 }, { "epoch": 0.9341375937398109, "grad_norm": 0.1451883614063263, "learning_rate": 0.0001, "loss": 0.2147, "step": 573 }, { "epoch": 0.9357678513205087, "grad_norm": 0.15947164595127106, "learning_rate": 0.0001, "loss": 0.2026, "step": 574 }, { "epoch": 0.9373981089012063, "grad_norm": 0.15256251394748688, "learning_rate": 0.0001, "loss": 0.209, "step": 575 }, { "epoch": 0.9390283664819041, "grad_norm": 0.1322648674249649, "learning_rate": 0.0001, "loss": 0.198, "step": 576 }, { "epoch": 0.9406586240626019, "grad_norm": 0.11925152689218521, "learning_rate": 0.0001, "loss": 0.2055, "step": 577 }, { "epoch": 0.9422888816432996, "grad_norm": 0.12666535377502441, "learning_rate": 0.0001, "loss": 0.2063, "step": 578 }, { "epoch": 0.9439191392239974, "grad_norm": 0.10876161605119705, "learning_rate": 0.0001, "loss": 0.2113, "step": 579 }, { "epoch": 0.9455493968046952, "grad_norm": 0.12513647973537445, "learning_rate": 0.0001, "loss": 0.2093, "step": 580 }, { "epoch": 0.9471796543853929, "grad_norm": 0.13541190326213837, "learning_rate": 0.0001, "loss": 0.2048, "step": 581 }, { "epoch": 0.9488099119660907, "grad_norm": 0.12051568925380707, "learning_rate": 0.0001, "loss": 0.2205, "step": 582 }, { "epoch": 0.9504401695467883, "grad_norm": 0.11748453974723816, "learning_rate": 0.0001, "loss": 0.211, "step": 583 }, { "epoch": 0.9520704271274861, "grad_norm": 0.11520763486623764, "learning_rate": 0.0001, "loss": 0.2033, "step": 584 }, { "epoch": 0.9537006847081839, "grad_norm": 0.11155470460653305, "learning_rate": 0.0001, "loss": 0.2004, "step": 585 }, { "epoch": 0.9553309422888816, "grad_norm": 0.12134213000535965, "learning_rate": 0.0001, "loss": 0.2137, "step": 586 }, { "epoch": 0.9569611998695794, "grad_norm": 0.1297822743654251, "learning_rate": 0.0001, "loss": 0.2052, "step": 587 }, { "epoch": 0.9585914574502772, "grad_norm": 0.12923404574394226, "learning_rate": 0.0001, "loss": 0.209, "step": 588 }, { "epoch": 0.9602217150309749, "grad_norm": 0.20898625254631042, "learning_rate": 0.0001, "loss": 0.2047, "step": 589 }, { "epoch": 0.9618519726116727, "grad_norm": 0.12227951735258102, "learning_rate": 0.0001, "loss": 0.2107, "step": 590 }, { "epoch": 0.9634822301923704, "grad_norm": 0.12081098556518555, "learning_rate": 0.0001, "loss": 0.2093, "step": 591 }, { "epoch": 0.9651124877730681, "grad_norm": 0.13986457884311676, "learning_rate": 0.0001, "loss": 0.2161, "step": 592 }, { "epoch": 0.9667427453537659, "grad_norm": 0.12605050206184387, "learning_rate": 0.0001, "loss": 0.2153, "step": 593 }, { "epoch": 0.9683730029344636, "grad_norm": 0.11643191426992416, "learning_rate": 0.0001, "loss": 0.2062, "step": 594 }, { "epoch": 0.9700032605151614, "grad_norm": 0.12521113455295563, "learning_rate": 0.0001, "loss": 0.2121, "step": 595 }, { "epoch": 0.9716335180958592, "grad_norm": 0.11723621189594269, "learning_rate": 0.0001, "loss": 0.2053, "step": 596 }, { "epoch": 0.9732637756765569, "grad_norm": 0.12405609339475632, "learning_rate": 0.0001, "loss": 0.2104, "step": 597 }, { "epoch": 0.9748940332572547, "grad_norm": 0.10823733359575272, "learning_rate": 0.0001, "loss": 0.2174, "step": 598 }, { "epoch": 0.9765242908379524, "grad_norm": 0.11131487786769867, "learning_rate": 0.0001, "loss": 0.213, "step": 599 }, { "epoch": 0.9781545484186501, "grad_norm": 0.13913866877555847, "learning_rate": 0.0001, "loss": 0.2086, "step": 600 }, { "epoch": 0.9797848059993479, "grad_norm": 0.11523352563381195, "learning_rate": 0.0001, "loss": 0.2103, "step": 601 }, { "epoch": 0.9814150635800456, "grad_norm": 0.1258571743965149, "learning_rate": 0.0001, "loss": 0.2082, "step": 602 }, { "epoch": 0.9830453211607434, "grad_norm": 0.14156940579414368, "learning_rate": 0.0001, "loss": 0.2005, "step": 603 }, { "epoch": 0.9846755787414412, "grad_norm": 0.1144745722413063, "learning_rate": 0.0001, "loss": 0.218, "step": 604 }, { "epoch": 0.9863058363221389, "grad_norm": 0.12323161959648132, "learning_rate": 0.0001, "loss": 0.2007, "step": 605 }, { "epoch": 0.9879360939028367, "grad_norm": 0.1369849145412445, "learning_rate": 0.0001, "loss": 0.2187, "step": 606 }, { "epoch": 0.9895663514835344, "grad_norm": 0.13043704628944397, "learning_rate": 0.0001, "loss": 0.2111, "step": 607 }, { "epoch": 0.9911966090642321, "grad_norm": 0.11909907311201096, "learning_rate": 0.0001, "loss": 0.2054, "step": 608 }, { "epoch": 0.9928268666449299, "grad_norm": 0.12159440666437149, "learning_rate": 0.0001, "loss": 0.2168, "step": 609 }, { "epoch": 0.9944571242256276, "grad_norm": 0.11479892581701279, "learning_rate": 0.0001, "loss": 0.2116, "step": 610 }, { "epoch": 0.9960873818063254, "grad_norm": 0.12101448327302933, "learning_rate": 0.0001, "loss": 0.2054, "step": 611 }, { "epoch": 0.9977176393870232, "grad_norm": 0.1083630621433258, "learning_rate": 0.0001, "loss": 0.1981, "step": 612 }, { "epoch": 0.9993478969677209, "grad_norm": 0.09980759769678116, "learning_rate": 0.0001, "loss": 0.2045, "step": 613 }, { "epoch": 1.0009781545484187, "grad_norm": 0.1311112940311432, "learning_rate": 0.0001, "loss": 0.2099, "step": 614 }, { "epoch": 1.0026084121291163, "grad_norm": 0.12701699137687683, "learning_rate": 0.0001, "loss": 0.1989, "step": 615 }, { "epoch": 1.0042386697098142, "grad_norm": 0.10910508036613464, "learning_rate": 0.0001, "loss": 0.2063, "step": 616 }, { "epoch": 1.005868927290512, "grad_norm": 0.11964251101016998, "learning_rate": 0.0001, "loss": 0.2046, "step": 617 }, { "epoch": 1.0074991848712096, "grad_norm": 0.11345476657152176, "learning_rate": 0.0001, "loss": 0.1933, "step": 618 }, { "epoch": 1.0091294424519075, "grad_norm": 0.11260963976383209, "learning_rate": 0.0001, "loss": 0.2019, "step": 619 }, { "epoch": 1.0107597000326052, "grad_norm": 0.13378599286079407, "learning_rate": 0.0001, "loss": 0.2074, "step": 620 }, { "epoch": 1.0123899576133029, "grad_norm": 0.10661476850509644, "learning_rate": 0.0001, "loss": 0.1972, "step": 621 }, { "epoch": 1.0140202151940008, "grad_norm": 0.12620778381824493, "learning_rate": 0.0001, "loss": 0.2111, "step": 622 }, { "epoch": 1.0156504727746984, "grad_norm": 0.12133750319480896, "learning_rate": 0.0001, "loss": 0.2022, "step": 623 }, { "epoch": 1.0172807303553961, "grad_norm": 0.10500383377075195, "learning_rate": 0.0001, "loss": 0.1778, "step": 624 }, { "epoch": 1.0189109879360938, "grad_norm": 0.12143319845199585, "learning_rate": 0.0001, "loss": 0.211, "step": 625 }, { "epoch": 1.0205412455167917, "grad_norm": 0.11858654022216797, "learning_rate": 0.0001, "loss": 0.1871, "step": 626 }, { "epoch": 1.0221715030974894, "grad_norm": 0.12005385756492615, "learning_rate": 0.0001, "loss": 0.2077, "step": 627 }, { "epoch": 1.023801760678187, "grad_norm": 0.13842667639255524, "learning_rate": 0.0001, "loss": 0.2263, "step": 628 }, { "epoch": 1.025432018258885, "grad_norm": 0.14517144858837128, "learning_rate": 0.0001, "loss": 0.2117, "step": 629 }, { "epoch": 1.0270622758395827, "grad_norm": 0.12258317321538925, "learning_rate": 0.0001, "loss": 0.1921, "step": 630 }, { "epoch": 1.0286925334202803, "grad_norm": 0.10833742469549179, "learning_rate": 0.0001, "loss": 0.1868, "step": 631 }, { "epoch": 1.0303227910009782, "grad_norm": 0.15066489577293396, "learning_rate": 0.0001, "loss": 0.206, "step": 632 }, { "epoch": 1.031953048581676, "grad_norm": 0.11644082516431808, "learning_rate": 0.0001, "loss": 0.2128, "step": 633 }, { "epoch": 1.0335833061623736, "grad_norm": 0.1423221379518509, "learning_rate": 0.0001, "loss": 0.1998, "step": 634 }, { "epoch": 1.0352135637430715, "grad_norm": 0.12416679412126541, "learning_rate": 0.0001, "loss": 0.1915, "step": 635 }, { "epoch": 1.0368438213237692, "grad_norm": 0.11678506433963776, "learning_rate": 0.0001, "loss": 0.189, "step": 636 }, { "epoch": 1.0384740789044669, "grad_norm": 0.12458233535289764, "learning_rate": 0.0001, "loss": 0.1911, "step": 637 }, { "epoch": 1.0401043364851648, "grad_norm": 0.12417783588171005, "learning_rate": 0.0001, "loss": 0.1773, "step": 638 }, { "epoch": 1.0417345940658624, "grad_norm": 0.11989506334066391, "learning_rate": 0.0001, "loss": 0.1932, "step": 639 }, { "epoch": 1.0433648516465601, "grad_norm": 0.1083277091383934, "learning_rate": 0.0001, "loss": 0.1952, "step": 640 }, { "epoch": 1.0449951092272578, "grad_norm": 0.17112882435321808, "learning_rate": 0.0001, "loss": 0.2021, "step": 641 }, { "epoch": 1.0466253668079557, "grad_norm": 0.14799998700618744, "learning_rate": 0.0001, "loss": 0.2038, "step": 642 }, { "epoch": 1.0482556243886534, "grad_norm": 0.14156191051006317, "learning_rate": 0.0001, "loss": 0.1908, "step": 643 }, { "epoch": 1.049885881969351, "grad_norm": 0.1254645138978958, "learning_rate": 0.0001, "loss": 0.2045, "step": 644 }, { "epoch": 1.051516139550049, "grad_norm": 0.11081767827272415, "learning_rate": 0.0001, "loss": 0.195, "step": 645 }, { "epoch": 1.0531463971307466, "grad_norm": 0.11810845881700516, "learning_rate": 0.0001, "loss": 0.1939, "step": 646 }, { "epoch": 1.0547766547114443, "grad_norm": 0.12629976868629456, "learning_rate": 0.0001, "loss": 0.2117, "step": 647 }, { "epoch": 1.0564069122921422, "grad_norm": 0.11882951855659485, "learning_rate": 0.0001, "loss": 0.192, "step": 648 }, { "epoch": 1.05803716987284, "grad_norm": 0.11729996651411057, "learning_rate": 0.0001, "loss": 0.1916, "step": 649 }, { "epoch": 1.0596674274535376, "grad_norm": 0.11437559127807617, "learning_rate": 0.0001, "loss": 0.1885, "step": 650 }, { "epoch": 1.0612976850342355, "grad_norm": 0.13408319652080536, "learning_rate": 0.0001, "loss": 0.2138, "step": 651 }, { "epoch": 1.0629279426149332, "grad_norm": 0.14032766222953796, "learning_rate": 0.0001, "loss": 0.205, "step": 652 }, { "epoch": 1.0645582001956309, "grad_norm": 0.1438027173280716, "learning_rate": 0.0001, "loss": 0.1934, "step": 653 }, { "epoch": 1.0661884577763288, "grad_norm": 0.11743790656328201, "learning_rate": 0.0001, "loss": 0.1944, "step": 654 }, { "epoch": 1.0678187153570264, "grad_norm": 0.1314900517463684, "learning_rate": 0.0001, "loss": 0.2108, "step": 655 }, { "epoch": 1.0694489729377241, "grad_norm": 0.11884745210409164, "learning_rate": 0.0001, "loss": 0.1858, "step": 656 }, { "epoch": 1.0710792305184218, "grad_norm": 0.10163510590791702, "learning_rate": 0.0001, "loss": 0.1984, "step": 657 }, { "epoch": 1.0727094880991197, "grad_norm": 0.11893923580646515, "learning_rate": 0.0001, "loss": 0.1999, "step": 658 }, { "epoch": 1.0743397456798174, "grad_norm": 0.12390507012605667, "learning_rate": 0.0001, "loss": 0.2142, "step": 659 }, { "epoch": 1.075970003260515, "grad_norm": 0.11425681412220001, "learning_rate": 0.0001, "loss": 0.1883, "step": 660 }, { "epoch": 1.077600260841213, "grad_norm": 0.11586383730173111, "learning_rate": 0.0001, "loss": 0.1894, "step": 661 }, { "epoch": 1.0792305184219106, "grad_norm": 0.1234535500407219, "learning_rate": 0.0001, "loss": 0.2, "step": 662 }, { "epoch": 1.0808607760026083, "grad_norm": 0.107200987637043, "learning_rate": 0.0001, "loss": 0.1875, "step": 663 }, { "epoch": 1.0824910335833062, "grad_norm": 0.12569493055343628, "learning_rate": 0.0001, "loss": 0.1936, "step": 664 }, { "epoch": 1.084121291164004, "grad_norm": 0.12856988608837128, "learning_rate": 0.0001, "loss": 0.1905, "step": 665 }, { "epoch": 1.0857515487447016, "grad_norm": 0.16755834221839905, "learning_rate": 0.0001, "loss": 0.2023, "step": 666 }, { "epoch": 1.0873818063253995, "grad_norm": 0.13660454750061035, "learning_rate": 0.0001, "loss": 0.2032, "step": 667 }, { "epoch": 1.0890120639060972, "grad_norm": 0.10664553940296173, "learning_rate": 0.0001, "loss": 0.1725, "step": 668 }, { "epoch": 1.0906423214867949, "grad_norm": 0.1499480903148651, "learning_rate": 0.0001, "loss": 0.2057, "step": 669 }, { "epoch": 1.0922725790674928, "grad_norm": 0.13117225468158722, "learning_rate": 0.0001, "loss": 0.2064, "step": 670 }, { "epoch": 1.0939028366481904, "grad_norm": 0.12112154811620712, "learning_rate": 0.0001, "loss": 0.1952, "step": 671 }, { "epoch": 1.0955330942288881, "grad_norm": 0.12471124529838562, "learning_rate": 0.0001, "loss": 0.2005, "step": 672 }, { "epoch": 1.097163351809586, "grad_norm": 0.14131243526935577, "learning_rate": 0.0001, "loss": 0.2099, "step": 673 }, { "epoch": 1.0987936093902837, "grad_norm": 0.1315171718597412, "learning_rate": 0.0001, "loss": 0.1955, "step": 674 }, { "epoch": 1.1004238669709814, "grad_norm": 0.12525959312915802, "learning_rate": 0.0001, "loss": 0.1937, "step": 675 }, { "epoch": 1.102054124551679, "grad_norm": 0.12638898193836212, "learning_rate": 0.0001, "loss": 0.2045, "step": 676 }, { "epoch": 1.103684382132377, "grad_norm": 0.12964749336242676, "learning_rate": 0.0001, "loss": 0.1799, "step": 677 }, { "epoch": 1.1053146397130746, "grad_norm": 0.14780597388744354, "learning_rate": 0.0001, "loss": 0.1987, "step": 678 }, { "epoch": 1.1069448972937723, "grad_norm": 0.14882171154022217, "learning_rate": 0.0001, "loss": 0.2038, "step": 679 }, { "epoch": 1.1085751548744702, "grad_norm": 0.131666362285614, "learning_rate": 0.0001, "loss": 0.1915, "step": 680 }, { "epoch": 1.110205412455168, "grad_norm": 0.12266353517770767, "learning_rate": 0.0001, "loss": 0.19, "step": 681 }, { "epoch": 1.1118356700358656, "grad_norm": 0.13120290637016296, "learning_rate": 0.0001, "loss": 0.2028, "step": 682 }, { "epoch": 1.1134659276165635, "grad_norm": 0.12845005095005035, "learning_rate": 0.0001, "loss": 0.202, "step": 683 }, { "epoch": 1.1150961851972612, "grad_norm": 0.12827853858470917, "learning_rate": 0.0001, "loss": 0.2101, "step": 684 }, { "epoch": 1.1167264427779588, "grad_norm": 0.10449423640966415, "learning_rate": 0.0001, "loss": 0.1814, "step": 685 }, { "epoch": 1.1183567003586568, "grad_norm": 0.12488257884979248, "learning_rate": 0.0001, "loss": 0.2008, "step": 686 }, { "epoch": 1.1199869579393544, "grad_norm": 0.1401628851890564, "learning_rate": 0.0001, "loss": 0.2046, "step": 687 }, { "epoch": 1.1216172155200521, "grad_norm": 0.11075824499130249, "learning_rate": 0.0001, "loss": 0.1908, "step": 688 }, { "epoch": 1.12324747310075, "grad_norm": 0.14995628595352173, "learning_rate": 0.0001, "loss": 0.1993, "step": 689 }, { "epoch": 1.1248777306814477, "grad_norm": 0.13028737902641296, "learning_rate": 0.0001, "loss": 0.2035, "step": 690 }, { "epoch": 1.1265079882621454, "grad_norm": 0.12029106914997101, "learning_rate": 0.0001, "loss": 0.166, "step": 691 }, { "epoch": 1.1281382458428433, "grad_norm": 0.11109007149934769, "learning_rate": 0.0001, "loss": 0.1921, "step": 692 }, { "epoch": 1.129768503423541, "grad_norm": 0.112598717212677, "learning_rate": 0.0001, "loss": 0.1849, "step": 693 }, { "epoch": 1.1313987610042386, "grad_norm": 0.11764469742774963, "learning_rate": 0.0001, "loss": 0.1979, "step": 694 }, { "epoch": 1.1330290185849363, "grad_norm": 0.13900335133075714, "learning_rate": 0.0001, "loss": 0.1949, "step": 695 }, { "epoch": 1.1346592761656342, "grad_norm": 0.11250978708267212, "learning_rate": 0.0001, "loss": 0.193, "step": 696 }, { "epoch": 1.136289533746332, "grad_norm": 0.13568443059921265, "learning_rate": 0.0001, "loss": 0.1858, "step": 697 }, { "epoch": 1.1379197913270296, "grad_norm": 0.15527978539466858, "learning_rate": 0.0001, "loss": 0.1957, "step": 698 }, { "epoch": 1.1395500489077275, "grad_norm": 0.12831154465675354, "learning_rate": 0.0001, "loss": 0.19, "step": 699 }, { "epoch": 1.1411803064884252, "grad_norm": 0.1345253884792328, "learning_rate": 0.0001, "loss": 0.1916, "step": 700 }, { "epoch": 1.1428105640691228, "grad_norm": 0.13202865421772003, "learning_rate": 0.0001, "loss": 0.187, "step": 701 }, { "epoch": 1.1444408216498207, "grad_norm": 0.12138167768716812, "learning_rate": 0.0001, "loss": 0.1923, "step": 702 }, { "epoch": 1.1460710792305184, "grad_norm": 0.13008633255958557, "learning_rate": 0.0001, "loss": 0.2001, "step": 703 }, { "epoch": 1.147701336811216, "grad_norm": 0.13472393155097961, "learning_rate": 0.0001, "loss": 0.1965, "step": 704 }, { "epoch": 1.149331594391914, "grad_norm": 0.13745389878749847, "learning_rate": 0.0001, "loss": 0.1929, "step": 705 }, { "epoch": 1.1509618519726117, "grad_norm": 0.13292807340621948, "learning_rate": 0.0001, "loss": 0.2109, "step": 706 }, { "epoch": 1.1525921095533094, "grad_norm": 0.1446523666381836, "learning_rate": 0.0001, "loss": 0.1948, "step": 707 }, { "epoch": 1.154222367134007, "grad_norm": 0.1305355578660965, "learning_rate": 0.0001, "loss": 0.187, "step": 708 }, { "epoch": 1.155852624714705, "grad_norm": 0.11248882114887238, "learning_rate": 0.0001, "loss": 0.1878, "step": 709 }, { "epoch": 1.1574828822954026, "grad_norm": 0.11229637265205383, "learning_rate": 0.0001, "loss": 0.1864, "step": 710 }, { "epoch": 1.1591131398761005, "grad_norm": 0.13594146072864532, "learning_rate": 0.0001, "loss": 0.2021, "step": 711 }, { "epoch": 1.1607433974567982, "grad_norm": 0.10707145929336548, "learning_rate": 0.0001, "loss": 0.1841, "step": 712 }, { "epoch": 1.162373655037496, "grad_norm": 0.13550283014774323, "learning_rate": 0.0001, "loss": 0.1909, "step": 713 }, { "epoch": 1.1640039126181936, "grad_norm": 0.16137608885765076, "learning_rate": 0.0001, "loss": 0.1904, "step": 714 }, { "epoch": 1.1656341701988915, "grad_norm": 0.1217959076166153, "learning_rate": 0.0001, "loss": 0.1652, "step": 715 }, { "epoch": 1.1672644277795892, "grad_norm": 0.12333795428276062, "learning_rate": 0.0001, "loss": 0.1774, "step": 716 }, { "epoch": 1.1688946853602868, "grad_norm": 0.13329114019870758, "learning_rate": 0.0001, "loss": 0.1945, "step": 717 }, { "epoch": 1.1705249429409847, "grad_norm": 0.13846194744110107, "learning_rate": 0.0001, "loss": 0.2064, "step": 718 }, { "epoch": 1.1721552005216824, "grad_norm": 0.12300273030996323, "learning_rate": 0.0001, "loss": 0.1854, "step": 719 }, { "epoch": 1.17378545810238, "grad_norm": 0.13478830456733704, "learning_rate": 0.0001, "loss": 0.192, "step": 720 }, { "epoch": 1.175415715683078, "grad_norm": 0.12126126885414124, "learning_rate": 0.0001, "loss": 0.1976, "step": 721 }, { "epoch": 1.1770459732637757, "grad_norm": 0.12510600686073303, "learning_rate": 0.0001, "loss": 0.2058, "step": 722 }, { "epoch": 1.1786762308444734, "grad_norm": 0.1227242723107338, "learning_rate": 0.0001, "loss": 0.167, "step": 723 }, { "epoch": 1.1803064884251713, "grad_norm": 0.15479400753974915, "learning_rate": 0.0001, "loss": 0.2009, "step": 724 }, { "epoch": 1.181936746005869, "grad_norm": 0.1210135892033577, "learning_rate": 0.0001, "loss": 0.2009, "step": 725 }, { "epoch": 1.1835670035865666, "grad_norm": 0.15612360835075378, "learning_rate": 0.0001, "loss": 0.191, "step": 726 }, { "epoch": 1.1851972611672643, "grad_norm": 0.20443665981292725, "learning_rate": 0.0001, "loss": 0.1861, "step": 727 }, { "epoch": 1.1868275187479622, "grad_norm": 0.13557809591293335, "learning_rate": 0.0001, "loss": 0.2098, "step": 728 }, { "epoch": 1.18845777632866, "grad_norm": 0.14073556661605835, "learning_rate": 0.0001, "loss": 0.2046, "step": 729 }, { "epoch": 1.1900880339093578, "grad_norm": 0.11344654113054276, "learning_rate": 0.0001, "loss": 0.1878, "step": 730 }, { "epoch": 1.1917182914900555, "grad_norm": 0.12960317730903625, "learning_rate": 0.0001, "loss": 0.2068, "step": 731 }, { "epoch": 1.1933485490707532, "grad_norm": 0.11689286679029465, "learning_rate": 0.0001, "loss": 0.1806, "step": 732 }, { "epoch": 1.1949788066514508, "grad_norm": 0.12490563094615936, "learning_rate": 0.0001, "loss": 0.2013, "step": 733 }, { "epoch": 1.1966090642321487, "grad_norm": 0.13222745060920715, "learning_rate": 0.0001, "loss": 0.1923, "step": 734 }, { "epoch": 1.1982393218128464, "grad_norm": 0.14103910326957703, "learning_rate": 0.0001, "loss": 0.1973, "step": 735 }, { "epoch": 1.199869579393544, "grad_norm": 0.1409367471933365, "learning_rate": 0.0001, "loss": 0.1971, "step": 736 }, { "epoch": 1.201499836974242, "grad_norm": 0.13670028746128082, "learning_rate": 0.0001, "loss": 0.19, "step": 737 }, { "epoch": 1.2031300945549397, "grad_norm": 0.11378346383571625, "learning_rate": 0.0001, "loss": 0.1846, "step": 738 }, { "epoch": 1.2047603521356374, "grad_norm": 0.12771202623844147, "learning_rate": 0.0001, "loss": 0.1952, "step": 739 }, { "epoch": 1.2063906097163353, "grad_norm": 0.1480347216129303, "learning_rate": 0.0001, "loss": 0.2065, "step": 740 }, { "epoch": 1.208020867297033, "grad_norm": 0.14294308423995972, "learning_rate": 0.0001, "loss": 0.199, "step": 741 }, { "epoch": 1.2096511248777306, "grad_norm": 0.11501043289899826, "learning_rate": 0.0001, "loss": 0.1888, "step": 742 }, { "epoch": 1.2112813824584285, "grad_norm": 0.12315449863672256, "learning_rate": 0.0001, "loss": 0.1999, "step": 743 }, { "epoch": 1.2129116400391262, "grad_norm": 0.11507046967744827, "learning_rate": 0.0001, "loss": 0.2075, "step": 744 }, { "epoch": 1.214541897619824, "grad_norm": 0.11354836076498032, "learning_rate": 0.0001, "loss": 0.1861, "step": 745 }, { "epoch": 1.2161721552005216, "grad_norm": 0.13437440991401672, "learning_rate": 0.0001, "loss": 0.1985, "step": 746 }, { "epoch": 1.2178024127812195, "grad_norm": 0.11698954552412033, "learning_rate": 0.0001, "loss": 0.1959, "step": 747 }, { "epoch": 1.2194326703619172, "grad_norm": 0.12441843748092651, "learning_rate": 0.0001, "loss": 0.1955, "step": 748 }, { "epoch": 1.2210629279426148, "grad_norm": 1.1128816604614258, "learning_rate": 0.0001, "loss": 0.2114, "step": 749 }, { "epoch": 1.2226931855233127, "grad_norm": 0.1364130824804306, "learning_rate": 0.0001, "loss": 0.1847, "step": 750 }, { "epoch": 1.2243234431040104, "grad_norm": 0.14699682593345642, "learning_rate": 0.0001, "loss": 0.1955, "step": 751 }, { "epoch": 1.225953700684708, "grad_norm": 0.16500285267829895, "learning_rate": 0.0001, "loss": 0.2026, "step": 752 }, { "epoch": 1.227583958265406, "grad_norm": 0.12731170654296875, "learning_rate": 0.0001, "loss": 0.1935, "step": 753 }, { "epoch": 1.2292142158461037, "grad_norm": 0.12876448035240173, "learning_rate": 0.0001, "loss": 0.2066, "step": 754 }, { "epoch": 1.2308444734268014, "grad_norm": 0.14302317798137665, "learning_rate": 0.0001, "loss": 0.1946, "step": 755 }, { "epoch": 1.2324747310074993, "grad_norm": 0.14912651479244232, "learning_rate": 0.0001, "loss": 0.1836, "step": 756 }, { "epoch": 1.234104988588197, "grad_norm": 0.1160784587264061, "learning_rate": 0.0001, "loss": 0.1853, "step": 757 }, { "epoch": 1.2357352461688946, "grad_norm": 0.11871147900819778, "learning_rate": 0.0001, "loss": 0.1845, "step": 758 }, { "epoch": 1.2373655037495925, "grad_norm": 0.14770323038101196, "learning_rate": 0.0001, "loss": 0.2033, "step": 759 }, { "epoch": 1.2389957613302902, "grad_norm": 0.2331717312335968, "learning_rate": 0.0001, "loss": 0.188, "step": 760 }, { "epoch": 1.240626018910988, "grad_norm": 0.12382549792528152, "learning_rate": 0.0001, "loss": 0.1991, "step": 761 }, { "epoch": 1.2422562764916858, "grad_norm": 0.16363592445850372, "learning_rate": 0.0001, "loss": 0.1922, "step": 762 }, { "epoch": 1.2438865340723835, "grad_norm": 0.10923486948013306, "learning_rate": 0.0001, "loss": 0.1951, "step": 763 }, { "epoch": 1.2455167916530812, "grad_norm": 0.15710005164146423, "learning_rate": 0.0001, "loss": 0.2105, "step": 764 }, { "epoch": 1.2471470492337788, "grad_norm": 0.1298326998949051, "learning_rate": 0.0001, "loss": 0.1957, "step": 765 }, { "epoch": 1.2487773068144767, "grad_norm": 0.12656745314598083, "learning_rate": 0.0001, "loss": 0.1908, "step": 766 }, { "epoch": 1.2504075643951744, "grad_norm": 0.23991864919662476, "learning_rate": 0.0001, "loss": 0.2039, "step": 767 }, { "epoch": 1.2520378219758723, "grad_norm": 0.12446881830692291, "learning_rate": 0.0001, "loss": 0.2051, "step": 768 }, { "epoch": 1.25366807955657, "grad_norm": 0.13822287321090698, "learning_rate": 0.0001, "loss": 0.2118, "step": 769 }, { "epoch": 1.2552983371372677, "grad_norm": 0.1322907656431198, "learning_rate": 0.0001, "loss": 0.2043, "step": 770 }, { "epoch": 1.2569285947179654, "grad_norm": 0.13346537947654724, "learning_rate": 0.0001, "loss": 0.1907, "step": 771 }, { "epoch": 1.2585588522986633, "grad_norm": 0.133008673787117, "learning_rate": 0.0001, "loss": 0.1957, "step": 772 }, { "epoch": 1.260189109879361, "grad_norm": 0.13915961980819702, "learning_rate": 0.0001, "loss": 0.1996, "step": 773 }, { "epoch": 1.2618193674600586, "grad_norm": 0.12022515386343002, "learning_rate": 0.0001, "loss": 0.2048, "step": 774 }, { "epoch": 1.2634496250407565, "grad_norm": 0.12311067432165146, "learning_rate": 0.0001, "loss": 0.2076, "step": 775 }, { "epoch": 1.2650798826214542, "grad_norm": 0.11143454164266586, "learning_rate": 0.0001, "loss": 0.192, "step": 776 }, { "epoch": 1.266710140202152, "grad_norm": 0.11484785377979279, "learning_rate": 0.0001, "loss": 0.1797, "step": 777 }, { "epoch": 1.2683403977828496, "grad_norm": 0.12231987714767456, "learning_rate": 0.0001, "loss": 0.2006, "step": 778 }, { "epoch": 1.2699706553635475, "grad_norm": 0.138115793466568, "learning_rate": 0.0001, "loss": 0.1953, "step": 779 }, { "epoch": 1.2716009129442452, "grad_norm": 0.1262485831975937, "learning_rate": 0.0001, "loss": 0.2104, "step": 780 }, { "epoch": 1.273231170524943, "grad_norm": 0.10364367067813873, "learning_rate": 0.0001, "loss": 0.1913, "step": 781 }, { "epoch": 1.2748614281056407, "grad_norm": 0.11666619032621384, "learning_rate": 0.0001, "loss": 0.2092, "step": 782 }, { "epoch": 1.2764916856863384, "grad_norm": 0.10511959344148636, "learning_rate": 0.0001, "loss": 0.1821, "step": 783 }, { "epoch": 1.278121943267036, "grad_norm": 0.12640027701854706, "learning_rate": 0.0001, "loss": 0.1969, "step": 784 }, { "epoch": 1.279752200847734, "grad_norm": 0.13358190655708313, "learning_rate": 0.0001, "loss": 0.1874, "step": 785 }, { "epoch": 1.2813824584284317, "grad_norm": 0.13797491788864136, "learning_rate": 0.0001, "loss": 0.2039, "step": 786 }, { "epoch": 1.2830127160091294, "grad_norm": 0.16730040311813354, "learning_rate": 0.0001, "loss": 0.1812, "step": 787 }, { "epoch": 1.2846429735898273, "grad_norm": 0.13052508234977722, "learning_rate": 0.0001, "loss": 0.2029, "step": 788 }, { "epoch": 1.286273231170525, "grad_norm": 0.11100683361291885, "learning_rate": 0.0001, "loss": 0.1756, "step": 789 }, { "epoch": 1.2879034887512226, "grad_norm": 0.12207093834877014, "learning_rate": 0.0001, "loss": 0.1951, "step": 790 }, { "epoch": 1.2895337463319203, "grad_norm": 0.10500862449407578, "learning_rate": 0.0001, "loss": 0.187, "step": 791 }, { "epoch": 1.2911640039126182, "grad_norm": 0.12395953387022018, "learning_rate": 0.0001, "loss": 0.1789, "step": 792 }, { "epoch": 1.2927942614933159, "grad_norm": 0.12728352844715118, "learning_rate": 0.0001, "loss": 0.2055, "step": 793 }, { "epoch": 1.2944245190740138, "grad_norm": 0.12210983037948608, "learning_rate": 0.0001, "loss": 0.1848, "step": 794 }, { "epoch": 1.2960547766547115, "grad_norm": 0.1309320330619812, "learning_rate": 0.0001, "loss": 0.2019, "step": 795 }, { "epoch": 1.2976850342354092, "grad_norm": 0.14810331165790558, "learning_rate": 0.0001, "loss": 0.1983, "step": 796 }, { "epoch": 1.2993152918161068, "grad_norm": 0.11592376232147217, "learning_rate": 0.0001, "loss": 0.1839, "step": 797 }, { "epoch": 1.3009455493968047, "grad_norm": 0.12396306544542313, "learning_rate": 0.0001, "loss": 0.1964, "step": 798 }, { "epoch": 1.3025758069775024, "grad_norm": 0.13260740041732788, "learning_rate": 0.0001, "loss": 0.2012, "step": 799 }, { "epoch": 1.3042060645582003, "grad_norm": 0.13183878362178802, "learning_rate": 0.0001, "loss": 0.2066, "step": 800 }, { "epoch": 1.305836322138898, "grad_norm": 0.12593010067939758, "learning_rate": 0.0001, "loss": 0.1916, "step": 801 }, { "epoch": 1.3074665797195957, "grad_norm": 0.11641526967287064, "learning_rate": 0.0001, "loss": 0.1847, "step": 802 }, { "epoch": 1.3090968373002934, "grad_norm": 0.1082523912191391, "learning_rate": 0.0001, "loss": 0.1816, "step": 803 }, { "epoch": 1.3107270948809913, "grad_norm": 0.11721451580524445, "learning_rate": 0.0001, "loss": 0.1913, "step": 804 }, { "epoch": 1.312357352461689, "grad_norm": 0.14624540507793427, "learning_rate": 0.0001, "loss": 0.2057, "step": 805 }, { "epoch": 1.3139876100423866, "grad_norm": 0.11817949265241623, "learning_rate": 0.0001, "loss": 0.1876, "step": 806 }, { "epoch": 1.3156178676230845, "grad_norm": 0.12821592390537262, "learning_rate": 0.0001, "loss": 0.2099, "step": 807 }, { "epoch": 1.3172481252037822, "grad_norm": 0.13968628644943237, "learning_rate": 0.0001, "loss": 0.2101, "step": 808 }, { "epoch": 1.3188783827844799, "grad_norm": 0.1454535573720932, "learning_rate": 0.0001, "loss": 0.2062, "step": 809 }, { "epoch": 1.3205086403651776, "grad_norm": 0.1232244223356247, "learning_rate": 0.0001, "loss": 0.1775, "step": 810 }, { "epoch": 1.3221388979458755, "grad_norm": 0.12356823682785034, "learning_rate": 0.0001, "loss": 0.1945, "step": 811 }, { "epoch": 1.3237691555265731, "grad_norm": 0.1321604698896408, "learning_rate": 0.0001, "loss": 0.1956, "step": 812 }, { "epoch": 1.325399413107271, "grad_norm": 0.13686218857765198, "learning_rate": 0.0001, "loss": 0.1997, "step": 813 }, { "epoch": 1.3270296706879687, "grad_norm": 0.1265021115541458, "learning_rate": 0.0001, "loss": 0.2003, "step": 814 }, { "epoch": 1.3286599282686664, "grad_norm": 0.13822129368782043, "learning_rate": 0.0001, "loss": 0.2037, "step": 815 }, { "epoch": 1.330290185849364, "grad_norm": 0.1474665403366089, "learning_rate": 0.0001, "loss": 0.2014, "step": 816 }, { "epoch": 1.331920443430062, "grad_norm": 0.1312236338853836, "learning_rate": 0.0001, "loss": 0.2047, "step": 817 }, { "epoch": 1.3335507010107597, "grad_norm": 0.10650108009576797, "learning_rate": 0.0001, "loss": 0.1774, "step": 818 }, { "epoch": 1.3351809585914576, "grad_norm": 0.12446287274360657, "learning_rate": 0.0001, "loss": 0.1993, "step": 819 }, { "epoch": 1.3368112161721553, "grad_norm": 0.1500684767961502, "learning_rate": 0.0001, "loss": 0.2065, "step": 820 }, { "epoch": 1.338441473752853, "grad_norm": 0.11526080965995789, "learning_rate": 0.0001, "loss": 0.1929, "step": 821 }, { "epoch": 1.3400717313335506, "grad_norm": 0.14538566768169403, "learning_rate": 0.0001, "loss": 0.1951, "step": 822 }, { "epoch": 1.3417019889142485, "grad_norm": 0.11737110465765, "learning_rate": 0.0001, "loss": 0.2022, "step": 823 }, { "epoch": 1.3433322464949462, "grad_norm": 0.1339058130979538, "learning_rate": 0.0001, "loss": 0.2056, "step": 824 }, { "epoch": 1.3449625040756439, "grad_norm": 0.1357828974723816, "learning_rate": 0.0001, "loss": 0.1997, "step": 825 }, { "epoch": 1.3465927616563418, "grad_norm": 0.13894008100032806, "learning_rate": 0.0001, "loss": 0.1976, "step": 826 }, { "epoch": 1.3482230192370395, "grad_norm": 0.11640580743551254, "learning_rate": 0.0001, "loss": 0.1931, "step": 827 }, { "epoch": 1.3498532768177371, "grad_norm": 0.11283834278583527, "learning_rate": 0.0001, "loss": 0.1968, "step": 828 }, { "epoch": 1.3514835343984348, "grad_norm": 0.13692787289619446, "learning_rate": 0.0001, "loss": 0.2087, "step": 829 }, { "epoch": 1.3531137919791327, "grad_norm": 0.0954209491610527, "learning_rate": 0.0001, "loss": 0.1828, "step": 830 }, { "epoch": 1.3547440495598304, "grad_norm": 0.1314362734556198, "learning_rate": 0.0001, "loss": 0.2074, "step": 831 }, { "epoch": 1.3563743071405283, "grad_norm": 0.12032657861709595, "learning_rate": 0.0001, "loss": 0.1797, "step": 832 }, { "epoch": 1.358004564721226, "grad_norm": 0.12045875936746597, "learning_rate": 0.0001, "loss": 0.1948, "step": 833 }, { "epoch": 1.3596348223019237, "grad_norm": 0.12139002233743668, "learning_rate": 0.0001, "loss": 0.2074, "step": 834 }, { "epoch": 1.3612650798826214, "grad_norm": 0.125333771109581, "learning_rate": 0.0001, "loss": 0.189, "step": 835 }, { "epoch": 1.3628953374633193, "grad_norm": 0.12336897104978561, "learning_rate": 0.0001, "loss": 0.2015, "step": 836 }, { "epoch": 1.364525595044017, "grad_norm": 0.1447877436876297, "learning_rate": 0.0001, "loss": 0.1979, "step": 837 }, { "epoch": 1.3661558526247148, "grad_norm": 0.12751318514347076, "learning_rate": 0.0001, "loss": 0.1924, "step": 838 }, { "epoch": 1.3677861102054125, "grad_norm": 0.14476466178894043, "learning_rate": 0.0001, "loss": 0.2007, "step": 839 }, { "epoch": 1.3694163677861102, "grad_norm": 0.12900257110595703, "learning_rate": 0.0001, "loss": 0.1685, "step": 840 }, { "epoch": 1.3710466253668079, "grad_norm": 0.14788837730884552, "learning_rate": 0.0001, "loss": 0.188, "step": 841 }, { "epoch": 1.3726768829475058, "grad_norm": 0.1143098995089531, "learning_rate": 0.0001, "loss": 0.1925, "step": 842 }, { "epoch": 1.3743071405282035, "grad_norm": 0.16243141889572144, "learning_rate": 0.0001, "loss": 0.1904, "step": 843 }, { "epoch": 1.3759373981089011, "grad_norm": 0.1285100132226944, "learning_rate": 0.0001, "loss": 0.2101, "step": 844 }, { "epoch": 1.377567655689599, "grad_norm": 0.12278752028942108, "learning_rate": 0.0001, "loss": 0.1676, "step": 845 }, { "epoch": 1.3791979132702967, "grad_norm": 0.11762263625860214, "learning_rate": 0.0001, "loss": 0.1857, "step": 846 }, { "epoch": 1.3808281708509944, "grad_norm": 0.14315438270568848, "learning_rate": 0.0001, "loss": 0.1981, "step": 847 }, { "epoch": 1.382458428431692, "grad_norm": 0.13514509797096252, "learning_rate": 0.0001, "loss": 0.1874, "step": 848 }, { "epoch": 1.38408868601239, "grad_norm": 0.14663557708263397, "learning_rate": 0.0001, "loss": 0.1922, "step": 849 }, { "epoch": 1.3857189435930877, "grad_norm": 0.11271989345550537, "learning_rate": 0.0001, "loss": 0.1741, "step": 850 }, { "epoch": 1.3873492011737856, "grad_norm": 0.1267681121826172, "learning_rate": 0.0001, "loss": 0.2004, "step": 851 }, { "epoch": 1.3889794587544833, "grad_norm": 0.13862234354019165, "learning_rate": 0.0001, "loss": 0.1833, "step": 852 }, { "epoch": 1.390609716335181, "grad_norm": 0.14076252281665802, "learning_rate": 0.0001, "loss": 0.1946, "step": 853 }, { "epoch": 1.3922399739158786, "grad_norm": 0.11336036026477814, "learning_rate": 0.0001, "loss": 0.1826, "step": 854 }, { "epoch": 1.3938702314965765, "grad_norm": 0.12610171735286713, "learning_rate": 0.0001, "loss": 0.2061, "step": 855 }, { "epoch": 1.3955004890772742, "grad_norm": 0.12771809101104736, "learning_rate": 0.0001, "loss": 0.1826, "step": 856 }, { "epoch": 1.3971307466579719, "grad_norm": 0.13943758606910706, "learning_rate": 0.0001, "loss": 0.2154, "step": 857 }, { "epoch": 1.3987610042386698, "grad_norm": 0.1181144267320633, "learning_rate": 0.0001, "loss": 0.2047, "step": 858 }, { "epoch": 1.4003912618193675, "grad_norm": 0.12739057838916779, "learning_rate": 0.0001, "loss": 0.1924, "step": 859 }, { "epoch": 1.4020215194000651, "grad_norm": 0.14955663681030273, "learning_rate": 0.0001, "loss": 0.204, "step": 860 }, { "epoch": 1.4036517769807628, "grad_norm": 0.12984280288219452, "learning_rate": 0.0001, "loss": 0.1833, "step": 861 }, { "epoch": 1.4052820345614607, "grad_norm": 0.1367812603712082, "learning_rate": 0.0001, "loss": 0.2041, "step": 862 }, { "epoch": 1.4069122921421584, "grad_norm": 0.11790581792593002, "learning_rate": 0.0001, "loss": 0.1927, "step": 863 }, { "epoch": 1.4085425497228563, "grad_norm": 0.13845130801200867, "learning_rate": 0.0001, "loss": 0.198, "step": 864 }, { "epoch": 1.410172807303554, "grad_norm": 0.13803738355636597, "learning_rate": 0.0001, "loss": 0.1942, "step": 865 }, { "epoch": 1.4118030648842517, "grad_norm": 0.11733663082122803, "learning_rate": 0.0001, "loss": 0.1811, "step": 866 }, { "epoch": 1.4134333224649493, "grad_norm": 0.14048326015472412, "learning_rate": 0.0001, "loss": 0.2063, "step": 867 }, { "epoch": 1.4150635800456473, "grad_norm": 0.11841481178998947, "learning_rate": 0.0001, "loss": 0.1783, "step": 868 }, { "epoch": 1.416693837626345, "grad_norm": 0.13812898099422455, "learning_rate": 0.0001, "loss": 0.1945, "step": 869 }, { "epoch": 1.4183240952070428, "grad_norm": 0.12648576498031616, "learning_rate": 0.0001, "loss": 0.2121, "step": 870 }, { "epoch": 1.4199543527877405, "grad_norm": 0.12122874706983566, "learning_rate": 0.0001, "loss": 0.198, "step": 871 }, { "epoch": 1.4215846103684382, "grad_norm": 0.12100232392549515, "learning_rate": 0.0001, "loss": 0.1896, "step": 872 }, { "epoch": 1.4232148679491359, "grad_norm": 0.11330271512269974, "learning_rate": 0.0001, "loss": 0.1883, "step": 873 }, { "epoch": 1.4248451255298338, "grad_norm": 0.12889273464679718, "learning_rate": 0.0001, "loss": 0.1913, "step": 874 }, { "epoch": 1.4264753831105315, "grad_norm": 0.12529848515987396, "learning_rate": 0.0001, "loss": 0.1947, "step": 875 }, { "epoch": 1.4281056406912291, "grad_norm": 0.11464565992355347, "learning_rate": 0.0001, "loss": 0.1928, "step": 876 }, { "epoch": 1.429735898271927, "grad_norm": 0.09968235343694687, "learning_rate": 0.0001, "loss": 0.18, "step": 877 }, { "epoch": 1.4313661558526247, "grad_norm": 0.12522117793560028, "learning_rate": 0.0001, "loss": 0.2025, "step": 878 }, { "epoch": 1.4329964134333224, "grad_norm": 0.13101626932621002, "learning_rate": 0.0001, "loss": 0.1883, "step": 879 }, { "epoch": 1.43462667101402, "grad_norm": 0.18192481994628906, "learning_rate": 0.0001, "loss": 0.1906, "step": 880 }, { "epoch": 1.436256928594718, "grad_norm": 0.1182754710316658, "learning_rate": 0.0001, "loss": 0.1737, "step": 881 }, { "epoch": 1.4378871861754157, "grad_norm": 0.1254081428050995, "learning_rate": 0.0001, "loss": 0.1919, "step": 882 }, { "epoch": 1.4395174437561136, "grad_norm": 0.12255612015724182, "learning_rate": 0.0001, "loss": 0.1998, "step": 883 }, { "epoch": 1.4411477013368112, "grad_norm": 0.12978112697601318, "learning_rate": 0.0001, "loss": 0.189, "step": 884 }, { "epoch": 1.442777958917509, "grad_norm": 0.1313999891281128, "learning_rate": 0.0001, "loss": 0.1847, "step": 885 }, { "epoch": 1.4444082164982066, "grad_norm": 0.1327790915966034, "learning_rate": 0.0001, "loss": 0.2005, "step": 886 }, { "epoch": 1.4460384740789045, "grad_norm": 0.13885673880577087, "learning_rate": 0.0001, "loss": 0.1899, "step": 887 }, { "epoch": 1.4476687316596022, "grad_norm": 0.13544884324073792, "learning_rate": 0.0001, "loss": 0.1944, "step": 888 }, { "epoch": 1.4492989892403, "grad_norm": 0.12309068441390991, "learning_rate": 0.0001, "loss": 0.2011, "step": 889 }, { "epoch": 1.4509292468209978, "grad_norm": 0.11212994158267975, "learning_rate": 0.0001, "loss": 0.1709, "step": 890 }, { "epoch": 1.4525595044016955, "grad_norm": 0.1351044625043869, "learning_rate": 0.0001, "loss": 0.1765, "step": 891 }, { "epoch": 1.4541897619823931, "grad_norm": 0.1273559033870697, "learning_rate": 0.0001, "loss": 0.1856, "step": 892 }, { "epoch": 1.455820019563091, "grad_norm": 0.12091701477766037, "learning_rate": 0.0001, "loss": 0.2017, "step": 893 }, { "epoch": 1.4574502771437887, "grad_norm": 0.12357049435377121, "learning_rate": 0.0001, "loss": 0.1997, "step": 894 }, { "epoch": 1.4590805347244864, "grad_norm": 0.13915151357650757, "learning_rate": 0.0001, "loss": 0.2016, "step": 895 }, { "epoch": 1.4607107923051843, "grad_norm": 0.15922588109970093, "learning_rate": 0.0001, "loss": 0.1946, "step": 896 }, { "epoch": 1.462341049885882, "grad_norm": 0.1327250450849533, "learning_rate": 0.0001, "loss": 0.2009, "step": 897 }, { "epoch": 1.4639713074665797, "grad_norm": 0.1281212717294693, "learning_rate": 0.0001, "loss": 0.1885, "step": 898 }, { "epoch": 1.4656015650472773, "grad_norm": 0.12630882859230042, "learning_rate": 0.0001, "loss": 0.1758, "step": 899 }, { "epoch": 1.4672318226279752, "grad_norm": 0.17888978123664856, "learning_rate": 0.0001, "loss": 0.1969, "step": 900 }, { "epoch": 1.468862080208673, "grad_norm": 0.11581540107727051, "learning_rate": 0.0001, "loss": 0.1811, "step": 901 }, { "epoch": 1.4704923377893708, "grad_norm": 0.13671360909938812, "learning_rate": 0.0001, "loss": 0.1921, "step": 902 }, { "epoch": 1.4721225953700685, "grad_norm": 0.12927068769931793, "learning_rate": 0.0001, "loss": 0.1907, "step": 903 }, { "epoch": 1.4737528529507662, "grad_norm": 0.1307104527950287, "learning_rate": 0.0001, "loss": 0.1774, "step": 904 }, { "epoch": 1.4753831105314639, "grad_norm": 0.12712356448173523, "learning_rate": 0.0001, "loss": 0.1945, "step": 905 }, { "epoch": 1.4770133681121618, "grad_norm": 0.11870065331459045, "learning_rate": 0.0001, "loss": 0.1764, "step": 906 }, { "epoch": 1.4786436256928595, "grad_norm": 0.13073444366455078, "learning_rate": 0.0001, "loss": 0.1941, "step": 907 }, { "epoch": 1.4802738832735574, "grad_norm": 0.12752722203731537, "learning_rate": 0.0001, "loss": 0.1781, "step": 908 }, { "epoch": 1.481904140854255, "grad_norm": 0.18502075970172882, "learning_rate": 0.0001, "loss": 0.1963, "step": 909 }, { "epoch": 1.4835343984349527, "grad_norm": 0.14644372463226318, "learning_rate": 0.0001, "loss": 0.2031, "step": 910 }, { "epoch": 1.4851646560156504, "grad_norm": 0.1375977247953415, "learning_rate": 0.0001, "loss": 0.1872, "step": 911 }, { "epoch": 1.4867949135963483, "grad_norm": 0.1250106245279312, "learning_rate": 0.0001, "loss": 0.1939, "step": 912 }, { "epoch": 1.488425171177046, "grad_norm": 0.1255251169204712, "learning_rate": 0.0001, "loss": 0.1918, "step": 913 }, { "epoch": 1.4900554287577437, "grad_norm": 0.11189655214548111, "learning_rate": 0.0001, "loss": 0.1897, "step": 914 }, { "epoch": 1.4916856863384416, "grad_norm": 0.12730088829994202, "learning_rate": 0.0001, "loss": 0.1828, "step": 915 }, { "epoch": 1.4933159439191392, "grad_norm": 0.10540137439966202, "learning_rate": 0.0001, "loss": 0.191, "step": 916 }, { "epoch": 1.494946201499837, "grad_norm": 0.10388045758008957, "learning_rate": 0.0001, "loss": 0.1904, "step": 917 }, { "epoch": 1.4965764590805346, "grad_norm": 0.1330886036157608, "learning_rate": 0.0001, "loss": 0.1946, "step": 918 }, { "epoch": 1.4982067166612325, "grad_norm": 0.12678848206996918, "learning_rate": 0.0001, "loss": 0.2083, "step": 919 }, { "epoch": 1.4998369742419302, "grad_norm": 0.12516173720359802, "learning_rate": 0.0001, "loss": 0.202, "step": 920 }, { "epoch": 1.501467231822628, "grad_norm": 0.1177758052945137, "learning_rate": 0.0001, "loss": 0.1943, "step": 921 }, { "epoch": 1.5030974894033258, "grad_norm": 0.10760951787233353, "learning_rate": 0.0001, "loss": 0.1738, "step": 922 }, { "epoch": 1.5047277469840235, "grad_norm": 0.1439739167690277, "learning_rate": 0.0001, "loss": 0.2107, "step": 923 }, { "epoch": 1.5063580045647211, "grad_norm": 0.127821683883667, "learning_rate": 0.0001, "loss": 0.1788, "step": 924 }, { "epoch": 1.5079882621454188, "grad_norm": 0.11863593012094498, "learning_rate": 0.0001, "loss": 0.1928, "step": 925 }, { "epoch": 1.5096185197261167, "grad_norm": 0.1257062703371048, "learning_rate": 0.0001, "loss": 0.1814, "step": 926 }, { "epoch": 1.5112487773068146, "grad_norm": 0.14376260340213776, "learning_rate": 0.0001, "loss": 0.1876, "step": 927 }, { "epoch": 1.5128790348875123, "grad_norm": 0.12747377157211304, "learning_rate": 0.0001, "loss": 0.2062, "step": 928 }, { "epoch": 1.51450929246821, "grad_norm": 0.10602065175771713, "learning_rate": 0.0001, "loss": 0.1834, "step": 929 }, { "epoch": 1.5161395500489077, "grad_norm": 0.11980943381786346, "learning_rate": 0.0001, "loss": 0.1932, "step": 930 }, { "epoch": 1.5177698076296053, "grad_norm": 0.12690620124340057, "learning_rate": 0.0001, "loss": 0.2067, "step": 931 }, { "epoch": 1.5194000652103032, "grad_norm": 0.10429069399833679, "learning_rate": 0.0001, "loss": 0.1845, "step": 932 }, { "epoch": 1.5210303227910011, "grad_norm": 0.11686432361602783, "learning_rate": 0.0001, "loss": 0.2038, "step": 933 }, { "epoch": 1.5226605803716988, "grad_norm": 0.13629446923732758, "learning_rate": 0.0001, "loss": 0.2071, "step": 934 }, { "epoch": 1.5242908379523965, "grad_norm": 0.11884860694408417, "learning_rate": 0.0001, "loss": 0.1866, "step": 935 }, { "epoch": 1.5259210955330942, "grad_norm": 0.11080071330070496, "learning_rate": 0.0001, "loss": 0.1925, "step": 936 }, { "epoch": 1.5275513531137919, "grad_norm": 0.13977265357971191, "learning_rate": 0.0001, "loss": 0.2035, "step": 937 }, { "epoch": 1.5291816106944898, "grad_norm": 0.1306087076663971, "learning_rate": 0.0001, "loss": 0.1906, "step": 938 }, { "epoch": 1.5308118682751874, "grad_norm": 0.1292889267206192, "learning_rate": 0.0001, "loss": 0.2064, "step": 939 }, { "epoch": 1.5324421258558854, "grad_norm": 0.1151227205991745, "learning_rate": 0.0001, "loss": 0.1889, "step": 940 }, { "epoch": 1.534072383436583, "grad_norm": 0.23857301473617554, "learning_rate": 0.0001, "loss": 0.1936, "step": 941 }, { "epoch": 1.5357026410172807, "grad_norm": 0.12873469293117523, "learning_rate": 0.0001, "loss": 0.1864, "step": 942 }, { "epoch": 1.5373328985979784, "grad_norm": 0.1140642762184143, "learning_rate": 0.0001, "loss": 0.1844, "step": 943 }, { "epoch": 1.538963156178676, "grad_norm": 0.12932021915912628, "learning_rate": 0.0001, "loss": 0.1717, "step": 944 }, { "epoch": 1.540593413759374, "grad_norm": 0.11678753048181534, "learning_rate": 0.0001, "loss": 0.1814, "step": 945 }, { "epoch": 1.5422236713400719, "grad_norm": 0.11084496229887009, "learning_rate": 0.0001, "loss": 0.1635, "step": 946 }, { "epoch": 1.5438539289207696, "grad_norm": 0.11934110522270203, "learning_rate": 0.0001, "loss": 0.1897, "step": 947 }, { "epoch": 1.5454841865014672, "grad_norm": 0.13100844621658325, "learning_rate": 0.0001, "loss": 0.1852, "step": 948 }, { "epoch": 1.547114444082165, "grad_norm": 0.12914130091667175, "learning_rate": 0.0001, "loss": 0.1799, "step": 949 }, { "epoch": 1.5487447016628626, "grad_norm": 0.1111481636762619, "learning_rate": 0.0001, "loss": 0.1854, "step": 950 }, { "epoch": 1.5503749592435605, "grad_norm": 0.1460474282503128, "learning_rate": 0.0001, "loss": 0.1864, "step": 951 }, { "epoch": 1.5520052168242582, "grad_norm": 0.13828031718730927, "learning_rate": 0.0001, "loss": 0.1893, "step": 952 }, { "epoch": 1.553635474404956, "grad_norm": 0.1216074749827385, "learning_rate": 0.0001, "loss": 0.1866, "step": 953 }, { "epoch": 1.5552657319856538, "grad_norm": 0.11165913194417953, "learning_rate": 0.0001, "loss": 0.1858, "step": 954 }, { "epoch": 1.5568959895663514, "grad_norm": 0.12925301492214203, "learning_rate": 0.0001, "loss": 0.198, "step": 955 }, { "epoch": 1.5585262471470491, "grad_norm": 0.10411135852336884, "learning_rate": 0.0001, "loss": 0.1836, "step": 956 }, { "epoch": 1.560156504727747, "grad_norm": 0.1324637234210968, "learning_rate": 0.0001, "loss": 0.1752, "step": 957 }, { "epoch": 1.5617867623084447, "grad_norm": 0.1328878253698349, "learning_rate": 0.0001, "loss": 0.1861, "step": 958 }, { "epoch": 1.5634170198891426, "grad_norm": 0.1299583464860916, "learning_rate": 0.0001, "loss": 0.192, "step": 959 }, { "epoch": 1.5650472774698403, "grad_norm": 0.14747850596904755, "learning_rate": 0.0001, "loss": 0.1826, "step": 960 }, { "epoch": 1.566677535050538, "grad_norm": 0.1322248876094818, "learning_rate": 0.0001, "loss": 0.1924, "step": 961 }, { "epoch": 1.5683077926312357, "grad_norm": 0.14771921932697296, "learning_rate": 0.0001, "loss": 0.1908, "step": 962 }, { "epoch": 1.5699380502119333, "grad_norm": 0.12810666859149933, "learning_rate": 0.0001, "loss": 0.1933, "step": 963 }, { "epoch": 1.5715683077926312, "grad_norm": 0.12468911707401276, "learning_rate": 0.0001, "loss": 0.1969, "step": 964 }, { "epoch": 1.5731985653733291, "grad_norm": 0.13675253093242645, "learning_rate": 0.0001, "loss": 0.1838, "step": 965 }, { "epoch": 1.5748288229540268, "grad_norm": 0.14610256254673004, "learning_rate": 0.0001, "loss": 0.1871, "step": 966 }, { "epoch": 1.5764590805347245, "grad_norm": 0.16016973555088043, "learning_rate": 0.0001, "loss": 0.2068, "step": 967 }, { "epoch": 1.5780893381154222, "grad_norm": 0.11924766004085541, "learning_rate": 0.0001, "loss": 0.1963, "step": 968 }, { "epoch": 1.5797195956961199, "grad_norm": 0.14039623737335205, "learning_rate": 0.0001, "loss": 0.193, "step": 969 }, { "epoch": 1.5813498532768178, "grad_norm": 0.13962414860725403, "learning_rate": 0.0001, "loss": 0.2006, "step": 970 }, { "epoch": 1.5829801108575154, "grad_norm": 0.1010158583521843, "learning_rate": 0.0001, "loss": 0.1793, "step": 971 }, { "epoch": 1.5846103684382133, "grad_norm": 0.13020698726177216, "learning_rate": 0.0001, "loss": 0.1936, "step": 972 }, { "epoch": 1.586240626018911, "grad_norm": 0.13899926841259003, "learning_rate": 0.0001, "loss": 0.1855, "step": 973 }, { "epoch": 1.5878708835996087, "grad_norm": 0.13506311178207397, "learning_rate": 0.0001, "loss": 0.2038, "step": 974 }, { "epoch": 1.5895011411803064, "grad_norm": 0.10924082249403, "learning_rate": 0.0001, "loss": 0.189, "step": 975 }, { "epoch": 1.5911313987610043, "grad_norm": 0.14923515915870667, "learning_rate": 0.0001, "loss": 0.1987, "step": 976 }, { "epoch": 1.592761656341702, "grad_norm": 0.11555776745080948, "learning_rate": 0.0001, "loss": 0.1819, "step": 977 }, { "epoch": 1.5943919139223999, "grad_norm": 0.11343539506196976, "learning_rate": 0.0001, "loss": 0.1926, "step": 978 }, { "epoch": 1.5960221715030976, "grad_norm": 0.12200161069631577, "learning_rate": 0.0001, "loss": 0.1883, "step": 979 }, { "epoch": 1.5976524290837952, "grad_norm": 0.14577162265777588, "learning_rate": 0.0001, "loss": 0.1957, "step": 980 }, { "epoch": 1.599282686664493, "grad_norm": 0.12125492095947266, "learning_rate": 0.0001, "loss": 0.1873, "step": 981 }, { "epoch": 1.6009129442451906, "grad_norm": 0.15497727692127228, "learning_rate": 0.0001, "loss": 0.2019, "step": 982 }, { "epoch": 1.6025432018258885, "grad_norm": 0.13305650651454926, "learning_rate": 0.0001, "loss": 0.1951, "step": 983 }, { "epoch": 1.6041734594065864, "grad_norm": 0.13118699193000793, "learning_rate": 0.0001, "loss": 0.1704, "step": 984 }, { "epoch": 1.605803716987284, "grad_norm": 0.12898042798042297, "learning_rate": 0.0001, "loss": 0.1937, "step": 985 }, { "epoch": 1.6074339745679818, "grad_norm": 0.13094180822372437, "learning_rate": 0.0001, "loss": 0.1938, "step": 986 }, { "epoch": 1.6090642321486794, "grad_norm": 0.13454465568065643, "learning_rate": 0.0001, "loss": 0.1833, "step": 987 }, { "epoch": 1.6106944897293771, "grad_norm": 0.10670379549264908, "learning_rate": 0.0001, "loss": 0.1951, "step": 988 }, { "epoch": 1.612324747310075, "grad_norm": 0.1214357241988182, "learning_rate": 0.0001, "loss": 0.1837, "step": 989 }, { "epoch": 1.6139550048907727, "grad_norm": 0.12821504473686218, "learning_rate": 0.0001, "loss": 0.1937, "step": 990 }, { "epoch": 1.6155852624714706, "grad_norm": 0.1238052025437355, "learning_rate": 0.0001, "loss": 0.2057, "step": 991 }, { "epoch": 1.6172155200521683, "grad_norm": 0.15875551104545593, "learning_rate": 0.0001, "loss": 0.2037, "step": 992 }, { "epoch": 1.618845777632866, "grad_norm": 0.12168974429368973, "learning_rate": 0.0001, "loss": 0.1965, "step": 993 }, { "epoch": 1.6204760352135636, "grad_norm": 0.20948785543441772, "learning_rate": 0.0001, "loss": 0.1885, "step": 994 }, { "epoch": 1.6221062927942613, "grad_norm": 0.12937723100185394, "learning_rate": 0.0001, "loss": 0.1893, "step": 995 }, { "epoch": 1.6237365503749592, "grad_norm": 0.11235075443983078, "learning_rate": 0.0001, "loss": 0.185, "step": 996 }, { "epoch": 1.6253668079556571, "grad_norm": 0.1234828382730484, "learning_rate": 0.0001, "loss": 0.1938, "step": 997 }, { "epoch": 1.6269970655363548, "grad_norm": 0.11725595593452454, "learning_rate": 0.0001, "loss": 0.1844, "step": 998 }, { "epoch": 1.6286273231170525, "grad_norm": 0.1212792843580246, "learning_rate": 0.0001, "loss": 0.2011, "step": 999 }, { "epoch": 1.6302575806977502, "grad_norm": 0.12212159484624863, "learning_rate": 0.0001, "loss": 0.1965, "step": 1000 }, { "epoch": 1.6302575806977502, "eval_loss": 0.20038433372974396, "eval_runtime": 2056.6687, "eval_samples_per_second": 0.918, "eval_steps_per_second": 0.229, "step": 1000 }, { "epoch": 1.6318878382784479, "grad_norm": 0.102206751704216, "learning_rate": 0.0001, "loss": 0.1835, "step": 1001 }, { "epoch": 1.6335180958591458, "grad_norm": 0.12285874038934708, "learning_rate": 0.0001, "loss": 0.1868, "step": 1002 }, { "epoch": 1.6351483534398437, "grad_norm": 0.12729693949222565, "learning_rate": 0.0001, "loss": 0.185, "step": 1003 }, { "epoch": 1.6367786110205413, "grad_norm": 0.12659911811351776, "learning_rate": 0.0001, "loss": 0.1847, "step": 1004 }, { "epoch": 1.638408868601239, "grad_norm": 0.13969117403030396, "learning_rate": 0.0001, "loss": 0.1903, "step": 1005 }, { "epoch": 1.6400391261819367, "grad_norm": 0.14476704597473145, "learning_rate": 0.0001, "loss": 0.1901, "step": 1006 }, { "epoch": 1.6416693837626344, "grad_norm": 0.15515267848968506, "learning_rate": 0.0001, "loss": 0.206, "step": 1007 }, { "epoch": 1.6432996413433323, "grad_norm": 0.140212744474411, "learning_rate": 0.0001, "loss": 0.1761, "step": 1008 }, { "epoch": 1.64492989892403, "grad_norm": 0.149306982755661, "learning_rate": 0.0001, "loss": 0.1917, "step": 1009 }, { "epoch": 1.6465601565047279, "grad_norm": 0.12449630349874496, "learning_rate": 0.0001, "loss": 0.1913, "step": 1010 }, { "epoch": 1.6481904140854255, "grad_norm": 0.11689987778663635, "learning_rate": 0.0001, "loss": 0.1709, "step": 1011 }, { "epoch": 1.6498206716661232, "grad_norm": 0.14068616926670074, "learning_rate": 0.0001, "loss": 0.1958, "step": 1012 }, { "epoch": 1.651450929246821, "grad_norm": 0.11247096955776215, "learning_rate": 0.0001, "loss": 0.206, "step": 1013 }, { "epoch": 1.6530811868275186, "grad_norm": 0.1336074322462082, "learning_rate": 0.0001, "loss": 0.1828, "step": 1014 }, { "epoch": 1.6547114444082165, "grad_norm": 0.1070534735918045, "learning_rate": 0.0001, "loss": 0.188, "step": 1015 }, { "epoch": 1.6563417019889144, "grad_norm": 0.1153770238161087, "learning_rate": 0.0001, "loss": 0.1818, "step": 1016 }, { "epoch": 1.657971959569612, "grad_norm": 0.12715262174606323, "learning_rate": 0.0001, "loss": 0.1945, "step": 1017 }, { "epoch": 1.6596022171503098, "grad_norm": 0.13281576335430145, "learning_rate": 0.0001, "loss": 0.1821, "step": 1018 }, { "epoch": 1.6612324747310074, "grad_norm": 0.12460318207740784, "learning_rate": 0.0001, "loss": 0.1999, "step": 1019 }, { "epoch": 1.6628627323117051, "grad_norm": 0.12163477391004562, "learning_rate": 0.0001, "loss": 0.1883, "step": 1020 }, { "epoch": 1.664492989892403, "grad_norm": 0.11481994390487671, "learning_rate": 0.0001, "loss": 0.1898, "step": 1021 }, { "epoch": 1.6661232474731007, "grad_norm": 0.11116690933704376, "learning_rate": 0.0001, "loss": 0.1947, "step": 1022 }, { "epoch": 1.6677535050537986, "grad_norm": 0.12174486368894577, "learning_rate": 0.0001, "loss": 0.1876, "step": 1023 }, { "epoch": 1.6693837626344963, "grad_norm": 0.13974875211715698, "learning_rate": 0.0001, "loss": 0.1769, "step": 1024 }, { "epoch": 1.671014020215194, "grad_norm": 0.10470610111951828, "learning_rate": 0.0001, "loss": 0.1814, "step": 1025 }, { "epoch": 1.6726442777958916, "grad_norm": 0.11099997162818909, "learning_rate": 0.0001, "loss": 0.1944, "step": 1026 }, { "epoch": 1.6742745353765895, "grad_norm": 0.11604826152324677, "learning_rate": 0.0001, "loss": 0.1937, "step": 1027 }, { "epoch": 1.6759047929572872, "grad_norm": 0.10957235097885132, "learning_rate": 0.0001, "loss": 0.1756, "step": 1028 }, { "epoch": 1.6775350505379851, "grad_norm": 0.13742849230766296, "learning_rate": 0.0001, "loss": 0.182, "step": 1029 }, { "epoch": 1.6791653081186828, "grad_norm": 0.12097762525081635, "learning_rate": 0.0001, "loss": 0.1996, "step": 1030 }, { "epoch": 1.6807955656993805, "grad_norm": 0.12486281245946884, "learning_rate": 0.0001, "loss": 0.2007, "step": 1031 }, { "epoch": 1.6824258232800782, "grad_norm": 0.112356036901474, "learning_rate": 0.0001, "loss": 0.1873, "step": 1032 }, { "epoch": 1.6840560808607759, "grad_norm": 0.11297633498907089, "learning_rate": 0.0001, "loss": 0.1872, "step": 1033 }, { "epoch": 1.6856863384414738, "grad_norm": 0.13877838850021362, "learning_rate": 0.0001, "loss": 0.2053, "step": 1034 }, { "epoch": 1.6873165960221717, "grad_norm": 0.144087016582489, "learning_rate": 0.0001, "loss": 0.1827, "step": 1035 }, { "epoch": 1.6889468536028693, "grad_norm": 0.1187191903591156, "learning_rate": 0.0001, "loss": 0.1854, "step": 1036 }, { "epoch": 1.690577111183567, "grad_norm": 0.15161648392677307, "learning_rate": 0.0001, "loss": 0.1974, "step": 1037 }, { "epoch": 1.6922073687642647, "grad_norm": 0.1534930318593979, "learning_rate": 0.0001, "loss": 0.1882, "step": 1038 }, { "epoch": 1.6938376263449624, "grad_norm": 0.09909740835428238, "learning_rate": 0.0001, "loss": 0.1792, "step": 1039 }, { "epoch": 1.6954678839256603, "grad_norm": 0.13875959813594818, "learning_rate": 0.0001, "loss": 0.1884, "step": 1040 }, { "epoch": 1.697098141506358, "grad_norm": 0.1244412511587143, "learning_rate": 0.0001, "loss": 0.1943, "step": 1041 }, { "epoch": 1.6987283990870559, "grad_norm": 0.12313511967658997, "learning_rate": 0.0001, "loss": 0.1918, "step": 1042 }, { "epoch": 1.7003586566677535, "grad_norm": 0.12056350708007812, "learning_rate": 0.0001, "loss": 0.1751, "step": 1043 }, { "epoch": 1.7019889142484512, "grad_norm": 0.14450062811374664, "learning_rate": 0.0001, "loss": 0.1998, "step": 1044 }, { "epoch": 1.703619171829149, "grad_norm": 0.13067997992038727, "learning_rate": 0.0001, "loss": 0.1956, "step": 1045 }, { "epoch": 1.7052494294098468, "grad_norm": 0.1026124432682991, "learning_rate": 0.0001, "loss": 0.1906, "step": 1046 }, { "epoch": 1.7068796869905445, "grad_norm": 0.12087104469537735, "learning_rate": 0.0001, "loss": 0.1803, "step": 1047 }, { "epoch": 1.7085099445712424, "grad_norm": 0.12124813348054886, "learning_rate": 0.0001, "loss": 0.1974, "step": 1048 }, { "epoch": 1.71014020215194, "grad_norm": 0.12620921432971954, "learning_rate": 0.0001, "loss": 0.1952, "step": 1049 }, { "epoch": 1.7117704597326378, "grad_norm": 0.12863439321517944, "learning_rate": 0.0001, "loss": 0.1943, "step": 1050 }, { "epoch": 1.7134007173133354, "grad_norm": 0.11415056884288788, "learning_rate": 0.0001, "loss": 0.1928, "step": 1051 }, { "epoch": 1.7150309748940331, "grad_norm": 0.11588121205568314, "learning_rate": 0.0001, "loss": 0.1833, "step": 1052 }, { "epoch": 1.716661232474731, "grad_norm": 0.11711575090885162, "learning_rate": 0.0001, "loss": 0.1854, "step": 1053 }, { "epoch": 1.718291490055429, "grad_norm": 0.11321627348661423, "learning_rate": 0.0001, "loss": 0.1908, "step": 1054 }, { "epoch": 1.7199217476361266, "grad_norm": 0.11224471032619476, "learning_rate": 0.0001, "loss": 0.1812, "step": 1055 }, { "epoch": 1.7215520052168243, "grad_norm": 0.09902875125408173, "learning_rate": 0.0001, "loss": 0.185, "step": 1056 }, { "epoch": 1.723182262797522, "grad_norm": 0.12637241184711456, "learning_rate": 0.0001, "loss": 0.2065, "step": 1057 }, { "epoch": 1.7248125203782196, "grad_norm": 0.10967186093330383, "learning_rate": 0.0001, "loss": 0.1789, "step": 1058 }, { "epoch": 1.7264427779589175, "grad_norm": 0.11306209117174149, "learning_rate": 0.0001, "loss": 0.1848, "step": 1059 }, { "epoch": 1.7280730355396152, "grad_norm": 0.11642675846815109, "learning_rate": 0.0001, "loss": 0.19, "step": 1060 }, { "epoch": 1.7297032931203131, "grad_norm": 0.14287948608398438, "learning_rate": 0.0001, "loss": 0.198, "step": 1061 }, { "epoch": 1.7313335507010108, "grad_norm": 0.14489899575710297, "learning_rate": 0.0001, "loss": 0.1898, "step": 1062 }, { "epoch": 1.7329638082817085, "grad_norm": 0.12616468966007233, "learning_rate": 0.0001, "loss": 0.181, "step": 1063 }, { "epoch": 1.7345940658624062, "grad_norm": 0.1419239342212677, "learning_rate": 0.0001, "loss": 0.1981, "step": 1064 }, { "epoch": 1.7362243234431038, "grad_norm": 0.12401966750621796, "learning_rate": 0.0001, "loss": 0.1822, "step": 1065 }, { "epoch": 1.7378545810238017, "grad_norm": 0.15133000910282135, "learning_rate": 0.0001, "loss": 0.1974, "step": 1066 }, { "epoch": 1.7394848386044997, "grad_norm": 0.1293191760778427, "learning_rate": 0.0001, "loss": 0.1904, "step": 1067 }, { "epoch": 1.7411150961851973, "grad_norm": 0.12822459638118744, "learning_rate": 0.0001, "loss": 0.1893, "step": 1068 }, { "epoch": 1.742745353765895, "grad_norm": 0.10721483081579208, "learning_rate": 0.0001, "loss": 0.1752, "step": 1069 }, { "epoch": 1.7443756113465927, "grad_norm": 0.13893002271652222, "learning_rate": 0.0001, "loss": 0.1981, "step": 1070 }, { "epoch": 1.7460058689272904, "grad_norm": 0.1082753837108612, "learning_rate": 0.0001, "loss": 0.1925, "step": 1071 }, { "epoch": 1.7476361265079883, "grad_norm": 0.12488586455583572, "learning_rate": 0.0001, "loss": 0.1759, "step": 1072 }, { "epoch": 1.7492663840886862, "grad_norm": 0.10307318717241287, "learning_rate": 0.0001, "loss": 0.1685, "step": 1073 }, { "epoch": 1.7508966416693839, "grad_norm": 0.12732993066310883, "learning_rate": 0.0001, "loss": 0.1843, "step": 1074 }, { "epoch": 1.7525268992500815, "grad_norm": 0.12339639663696289, "learning_rate": 0.0001, "loss": 0.185, "step": 1075 }, { "epoch": 1.7541571568307792, "grad_norm": 0.10467349737882614, "learning_rate": 0.0001, "loss": 0.1787, "step": 1076 }, { "epoch": 1.755787414411477, "grad_norm": 0.1297389268875122, "learning_rate": 0.0001, "loss": 0.1865, "step": 1077 }, { "epoch": 1.7574176719921748, "grad_norm": 0.13278993964195251, "learning_rate": 0.0001, "loss": 0.1872, "step": 1078 }, { "epoch": 1.7590479295728725, "grad_norm": 0.125654399394989, "learning_rate": 0.0001, "loss": 0.1731, "step": 1079 }, { "epoch": 1.7606781871535704, "grad_norm": 0.12131417542695999, "learning_rate": 0.0001, "loss": 0.1717, "step": 1080 }, { "epoch": 1.762308444734268, "grad_norm": 0.12412076443433762, "learning_rate": 0.0001, "loss": 0.1819, "step": 1081 }, { "epoch": 1.7639387023149657, "grad_norm": 0.12504039704799652, "learning_rate": 0.0001, "loss": 0.1714, "step": 1082 }, { "epoch": 1.7655689598956634, "grad_norm": 0.12615200877189636, "learning_rate": 0.0001, "loss": 0.1982, "step": 1083 }, { "epoch": 1.767199217476361, "grad_norm": 0.13149379193782806, "learning_rate": 0.0001, "loss": 0.1983, "step": 1084 }, { "epoch": 1.768829475057059, "grad_norm": 0.13638049364089966, "learning_rate": 0.0001, "loss": 0.1938, "step": 1085 }, { "epoch": 1.770459732637757, "grad_norm": 0.10553500801324844, "learning_rate": 0.0001, "loss": 0.1867, "step": 1086 }, { "epoch": 1.7720899902184546, "grad_norm": 0.13788992166519165, "learning_rate": 0.0001, "loss": 0.1825, "step": 1087 }, { "epoch": 1.7737202477991523, "grad_norm": 0.11936747282743454, "learning_rate": 0.0001, "loss": 0.1885, "step": 1088 }, { "epoch": 1.77535050537985, "grad_norm": 0.133977010846138, "learning_rate": 0.0001, "loss": 0.1954, "step": 1089 }, { "epoch": 1.7769807629605476, "grad_norm": 0.12340915203094482, "learning_rate": 0.0001, "loss": 0.1825, "step": 1090 }, { "epoch": 1.7786110205412455, "grad_norm": 0.11585668474435806, "learning_rate": 0.0001, "loss": 0.1854, "step": 1091 }, { "epoch": 1.7802412781219432, "grad_norm": 0.1759333461523056, "learning_rate": 0.0001, "loss": 0.1947, "step": 1092 }, { "epoch": 1.7818715357026411, "grad_norm": 0.12591791152954102, "learning_rate": 0.0001, "loss": 0.1897, "step": 1093 }, { "epoch": 1.7835017932833388, "grad_norm": 0.11980108171701431, "learning_rate": 0.0001, "loss": 0.1962, "step": 1094 }, { "epoch": 1.7851320508640365, "grad_norm": 0.12715961039066315, "learning_rate": 0.0001, "loss": 0.1961, "step": 1095 }, { "epoch": 1.7867623084447342, "grad_norm": 0.1274608075618744, "learning_rate": 0.0001, "loss": 0.1855, "step": 1096 }, { "epoch": 1.788392566025432, "grad_norm": 0.15268570184707642, "learning_rate": 0.0001, "loss": 0.1918, "step": 1097 }, { "epoch": 1.7900228236061297, "grad_norm": 0.1284448206424713, "learning_rate": 0.0001, "loss": 0.1912, "step": 1098 }, { "epoch": 1.7916530811868276, "grad_norm": 0.1219286322593689, "learning_rate": 0.0001, "loss": 0.1792, "step": 1099 }, { "epoch": 1.7932833387675253, "grad_norm": 0.11822406947612762, "learning_rate": 0.0001, "loss": 0.1838, "step": 1100 }, { "epoch": 1.794913596348223, "grad_norm": 0.12364920228719711, "learning_rate": 0.0001, "loss": 0.1897, "step": 1101 }, { "epoch": 1.7965438539289207, "grad_norm": 0.11299461871385574, "learning_rate": 0.0001, "loss": 0.1955, "step": 1102 }, { "epoch": 1.7981741115096184, "grad_norm": 0.13961385190486908, "learning_rate": 0.0001, "loss": 0.1899, "step": 1103 }, { "epoch": 1.7998043690903163, "grad_norm": 0.1028355211019516, "learning_rate": 0.0001, "loss": 0.1776, "step": 1104 }, { "epoch": 1.8014346266710142, "grad_norm": 0.107155442237854, "learning_rate": 0.0001, "loss": 0.1731, "step": 1105 }, { "epoch": 1.8030648842517119, "grad_norm": 0.11978352069854736, "learning_rate": 0.0001, "loss": 0.1929, "step": 1106 }, { "epoch": 1.8046951418324095, "grad_norm": 0.12182936072349548, "learning_rate": 0.0001, "loss": 0.1874, "step": 1107 }, { "epoch": 1.8063253994131072, "grad_norm": 0.1270841509103775, "learning_rate": 0.0001, "loss": 0.1832, "step": 1108 }, { "epoch": 1.807955656993805, "grad_norm": 0.10812801122665405, "learning_rate": 0.0001, "loss": 0.1704, "step": 1109 }, { "epoch": 1.8095859145745028, "grad_norm": 0.11686091870069504, "learning_rate": 0.0001, "loss": 0.1765, "step": 1110 }, { "epoch": 1.8112161721552005, "grad_norm": 0.1108395904302597, "learning_rate": 0.0001, "loss": 0.1832, "step": 1111 }, { "epoch": 1.8128464297358984, "grad_norm": 0.13820597529411316, "learning_rate": 0.0001, "loss": 0.186, "step": 1112 }, { "epoch": 1.814476687316596, "grad_norm": 0.1270742267370224, "learning_rate": 0.0001, "loss": 0.191, "step": 1113 }, { "epoch": 1.8161069448972937, "grad_norm": 0.160540372133255, "learning_rate": 0.0001, "loss": 0.1821, "step": 1114 }, { "epoch": 1.8177372024779914, "grad_norm": 0.11482474952936172, "learning_rate": 0.0001, "loss": 0.1914, "step": 1115 }, { "epoch": 1.8193674600586893, "grad_norm": 0.13658450543880463, "learning_rate": 0.0001, "loss": 0.173, "step": 1116 }, { "epoch": 1.820997717639387, "grad_norm": 0.10865119844675064, "learning_rate": 0.0001, "loss": 0.1821, "step": 1117 }, { "epoch": 1.822627975220085, "grad_norm": 0.10327646881341934, "learning_rate": 0.0001, "loss": 0.1744, "step": 1118 }, { "epoch": 1.8242582328007826, "grad_norm": 0.10378043353557587, "learning_rate": 0.0001, "loss": 0.1717, "step": 1119 }, { "epoch": 1.8258884903814803, "grad_norm": 0.11684548109769821, "learning_rate": 0.0001, "loss": 0.1893, "step": 1120 }, { "epoch": 1.827518747962178, "grad_norm": 0.1185649037361145, "learning_rate": 0.0001, "loss": 0.1865, "step": 1121 }, { "epoch": 1.8291490055428756, "grad_norm": 0.11936715245246887, "learning_rate": 0.0001, "loss": 0.1844, "step": 1122 }, { "epoch": 1.8307792631235735, "grad_norm": 0.10836116224527359, "learning_rate": 0.0001, "loss": 0.1826, "step": 1123 }, { "epoch": 1.8324095207042714, "grad_norm": 0.10425475239753723, "learning_rate": 0.0001, "loss": 0.1873, "step": 1124 }, { "epoch": 1.8340397782849691, "grad_norm": 0.12443797290325165, "learning_rate": 0.0001, "loss": 0.1775, "step": 1125 }, { "epoch": 1.8356700358656668, "grad_norm": 0.1314772516489029, "learning_rate": 0.0001, "loss": 0.1825, "step": 1126 }, { "epoch": 1.8373002934463645, "grad_norm": 0.1321101188659668, "learning_rate": 0.0001, "loss": 0.1775, "step": 1127 }, { "epoch": 1.8389305510270622, "grad_norm": 0.11217895895242691, "learning_rate": 0.0001, "loss": 0.1814, "step": 1128 }, { "epoch": 1.84056080860776, "grad_norm": 0.11161702871322632, "learning_rate": 0.0001, "loss": 0.1856, "step": 1129 }, { "epoch": 1.8421910661884577, "grad_norm": 0.11947924643754959, "learning_rate": 0.0001, "loss": 0.1818, "step": 1130 }, { "epoch": 1.8438213237691556, "grad_norm": 0.15216386318206787, "learning_rate": 0.0001, "loss": 0.18, "step": 1131 }, { "epoch": 1.8454515813498533, "grad_norm": 0.12306888401508331, "learning_rate": 0.0001, "loss": 0.1918, "step": 1132 }, { "epoch": 1.847081838930551, "grad_norm": 0.1321091651916504, "learning_rate": 0.0001, "loss": 0.1945, "step": 1133 }, { "epoch": 1.8487120965112487, "grad_norm": 0.12340245395898819, "learning_rate": 0.0001, "loss": 0.1853, "step": 1134 }, { "epoch": 1.8503423540919464, "grad_norm": 0.1099599152803421, "learning_rate": 0.0001, "loss": 0.1979, "step": 1135 }, { "epoch": 1.8519726116726443, "grad_norm": 0.13142207264900208, "learning_rate": 0.0001, "loss": 0.1811, "step": 1136 }, { "epoch": 1.8536028692533422, "grad_norm": 0.11548743396997452, "learning_rate": 0.0001, "loss": 0.1958, "step": 1137 }, { "epoch": 1.8552331268340398, "grad_norm": 0.1179068312048912, "learning_rate": 0.0001, "loss": 0.1836, "step": 1138 }, { "epoch": 1.8568633844147375, "grad_norm": 0.1258961409330368, "learning_rate": 0.0001, "loss": 0.1883, "step": 1139 }, { "epoch": 1.8584936419954352, "grad_norm": 0.11142059415578842, "learning_rate": 0.0001, "loss": 0.1921, "step": 1140 }, { "epoch": 1.8601238995761329, "grad_norm": 0.11219684034585953, "learning_rate": 0.0001, "loss": 0.1818, "step": 1141 }, { "epoch": 1.8617541571568308, "grad_norm": 0.11732007563114166, "learning_rate": 0.0001, "loss": 0.185, "step": 1142 }, { "epoch": 1.8633844147375287, "grad_norm": 0.10912680625915527, "learning_rate": 0.0001, "loss": 0.1823, "step": 1143 }, { "epoch": 1.8650146723182264, "grad_norm": 0.13096576929092407, "learning_rate": 0.0001, "loss": 0.1958, "step": 1144 }, { "epoch": 1.866644929898924, "grad_norm": 0.1521504819393158, "learning_rate": 0.0001, "loss": 0.1884, "step": 1145 }, { "epoch": 1.8682751874796217, "grad_norm": 0.12232112884521484, "learning_rate": 0.0001, "loss": 0.2035, "step": 1146 }, { "epoch": 1.8699054450603194, "grad_norm": 0.10666659474372864, "learning_rate": 0.0001, "loss": 0.1678, "step": 1147 }, { "epoch": 1.8715357026410173, "grad_norm": 0.12789608538150787, "learning_rate": 0.0001, "loss": 0.1705, "step": 1148 }, { "epoch": 1.873165960221715, "grad_norm": 0.1496013104915619, "learning_rate": 0.0001, "loss": 0.2019, "step": 1149 }, { "epoch": 1.874796217802413, "grad_norm": 0.1169729083776474, "learning_rate": 0.0001, "loss": 0.1706, "step": 1150 }, { "epoch": 1.8764264753831106, "grad_norm": 0.14244718849658966, "learning_rate": 0.0001, "loss": 0.1868, "step": 1151 }, { "epoch": 1.8780567329638083, "grad_norm": 0.12071932852268219, "learning_rate": 0.0001, "loss": 0.186, "step": 1152 }, { "epoch": 1.879686990544506, "grad_norm": 0.12490394711494446, "learning_rate": 0.0001, "loss": 0.1912, "step": 1153 }, { "epoch": 1.8813172481252036, "grad_norm": 0.13060052692890167, "learning_rate": 0.0001, "loss": 0.2214, "step": 1154 }, { "epoch": 1.8829475057059015, "grad_norm": 0.12722712755203247, "learning_rate": 0.0001, "loss": 0.1813, "step": 1155 }, { "epoch": 1.8845777632865994, "grad_norm": 0.12535281479358673, "learning_rate": 0.0001, "loss": 0.1961, "step": 1156 }, { "epoch": 1.886208020867297, "grad_norm": 0.11369742453098297, "learning_rate": 0.0001, "loss": 0.192, "step": 1157 }, { "epoch": 1.8878382784479948, "grad_norm": 0.1047392338514328, "learning_rate": 0.0001, "loss": 0.1765, "step": 1158 }, { "epoch": 1.8894685360286925, "grad_norm": 0.13275443017482758, "learning_rate": 0.0001, "loss": 0.1865, "step": 1159 }, { "epoch": 1.8910987936093901, "grad_norm": 0.11752204596996307, "learning_rate": 0.0001, "loss": 0.1886, "step": 1160 }, { "epoch": 1.892729051190088, "grad_norm": 0.11460895091295242, "learning_rate": 0.0001, "loss": 0.1898, "step": 1161 }, { "epoch": 1.8943593087707857, "grad_norm": 0.12889966368675232, "learning_rate": 0.0001, "loss": 0.1826, "step": 1162 }, { "epoch": 1.8959895663514836, "grad_norm": 0.11824796348810196, "learning_rate": 0.0001, "loss": 0.1775, "step": 1163 }, { "epoch": 1.8976198239321813, "grad_norm": 0.10617636889219284, "learning_rate": 0.0001, "loss": 0.1718, "step": 1164 }, { "epoch": 1.899250081512879, "grad_norm": 0.12249031662940979, "learning_rate": 0.0001, "loss": 0.1732, "step": 1165 }, { "epoch": 1.9008803390935767, "grad_norm": 0.1171279028058052, "learning_rate": 0.0001, "loss": 0.1837, "step": 1166 }, { "epoch": 1.9025105966742746, "grad_norm": 0.12532579898834229, "learning_rate": 0.0001, "loss": 0.1846, "step": 1167 }, { "epoch": 1.9041408542549723, "grad_norm": 0.13807226717472076, "learning_rate": 0.0001, "loss": 0.18, "step": 1168 }, { "epoch": 1.9057711118356702, "grad_norm": 0.1306944191455841, "learning_rate": 0.0001, "loss": 0.1849, "step": 1169 }, { "epoch": 1.9074013694163678, "grad_norm": 0.3306502401828766, "learning_rate": 0.0001, "loss": 0.2122, "step": 1170 }, { "epoch": 1.9090316269970655, "grad_norm": 0.1267193704843521, "learning_rate": 0.0001, "loss": 0.1893, "step": 1171 }, { "epoch": 1.9106618845777632, "grad_norm": 0.1224604994058609, "learning_rate": 0.0001, "loss": 0.1989, "step": 1172 }, { "epoch": 1.9122921421584609, "grad_norm": 0.11894525587558746, "learning_rate": 0.0001, "loss": 0.1731, "step": 1173 }, { "epoch": 1.9139223997391588, "grad_norm": 0.11913865804672241, "learning_rate": 0.0001, "loss": 0.1845, "step": 1174 }, { "epoch": 1.9155526573198567, "grad_norm": 0.11289380490779877, "learning_rate": 0.0001, "loss": 0.182, "step": 1175 }, { "epoch": 1.9171829149005544, "grad_norm": 0.13549114763736725, "learning_rate": 0.0001, "loss": 0.206, "step": 1176 }, { "epoch": 1.918813172481252, "grad_norm": 0.1213872879743576, "learning_rate": 0.0001, "loss": 0.1779, "step": 1177 }, { "epoch": 1.9204434300619497, "grad_norm": 0.13372060656547546, "learning_rate": 0.0001, "loss": 0.1761, "step": 1178 }, { "epoch": 1.9220736876426474, "grad_norm": 0.09771383553743362, "learning_rate": 0.0001, "loss": 0.1687, "step": 1179 }, { "epoch": 1.9237039452233453, "grad_norm": 0.12244701385498047, "learning_rate": 0.0001, "loss": 0.1827, "step": 1180 }, { "epoch": 1.925334202804043, "grad_norm": 0.1301904022693634, "learning_rate": 0.0001, "loss": 0.1814, "step": 1181 }, { "epoch": 1.926964460384741, "grad_norm": 0.14816220104694366, "learning_rate": 0.0001, "loss": 0.2002, "step": 1182 }, { "epoch": 1.9285947179654386, "grad_norm": 0.11727407574653625, "learning_rate": 0.0001, "loss": 0.1835, "step": 1183 }, { "epoch": 1.9302249755461363, "grad_norm": 0.13211950659751892, "learning_rate": 0.0001, "loss": 0.1859, "step": 1184 }, { "epoch": 1.931855233126834, "grad_norm": 0.12375782430171967, "learning_rate": 0.0001, "loss": 0.1837, "step": 1185 }, { "epoch": 1.9334854907075318, "grad_norm": 0.14515690505504608, "learning_rate": 0.0001, "loss": 0.1757, "step": 1186 }, { "epoch": 1.9351157482882295, "grad_norm": 0.11579223722219467, "learning_rate": 0.0001, "loss": 0.1767, "step": 1187 }, { "epoch": 1.9367460058689274, "grad_norm": 0.11580117791891098, "learning_rate": 0.0001, "loss": 0.1745, "step": 1188 }, { "epoch": 1.938376263449625, "grad_norm": 0.13860304653644562, "learning_rate": 0.0001, "loss": 0.1899, "step": 1189 }, { "epoch": 1.9400065210303228, "grad_norm": 0.10386091470718384, "learning_rate": 0.0001, "loss": 0.1799, "step": 1190 }, { "epoch": 1.9416367786110205, "grad_norm": 0.12089010328054428, "learning_rate": 0.0001, "loss": 0.1886, "step": 1191 }, { "epoch": 1.9432670361917181, "grad_norm": 0.13745586574077606, "learning_rate": 0.0001, "loss": 0.1865, "step": 1192 }, { "epoch": 1.944897293772416, "grad_norm": 0.12096656113862991, "learning_rate": 0.0001, "loss": 0.1929, "step": 1193 }, { "epoch": 1.946527551353114, "grad_norm": 0.24482771754264832, "learning_rate": 0.0001, "loss": 0.1911, "step": 1194 }, { "epoch": 1.9481578089338116, "grad_norm": 0.13980430364608765, "learning_rate": 0.0001, "loss": 0.1951, "step": 1195 }, { "epoch": 1.9497880665145093, "grad_norm": 0.1281464695930481, "learning_rate": 0.0001, "loss": 0.1797, "step": 1196 }, { "epoch": 1.951418324095207, "grad_norm": 0.11813049763441086, "learning_rate": 0.0001, "loss": 0.1836, "step": 1197 }, { "epoch": 1.9530485816759047, "grad_norm": 0.13674259185791016, "learning_rate": 0.0001, "loss": 0.1921, "step": 1198 }, { "epoch": 1.9546788392566026, "grad_norm": 0.12205852568149567, "learning_rate": 0.0001, "loss": 0.1726, "step": 1199 }, { "epoch": 1.9563090968373003, "grad_norm": 0.12858718633651733, "learning_rate": 0.0001, "loss": 0.1814, "step": 1200 }, { "epoch": 1.9579393544179982, "grad_norm": 0.1414395272731781, "learning_rate": 0.0001, "loss": 0.1791, "step": 1201 }, { "epoch": 1.9595696119986958, "grad_norm": 0.10766829550266266, "learning_rate": 0.0001, "loss": 0.1905, "step": 1202 }, { "epoch": 1.9611998695793935, "grad_norm": 0.12443807721138, "learning_rate": 0.0001, "loss": 0.1847, "step": 1203 }, { "epoch": 1.9628301271600912, "grad_norm": 0.1413409411907196, "learning_rate": 0.0001, "loss": 0.1943, "step": 1204 }, { "epoch": 1.9644603847407889, "grad_norm": 0.12135512381792068, "learning_rate": 0.0001, "loss": 0.1957, "step": 1205 }, { "epoch": 1.9660906423214868, "grad_norm": 0.15406377613544464, "learning_rate": 0.0001, "loss": 0.1799, "step": 1206 }, { "epoch": 1.9677208999021847, "grad_norm": 0.10838975757360458, "learning_rate": 0.0001, "loss": 0.1849, "step": 1207 }, { "epoch": 1.9693511574828824, "grad_norm": 0.12363716959953308, "learning_rate": 0.0001, "loss": 0.2111, "step": 1208 }, { "epoch": 1.97098141506358, "grad_norm": 0.10429894924163818, "learning_rate": 0.0001, "loss": 0.1778, "step": 1209 }, { "epoch": 1.9726116726442777, "grad_norm": 0.11979079246520996, "learning_rate": 0.0001, "loss": 0.197, "step": 1210 }, { "epoch": 1.9742419302249754, "grad_norm": 0.10798018425703049, "learning_rate": 0.0001, "loss": 0.1697, "step": 1211 }, { "epoch": 1.9758721878056733, "grad_norm": 0.11077526211738586, "learning_rate": 0.0001, "loss": 0.1881, "step": 1212 }, { "epoch": 1.9775024453863712, "grad_norm": 0.13390393555164337, "learning_rate": 0.0001, "loss": 0.1909, "step": 1213 }, { "epoch": 1.979132702967069, "grad_norm": 0.11331623047590256, "learning_rate": 0.0001, "loss": 0.1869, "step": 1214 }, { "epoch": 1.9807629605477666, "grad_norm": 0.12054945528507233, "learning_rate": 0.0001, "loss": 0.2016, "step": 1215 }, { "epoch": 1.9823932181284643, "grad_norm": 0.11420212686061859, "learning_rate": 0.0001, "loss": 0.1714, "step": 1216 }, { "epoch": 1.984023475709162, "grad_norm": 0.1471630334854126, "learning_rate": 0.0001, "loss": 0.1991, "step": 1217 }, { "epoch": 1.9856537332898598, "grad_norm": 0.11184452474117279, "learning_rate": 0.0001, "loss": 0.1728, "step": 1218 }, { "epoch": 1.9872839908705575, "grad_norm": 0.12244154512882233, "learning_rate": 0.0001, "loss": 0.1725, "step": 1219 }, { "epoch": 1.9889142484512554, "grad_norm": 0.11920984834432602, "learning_rate": 0.0001, "loss": 0.1746, "step": 1220 }, { "epoch": 1.990544506031953, "grad_norm": 0.14389824867248535, "learning_rate": 0.0001, "loss": 0.192, "step": 1221 }, { "epoch": 1.9921747636126508, "grad_norm": 0.13001768290996552, "learning_rate": 0.0001, "loss": 0.1988, "step": 1222 }, { "epoch": 1.9938050211933485, "grad_norm": 0.2044341266155243, "learning_rate": 0.0001, "loss": 0.1783, "step": 1223 }, { "epoch": 1.9954352787740461, "grad_norm": 0.11807113885879517, "learning_rate": 0.0001, "loss": 0.1748, "step": 1224 }, { "epoch": 1.997065536354744, "grad_norm": 0.12389995157718658, "learning_rate": 0.0001, "loss": 0.1968, "step": 1225 }, { "epoch": 1.998695793935442, "grad_norm": 0.11298985034227371, "learning_rate": 0.0001, "loss": 0.1781, "step": 1226 }, { "epoch": 2.0003260515161396, "grad_norm": 0.15368470549583435, "learning_rate": 0.0001, "loss": 0.1889, "step": 1227 }, { "epoch": 2.0019563090968373, "grad_norm": 0.13915176689624786, "learning_rate": 0.0001, "loss": 0.1791, "step": 1228 }, { "epoch": 2.003586566677535, "grad_norm": 0.13117247819900513, "learning_rate": 0.0001, "loss": 0.1753, "step": 1229 }, { "epoch": 2.0052168242582327, "grad_norm": 0.1464276909828186, "learning_rate": 0.0001, "loss": 0.1773, "step": 1230 }, { "epoch": 2.0068470818389303, "grad_norm": 0.1339617669582367, "learning_rate": 0.0001, "loss": 0.1857, "step": 1231 }, { "epoch": 2.0084773394196285, "grad_norm": 0.12631379067897797, "learning_rate": 0.0001, "loss": 0.1711, "step": 1232 }, { "epoch": 2.010107597000326, "grad_norm": 0.1561838537454605, "learning_rate": 0.0001, "loss": 0.173, "step": 1233 }, { "epoch": 2.011737854581024, "grad_norm": 0.13479989767074585, "learning_rate": 0.0001, "loss": 0.1661, "step": 1234 }, { "epoch": 2.0133681121617215, "grad_norm": 0.14471793174743652, "learning_rate": 0.0001, "loss": 0.1712, "step": 1235 }, { "epoch": 2.014998369742419, "grad_norm": 0.13273389637470245, "learning_rate": 0.0001, "loss": 0.1689, "step": 1236 }, { "epoch": 2.016628627323117, "grad_norm": 0.15367534756660461, "learning_rate": 0.0001, "loss": 0.1917, "step": 1237 }, { "epoch": 2.018258884903815, "grad_norm": 0.14998719096183777, "learning_rate": 0.0001, "loss": 0.1683, "step": 1238 }, { "epoch": 2.0198891424845127, "grad_norm": 0.12296893447637558, "learning_rate": 0.0001, "loss": 0.1604, "step": 1239 }, { "epoch": 2.0215194000652104, "grad_norm": 0.1817280501127243, "learning_rate": 0.0001, "loss": 0.1783, "step": 1240 }, { "epoch": 2.023149657645908, "grad_norm": 0.13128723204135895, "learning_rate": 0.0001, "loss": 0.1664, "step": 1241 }, { "epoch": 2.0247799152266057, "grad_norm": 0.10108043253421783, "learning_rate": 0.0001, "loss": 0.1612, "step": 1242 }, { "epoch": 2.0264101728073034, "grad_norm": 0.12584680318832397, "learning_rate": 0.0001, "loss": 0.1769, "step": 1243 }, { "epoch": 2.0280404303880015, "grad_norm": 0.11849281191825867, "learning_rate": 0.0001, "loss": 0.1684, "step": 1244 }, { "epoch": 2.029670687968699, "grad_norm": 0.15767361223697662, "learning_rate": 0.0001, "loss": 0.172, "step": 1245 }, { "epoch": 2.031300945549397, "grad_norm": 0.13649272918701172, "learning_rate": 0.0001, "loss": 0.1777, "step": 1246 }, { "epoch": 2.0329312031300946, "grad_norm": 0.12619361281394958, "learning_rate": 0.0001, "loss": 0.1843, "step": 1247 }, { "epoch": 2.0345614607107922, "grad_norm": 0.12181363254785538, "learning_rate": 0.0001, "loss": 0.1651, "step": 1248 }, { "epoch": 2.03619171829149, "grad_norm": 0.14900757372379303, "learning_rate": 0.0001, "loss": 0.1777, "step": 1249 }, { "epoch": 2.0378219758721876, "grad_norm": 0.13565577566623688, "learning_rate": 0.0001, "loss": 0.1797, "step": 1250 }, { "epoch": 2.0394522334528857, "grad_norm": 0.11237785965204239, "learning_rate": 0.0001, "loss": 0.1577, "step": 1251 }, { "epoch": 2.0410824910335834, "grad_norm": 0.14608171582221985, "learning_rate": 0.0001, "loss": 0.1792, "step": 1252 }, { "epoch": 2.042712748614281, "grad_norm": 0.14351628720760345, "learning_rate": 0.0001, "loss": 0.1848, "step": 1253 }, { "epoch": 2.0443430061949788, "grad_norm": 0.13642996549606323, "learning_rate": 0.0001, "loss": 0.1919, "step": 1254 }, { "epoch": 2.0459732637756765, "grad_norm": 0.13869710266590118, "learning_rate": 0.0001, "loss": 0.1739, "step": 1255 }, { "epoch": 2.047603521356374, "grad_norm": 0.12874935567378998, "learning_rate": 0.0001, "loss": 0.1785, "step": 1256 }, { "epoch": 2.0492337789370723, "grad_norm": 0.12782157957553864, "learning_rate": 0.0001, "loss": 0.1802, "step": 1257 }, { "epoch": 2.05086403651777, "grad_norm": 0.12892591953277588, "learning_rate": 0.0001, "loss": 0.1704, "step": 1258 }, { "epoch": 2.0524942940984676, "grad_norm": 0.1089789867401123, "learning_rate": 0.0001, "loss": 0.1578, "step": 1259 }, { "epoch": 2.0541245516791653, "grad_norm": 0.1354093849658966, "learning_rate": 0.0001, "loss": 0.1675, "step": 1260 }, { "epoch": 2.055754809259863, "grad_norm": 0.13709183037281036, "learning_rate": 0.0001, "loss": 0.1735, "step": 1261 }, { "epoch": 2.0573850668405607, "grad_norm": 0.14396348595619202, "learning_rate": 0.0001, "loss": 0.1815, "step": 1262 }, { "epoch": 2.0590153244212583, "grad_norm": 0.1237788200378418, "learning_rate": 0.0001, "loss": 0.1565, "step": 1263 }, { "epoch": 2.0606455820019565, "grad_norm": 0.13664290308952332, "learning_rate": 0.0001, "loss": 0.1786, "step": 1264 }, { "epoch": 2.062275839582654, "grad_norm": 0.16328440606594086, "learning_rate": 0.0001, "loss": 0.1598, "step": 1265 }, { "epoch": 2.063906097163352, "grad_norm": 0.1353476196527481, "learning_rate": 0.0001, "loss": 0.1659, "step": 1266 }, { "epoch": 2.0655363547440495, "grad_norm": 0.16309651732444763, "learning_rate": 0.0001, "loss": 0.1999, "step": 1267 }, { "epoch": 2.067166612324747, "grad_norm": 0.13841314613819122, "learning_rate": 0.0001, "loss": 0.1974, "step": 1268 }, { "epoch": 2.068796869905445, "grad_norm": 0.12356504052877426, "learning_rate": 0.0001, "loss": 0.1695, "step": 1269 }, { "epoch": 2.070427127486143, "grad_norm": 0.13916341960430145, "learning_rate": 0.0001, "loss": 0.165, "step": 1270 }, { "epoch": 2.0720573850668407, "grad_norm": 0.13832920789718628, "learning_rate": 0.0001, "loss": 0.1619, "step": 1271 }, { "epoch": 2.0736876426475384, "grad_norm": 0.14106802642345428, "learning_rate": 0.0001, "loss": 0.1749, "step": 1272 }, { "epoch": 2.075317900228236, "grad_norm": 0.12422151118516922, "learning_rate": 0.0001, "loss": 0.1718, "step": 1273 }, { "epoch": 2.0769481578089337, "grad_norm": 0.14482441544532776, "learning_rate": 0.0001, "loss": 0.1807, "step": 1274 }, { "epoch": 2.0785784153896314, "grad_norm": 0.11608091741800308, "learning_rate": 0.0001, "loss": 0.1759, "step": 1275 }, { "epoch": 2.0802086729703295, "grad_norm": 0.12500539422035217, "learning_rate": 0.0001, "loss": 0.1542, "step": 1276 }, { "epoch": 2.081838930551027, "grad_norm": 0.13771778345108032, "learning_rate": 0.0001, "loss": 0.1899, "step": 1277 }, { "epoch": 2.083469188131725, "grad_norm": 0.12829305231571198, "learning_rate": 0.0001, "loss": 0.1617, "step": 1278 }, { "epoch": 2.0850994457124226, "grad_norm": 0.12664735317230225, "learning_rate": 0.0001, "loss": 0.1756, "step": 1279 }, { "epoch": 2.0867297032931202, "grad_norm": 0.14798226952552795, "learning_rate": 0.0001, "loss": 0.175, "step": 1280 }, { "epoch": 2.088359960873818, "grad_norm": 0.13553886115550995, "learning_rate": 0.0001, "loss": 0.1745, "step": 1281 }, { "epoch": 2.0899902184545156, "grad_norm": 0.14772772789001465, "learning_rate": 0.0001, "loss": 0.1727, "step": 1282 }, { "epoch": 2.0916204760352137, "grad_norm": 0.14489415287971497, "learning_rate": 0.0001, "loss": 0.1595, "step": 1283 }, { "epoch": 2.0932507336159114, "grad_norm": 0.14262838661670685, "learning_rate": 0.0001, "loss": 0.1735, "step": 1284 }, { "epoch": 2.094880991196609, "grad_norm": 0.13312998414039612, "learning_rate": 0.0001, "loss": 0.172, "step": 1285 }, { "epoch": 2.0965112487773068, "grad_norm": 0.1361507624387741, "learning_rate": 0.0001, "loss": 0.1679, "step": 1286 }, { "epoch": 2.0981415063580044, "grad_norm": 0.13537771999835968, "learning_rate": 0.0001, "loss": 0.1656, "step": 1287 }, { "epoch": 2.099771763938702, "grad_norm": 0.16186851263046265, "learning_rate": 0.0001, "loss": 0.1786, "step": 1288 }, { "epoch": 2.1014020215194003, "grad_norm": 0.1386481523513794, "learning_rate": 0.0001, "loss": 0.1745, "step": 1289 }, { "epoch": 2.103032279100098, "grad_norm": 0.12809383869171143, "learning_rate": 0.0001, "loss": 0.1676, "step": 1290 }, { "epoch": 2.1046625366807956, "grad_norm": 0.12493950873613358, "learning_rate": 0.0001, "loss": 0.1634, "step": 1291 }, { "epoch": 2.1062927942614933, "grad_norm": 0.12215922027826309, "learning_rate": 0.0001, "loss": 0.1633, "step": 1292 }, { "epoch": 2.107923051842191, "grad_norm": 0.14522279798984528, "learning_rate": 0.0001, "loss": 0.1785, "step": 1293 }, { "epoch": 2.1095533094228887, "grad_norm": 0.16697941720485687, "learning_rate": 0.0001, "loss": 0.1714, "step": 1294 }, { "epoch": 2.111183567003587, "grad_norm": 0.12275785952806473, "learning_rate": 0.0001, "loss": 0.174, "step": 1295 }, { "epoch": 2.1128138245842845, "grad_norm": 0.12233259528875351, "learning_rate": 0.0001, "loss": 0.1639, "step": 1296 }, { "epoch": 2.114444082164982, "grad_norm": 0.145602285861969, "learning_rate": 0.0001, "loss": 0.1725, "step": 1297 }, { "epoch": 2.11607433974568, "grad_norm": 0.18430952727794647, "learning_rate": 0.0001, "loss": 0.1617, "step": 1298 }, { "epoch": 2.1177045973263775, "grad_norm": 0.12870638072490692, "learning_rate": 0.0001, "loss": 0.1741, "step": 1299 }, { "epoch": 2.119334854907075, "grad_norm": 0.1242339164018631, "learning_rate": 0.0001, "loss": 0.1614, "step": 1300 }, { "epoch": 2.120965112487773, "grad_norm": 0.15219736099243164, "learning_rate": 0.0001, "loss": 0.1759, "step": 1301 }, { "epoch": 2.122595370068471, "grad_norm": 0.14123129844665527, "learning_rate": 0.0001, "loss": 0.1637, "step": 1302 }, { "epoch": 2.1242256276491687, "grad_norm": 0.11224009841680527, "learning_rate": 0.0001, "loss": 0.1586, "step": 1303 }, { "epoch": 2.1258558852298663, "grad_norm": 0.1388261467218399, "learning_rate": 0.0001, "loss": 0.1607, "step": 1304 }, { "epoch": 2.127486142810564, "grad_norm": 0.13148367404937744, "learning_rate": 0.0001, "loss": 0.1894, "step": 1305 }, { "epoch": 2.1291164003912617, "grad_norm": 0.1266559213399887, "learning_rate": 0.0001, "loss": 0.1582, "step": 1306 }, { "epoch": 2.1307466579719594, "grad_norm": 0.15286579728126526, "learning_rate": 0.0001, "loss": 0.1852, "step": 1307 }, { "epoch": 2.1323769155526575, "grad_norm": 0.1486169546842575, "learning_rate": 0.0001, "loss": 0.1673, "step": 1308 }, { "epoch": 2.134007173133355, "grad_norm": 0.1324825882911682, "learning_rate": 0.0001, "loss": 0.1745, "step": 1309 }, { "epoch": 2.135637430714053, "grad_norm": 0.12974712252616882, "learning_rate": 0.0001, "loss": 0.1786, "step": 1310 }, { "epoch": 2.1372676882947506, "grad_norm": 0.17382071912288666, "learning_rate": 0.0001, "loss": 0.1659, "step": 1311 }, { "epoch": 2.1388979458754482, "grad_norm": 0.14870017766952515, "learning_rate": 0.0001, "loss": 0.1827, "step": 1312 }, { "epoch": 2.140528203456146, "grad_norm": 0.1424129158258438, "learning_rate": 0.0001, "loss": 0.1813, "step": 1313 }, { "epoch": 2.1421584610368436, "grad_norm": 0.15458005666732788, "learning_rate": 0.0001, "loss": 0.1705, "step": 1314 }, { "epoch": 2.1437887186175417, "grad_norm": 0.1291740983724594, "learning_rate": 0.0001, "loss": 0.1563, "step": 1315 }, { "epoch": 2.1454189761982394, "grad_norm": 0.14298751950263977, "learning_rate": 0.0001, "loss": 0.176, "step": 1316 }, { "epoch": 2.147049233778937, "grad_norm": 0.1237107664346695, "learning_rate": 0.0001, "loss": 0.1636, "step": 1317 }, { "epoch": 2.1486794913596348, "grad_norm": 0.11446285992860794, "learning_rate": 0.0001, "loss": 0.1658, "step": 1318 }, { "epoch": 2.1503097489403324, "grad_norm": 0.11239829659461975, "learning_rate": 0.0001, "loss": 0.1606, "step": 1319 }, { "epoch": 2.15194000652103, "grad_norm": 0.12736567854881287, "learning_rate": 0.0001, "loss": 0.1778, "step": 1320 }, { "epoch": 2.1535702641017282, "grad_norm": 0.13921810686588287, "learning_rate": 0.0001, "loss": 0.1882, "step": 1321 }, { "epoch": 2.155200521682426, "grad_norm": 0.12388879060745239, "learning_rate": 0.0001, "loss": 0.1824, "step": 1322 }, { "epoch": 2.1568307792631236, "grad_norm": 0.1522189825773239, "learning_rate": 0.0001, "loss": 0.1556, "step": 1323 }, { "epoch": 2.1584610368438213, "grad_norm": 0.13830043375492096, "learning_rate": 0.0001, "loss": 0.1706, "step": 1324 }, { "epoch": 2.160091294424519, "grad_norm": 0.14144088327884674, "learning_rate": 0.0001, "loss": 0.1699, "step": 1325 }, { "epoch": 2.1617215520052167, "grad_norm": 0.14037571847438812, "learning_rate": 0.0001, "loss": 0.1673, "step": 1326 }, { "epoch": 2.1633518095859148, "grad_norm": 0.16178935766220093, "learning_rate": 0.0001, "loss": 0.1881, "step": 1327 }, { "epoch": 2.1649820671666125, "grad_norm": 0.12526622414588928, "learning_rate": 0.0001, "loss": 0.1675, "step": 1328 }, { "epoch": 2.16661232474731, "grad_norm": 0.1448407918214798, "learning_rate": 0.0001, "loss": 0.1731, "step": 1329 }, { "epoch": 2.168242582328008, "grad_norm": 0.15621818602085114, "learning_rate": 0.0001, "loss": 0.1764, "step": 1330 }, { "epoch": 2.1698728399087055, "grad_norm": 0.14016954600811005, "learning_rate": 0.0001, "loss": 0.1752, "step": 1331 }, { "epoch": 2.171503097489403, "grad_norm": 0.15519127249717712, "learning_rate": 0.0001, "loss": 0.1701, "step": 1332 }, { "epoch": 2.1731333550701013, "grad_norm": 0.12022172659635544, "learning_rate": 0.0001, "loss": 0.1639, "step": 1333 }, { "epoch": 2.174763612650799, "grad_norm": 0.1351272314786911, "learning_rate": 0.0001, "loss": 0.1653, "step": 1334 }, { "epoch": 2.1763938702314967, "grad_norm": 0.13195279240608215, "learning_rate": 0.0001, "loss": 0.1735, "step": 1335 }, { "epoch": 2.1780241278121943, "grad_norm": 0.14435507357120514, "learning_rate": 0.0001, "loss": 0.1801, "step": 1336 }, { "epoch": 2.179654385392892, "grad_norm": 0.13603608310222626, "learning_rate": 0.0001, "loss": 0.1643, "step": 1337 }, { "epoch": 2.1812846429735897, "grad_norm": 0.14588388800621033, "learning_rate": 0.0001, "loss": 0.1711, "step": 1338 }, { "epoch": 2.1829149005542874, "grad_norm": 0.13602250814437866, "learning_rate": 0.0001, "loss": 0.1854, "step": 1339 }, { "epoch": 2.1845451581349855, "grad_norm": 0.13646137714385986, "learning_rate": 0.0001, "loss": 0.1881, "step": 1340 }, { "epoch": 2.186175415715683, "grad_norm": 0.13280268013477325, "learning_rate": 0.0001, "loss": 0.1708, "step": 1341 }, { "epoch": 2.187805673296381, "grad_norm": 0.13370732963085175, "learning_rate": 0.0001, "loss": 0.1667, "step": 1342 }, { "epoch": 2.1894359308770786, "grad_norm": 0.14326944947242737, "learning_rate": 0.0001, "loss": 0.1686, "step": 1343 }, { "epoch": 2.1910661884577762, "grad_norm": 0.12916752696037292, "learning_rate": 0.0001, "loss": 0.1779, "step": 1344 }, { "epoch": 2.192696446038474, "grad_norm": 0.14404352009296417, "learning_rate": 0.0001, "loss": 0.171, "step": 1345 }, { "epoch": 2.194326703619172, "grad_norm": 0.12311873584985733, "learning_rate": 0.0001, "loss": 0.1735, "step": 1346 }, { "epoch": 2.1959569611998697, "grad_norm": 0.15599961578845978, "learning_rate": 0.0001, "loss": 0.1748, "step": 1347 }, { "epoch": 2.1975872187805674, "grad_norm": 0.12893660366535187, "learning_rate": 0.0001, "loss": 0.1829, "step": 1348 }, { "epoch": 2.199217476361265, "grad_norm": 0.12859125435352325, "learning_rate": 0.0001, "loss": 0.1797, "step": 1349 }, { "epoch": 2.2008477339419628, "grad_norm": 0.1159624233841896, "learning_rate": 0.0001, "loss": 0.1658, "step": 1350 }, { "epoch": 2.2024779915226604, "grad_norm": 0.13532379269599915, "learning_rate": 0.0001, "loss": 0.1973, "step": 1351 }, { "epoch": 2.204108249103358, "grad_norm": 0.142998605966568, "learning_rate": 0.0001, "loss": 0.1763, "step": 1352 }, { "epoch": 2.2057385066840562, "grad_norm": 0.15357817709445953, "learning_rate": 0.0001, "loss": 0.1733, "step": 1353 }, { "epoch": 2.207368764264754, "grad_norm": 0.133555606007576, "learning_rate": 0.0001, "loss": 0.177, "step": 1354 }, { "epoch": 2.2089990218454516, "grad_norm": 0.12514221668243408, "learning_rate": 0.0001, "loss": 0.1706, "step": 1355 }, { "epoch": 2.2106292794261493, "grad_norm": 0.12143199145793915, "learning_rate": 0.0001, "loss": 0.1728, "step": 1356 }, { "epoch": 2.212259537006847, "grad_norm": 0.12493051588535309, "learning_rate": 0.0001, "loss": 0.1735, "step": 1357 }, { "epoch": 2.2138897945875446, "grad_norm": 0.12735562026500702, "learning_rate": 0.0001, "loss": 0.1692, "step": 1358 }, { "epoch": 2.2155200521682428, "grad_norm": 0.15920554101467133, "learning_rate": 0.0001, "loss": 0.1946, "step": 1359 }, { "epoch": 2.2171503097489405, "grad_norm": 0.12582500278949738, "learning_rate": 0.0001, "loss": 0.1585, "step": 1360 }, { "epoch": 2.218780567329638, "grad_norm": 0.16344155371189117, "learning_rate": 0.0001, "loss": 0.1666, "step": 1361 }, { "epoch": 2.220410824910336, "grad_norm": 0.12184584140777588, "learning_rate": 0.0001, "loss": 0.1804, "step": 1362 }, { "epoch": 2.2220410824910335, "grad_norm": 0.16029442846775055, "learning_rate": 0.0001, "loss": 0.1911, "step": 1363 }, { "epoch": 2.223671340071731, "grad_norm": 0.12164244800806046, "learning_rate": 0.0001, "loss": 0.1907, "step": 1364 }, { "epoch": 2.225301597652429, "grad_norm": 0.15586431324481964, "learning_rate": 0.0001, "loss": 0.1762, "step": 1365 }, { "epoch": 2.226931855233127, "grad_norm": 0.1489764153957367, "learning_rate": 0.0001, "loss": 0.1857, "step": 1366 }, { "epoch": 2.2285621128138247, "grad_norm": 0.153330996632576, "learning_rate": 0.0001, "loss": 0.1756, "step": 1367 }, { "epoch": 2.2301923703945223, "grad_norm": 0.11781416088342667, "learning_rate": 0.0001, "loss": 0.1556, "step": 1368 }, { "epoch": 2.23182262797522, "grad_norm": 0.1183483824133873, "learning_rate": 0.0001, "loss": 0.1564, "step": 1369 }, { "epoch": 2.2334528855559177, "grad_norm": 0.12061773240566254, "learning_rate": 0.0001, "loss": 0.1717, "step": 1370 }, { "epoch": 2.2350831431366154, "grad_norm": 0.151438370347023, "learning_rate": 0.0001, "loss": 0.1794, "step": 1371 }, { "epoch": 2.2367134007173135, "grad_norm": 0.12716467678546906, "learning_rate": 0.0001, "loss": 0.1778, "step": 1372 }, { "epoch": 2.238343658298011, "grad_norm": 0.13353648781776428, "learning_rate": 0.0001, "loss": 0.181, "step": 1373 }, { "epoch": 2.239973915878709, "grad_norm": 0.12627844512462616, "learning_rate": 0.0001, "loss": 0.1714, "step": 1374 }, { "epoch": 2.2416041734594065, "grad_norm": 0.14614202082157135, "learning_rate": 0.0001, "loss": 0.1751, "step": 1375 }, { "epoch": 2.2432344310401042, "grad_norm": 0.12322308868169785, "learning_rate": 0.0001, "loss": 0.1688, "step": 1376 }, { "epoch": 2.244864688620802, "grad_norm": 0.1395500749349594, "learning_rate": 0.0001, "loss": 0.1777, "step": 1377 }, { "epoch": 2.2464949462015, "grad_norm": 0.12420187145471573, "learning_rate": 0.0001, "loss": 0.1617, "step": 1378 }, { "epoch": 2.2481252037821977, "grad_norm": 0.1447030007839203, "learning_rate": 0.0001, "loss": 0.1794, "step": 1379 }, { "epoch": 2.2497554613628954, "grad_norm": 0.1566443145275116, "learning_rate": 0.0001, "loss": 0.1863, "step": 1380 }, { "epoch": 2.251385718943593, "grad_norm": 0.1401456892490387, "learning_rate": 0.0001, "loss": 0.1822, "step": 1381 }, { "epoch": 2.2530159765242908, "grad_norm": 0.13101978600025177, "learning_rate": 0.0001, "loss": 0.1672, "step": 1382 }, { "epoch": 2.2546462341049884, "grad_norm": 0.12039732187986374, "learning_rate": 0.0001, "loss": 0.1679, "step": 1383 }, { "epoch": 2.2562764916856866, "grad_norm": 0.12393520027399063, "learning_rate": 0.0001, "loss": 0.1728, "step": 1384 }, { "epoch": 2.2579067492663842, "grad_norm": 0.16083435714244843, "learning_rate": 0.0001, "loss": 0.1709, "step": 1385 }, { "epoch": 2.259537006847082, "grad_norm": 0.13336148858070374, "learning_rate": 0.0001, "loss": 0.1675, "step": 1386 }, { "epoch": 2.2611672644277796, "grad_norm": 0.14768260717391968, "learning_rate": 0.0001, "loss": 0.1747, "step": 1387 }, { "epoch": 2.2627975220084773, "grad_norm": 0.13280175626277924, "learning_rate": 0.0001, "loss": 0.1616, "step": 1388 }, { "epoch": 2.264427779589175, "grad_norm": 0.12707430124282837, "learning_rate": 0.0001, "loss": 0.1687, "step": 1389 }, { "epoch": 2.2660580371698726, "grad_norm": 0.14237570762634277, "learning_rate": 0.0001, "loss": 0.1598, "step": 1390 }, { "epoch": 2.2676882947505708, "grad_norm": 0.1464899331331253, "learning_rate": 0.0001, "loss": 0.1795, "step": 1391 }, { "epoch": 2.2693185523312684, "grad_norm": 0.15607094764709473, "learning_rate": 0.0001, "loss": 0.1775, "step": 1392 }, { "epoch": 2.270948809911966, "grad_norm": 0.1322450190782547, "learning_rate": 0.0001, "loss": 0.1729, "step": 1393 }, { "epoch": 2.272579067492664, "grad_norm": 0.17985309660434723, "learning_rate": 0.0001, "loss": 0.1649, "step": 1394 }, { "epoch": 2.2742093250733615, "grad_norm": 0.1275913417339325, "learning_rate": 0.0001, "loss": 0.184, "step": 1395 }, { "epoch": 2.275839582654059, "grad_norm": 0.1359567791223526, "learning_rate": 0.0001, "loss": 0.1807, "step": 1396 }, { "epoch": 2.2774698402347573, "grad_norm": 0.13110549747943878, "learning_rate": 0.0001, "loss": 0.1736, "step": 1397 }, { "epoch": 2.279100097815455, "grad_norm": 0.13743306696414948, "learning_rate": 0.0001, "loss": 0.1657, "step": 1398 }, { "epoch": 2.2807303553961527, "grad_norm": 0.15267515182495117, "learning_rate": 0.0001, "loss": 0.1674, "step": 1399 }, { "epoch": 2.2823606129768503, "grad_norm": 0.12119441479444504, "learning_rate": 0.0001, "loss": 0.1627, "step": 1400 }, { "epoch": 2.283990870557548, "grad_norm": 0.1421733945608139, "learning_rate": 0.0001, "loss": 0.161, "step": 1401 }, { "epoch": 2.2856211281382457, "grad_norm": 0.1504882127046585, "learning_rate": 0.0001, "loss": 0.1869, "step": 1402 }, { "epoch": 2.2872513857189434, "grad_norm": 0.15724492073059082, "learning_rate": 0.0001, "loss": 0.1826, "step": 1403 }, { "epoch": 2.2888816432996415, "grad_norm": 0.1295897364616394, "learning_rate": 0.0001, "loss": 0.1697, "step": 1404 }, { "epoch": 2.290511900880339, "grad_norm": 0.13183419406414032, "learning_rate": 0.0001, "loss": 0.1675, "step": 1405 }, { "epoch": 2.292142158461037, "grad_norm": 0.14537960290908813, "learning_rate": 0.0001, "loss": 0.1912, "step": 1406 }, { "epoch": 2.2937724160417345, "grad_norm": 0.1461685299873352, "learning_rate": 0.0001, "loss": 0.1815, "step": 1407 }, { "epoch": 2.295402673622432, "grad_norm": 0.13184034824371338, "learning_rate": 0.0001, "loss": 0.1764, "step": 1408 }, { "epoch": 2.29703293120313, "grad_norm": 0.15598301589488983, "learning_rate": 0.0001, "loss": 0.1688, "step": 1409 }, { "epoch": 2.298663188783828, "grad_norm": 0.17276427149772644, "learning_rate": 0.0001, "loss": 0.1794, "step": 1410 }, { "epoch": 2.3002934463645257, "grad_norm": 0.12554508447647095, "learning_rate": 0.0001, "loss": 0.1788, "step": 1411 }, { "epoch": 2.3019237039452234, "grad_norm": 0.12981827557086945, "learning_rate": 0.0001, "loss": 0.1658, "step": 1412 }, { "epoch": 2.303553961525921, "grad_norm": 0.10513079911470413, "learning_rate": 0.0001, "loss": 0.1587, "step": 1413 }, { "epoch": 2.3051842191066187, "grad_norm": 0.12070135772228241, "learning_rate": 0.0001, "loss": 0.1652, "step": 1414 }, { "epoch": 2.3068144766873164, "grad_norm": 0.11308653652667999, "learning_rate": 0.0001, "loss": 0.1609, "step": 1415 }, { "epoch": 2.308444734268014, "grad_norm": 0.1290631741285324, "learning_rate": 0.0001, "loss": 0.179, "step": 1416 }, { "epoch": 2.3100749918487122, "grad_norm": 0.15130403637886047, "learning_rate": 0.0001, "loss": 0.192, "step": 1417 }, { "epoch": 2.31170524942941, "grad_norm": 0.12521226704120636, "learning_rate": 0.0001, "loss": 0.1743, "step": 1418 }, { "epoch": 2.3133355070101076, "grad_norm": 0.15238836407661438, "learning_rate": 0.0001, "loss": 0.1806, "step": 1419 }, { "epoch": 2.3149657645908053, "grad_norm": 0.15979474782943726, "learning_rate": 0.0001, "loss": 0.1987, "step": 1420 }, { "epoch": 2.316596022171503, "grad_norm": 0.1352485716342926, "learning_rate": 0.0001, "loss": 0.1868, "step": 1421 }, { "epoch": 2.318226279752201, "grad_norm": 0.12670114636421204, "learning_rate": 0.0001, "loss": 0.1807, "step": 1422 }, { "epoch": 2.3198565373328988, "grad_norm": 0.13426585495471954, "learning_rate": 0.0001, "loss": 0.1739, "step": 1423 }, { "epoch": 2.3214867949135964, "grad_norm": 0.1362699568271637, "learning_rate": 0.0001, "loss": 0.1839, "step": 1424 }, { "epoch": 2.323117052494294, "grad_norm": 0.12018754333257675, "learning_rate": 0.0001, "loss": 0.1607, "step": 1425 }, { "epoch": 2.324747310074992, "grad_norm": 0.13709640502929688, "learning_rate": 0.0001, "loss": 0.1784, "step": 1426 }, { "epoch": 2.3263775676556895, "grad_norm": 0.12608668208122253, "learning_rate": 0.0001, "loss": 0.1727, "step": 1427 }, { "epoch": 2.328007825236387, "grad_norm": 0.13371500372886658, "learning_rate": 0.0001, "loss": 0.1669, "step": 1428 }, { "epoch": 2.3296380828170853, "grad_norm": 0.13639819622039795, "learning_rate": 0.0001, "loss": 0.1755, "step": 1429 }, { "epoch": 2.331268340397783, "grad_norm": 0.13079668581485748, "learning_rate": 0.0001, "loss": 0.1843, "step": 1430 }, { "epoch": 2.3328985979784806, "grad_norm": 0.11506593972444534, "learning_rate": 0.0001, "loss": 0.1596, "step": 1431 }, { "epoch": 2.3345288555591783, "grad_norm": 0.15789979696273804, "learning_rate": 0.0001, "loss": 0.1814, "step": 1432 }, { "epoch": 2.336159113139876, "grad_norm": 0.1199864000082016, "learning_rate": 0.0001, "loss": 0.1635, "step": 1433 }, { "epoch": 2.3377893707205737, "grad_norm": 0.14094781875610352, "learning_rate": 0.0001, "loss": 0.184, "step": 1434 }, { "epoch": 2.339419628301272, "grad_norm": 0.13015292584896088, "learning_rate": 0.0001, "loss": 0.1643, "step": 1435 }, { "epoch": 2.3410498858819695, "grad_norm": 0.13332228362560272, "learning_rate": 0.0001, "loss": 0.1596, "step": 1436 }, { "epoch": 2.342680143462667, "grad_norm": 0.11775743216276169, "learning_rate": 0.0001, "loss": 0.1783, "step": 1437 }, { "epoch": 2.344310401043365, "grad_norm": 0.15774385631084442, "learning_rate": 0.0001, "loss": 0.188, "step": 1438 }, { "epoch": 2.3459406586240625, "grad_norm": 0.12837542593479156, "learning_rate": 0.0001, "loss": 0.1666, "step": 1439 }, { "epoch": 2.34757091620476, "grad_norm": 0.14266343414783478, "learning_rate": 0.0001, "loss": 0.173, "step": 1440 }, { "epoch": 2.349201173785458, "grad_norm": 0.14660033583641052, "learning_rate": 0.0001, "loss": 0.1724, "step": 1441 }, { "epoch": 2.350831431366156, "grad_norm": 0.12945188581943512, "learning_rate": 0.0001, "loss": 0.1567, "step": 1442 }, { "epoch": 2.3524616889468537, "grad_norm": 0.13205265998840332, "learning_rate": 0.0001, "loss": 0.1634, "step": 1443 }, { "epoch": 2.3540919465275514, "grad_norm": 0.11146970093250275, "learning_rate": 0.0001, "loss": 0.1698, "step": 1444 }, { "epoch": 2.355722204108249, "grad_norm": 0.1483961045742035, "learning_rate": 0.0001, "loss": 0.1705, "step": 1445 }, { "epoch": 2.3573524616889467, "grad_norm": 0.1443917602300644, "learning_rate": 0.0001, "loss": 0.1881, "step": 1446 }, { "epoch": 2.3589827192696444, "grad_norm": 0.11366801708936691, "learning_rate": 0.0001, "loss": 0.1673, "step": 1447 }, { "epoch": 2.3606129768503425, "grad_norm": 0.12853001058101654, "learning_rate": 0.0001, "loss": 0.1682, "step": 1448 }, { "epoch": 2.3622432344310402, "grad_norm": 0.11675182729959488, "learning_rate": 0.0001, "loss": 0.1607, "step": 1449 }, { "epoch": 2.363873492011738, "grad_norm": 0.13047528266906738, "learning_rate": 0.0001, "loss": 0.1585, "step": 1450 }, { "epoch": 2.3655037495924356, "grad_norm": 0.13428814709186554, "learning_rate": 0.0001, "loss": 0.1742, "step": 1451 }, { "epoch": 2.3671340071731333, "grad_norm": 0.14489330351352692, "learning_rate": 0.0001, "loss": 0.1624, "step": 1452 }, { "epoch": 2.368764264753831, "grad_norm": 0.126490980386734, "learning_rate": 0.0001, "loss": 0.1825, "step": 1453 }, { "epoch": 2.3703945223345286, "grad_norm": 0.13337522745132446, "learning_rate": 0.0001, "loss": 0.1653, "step": 1454 }, { "epoch": 2.3720247799152268, "grad_norm": 0.13690900802612305, "learning_rate": 0.0001, "loss": 0.1849, "step": 1455 }, { "epoch": 2.3736550374959244, "grad_norm": 0.12736913561820984, "learning_rate": 0.0001, "loss": 0.175, "step": 1456 }, { "epoch": 2.375285295076622, "grad_norm": 0.14143235981464386, "learning_rate": 0.0001, "loss": 0.181, "step": 1457 }, { "epoch": 2.37691555265732, "grad_norm": 0.12882289290428162, "learning_rate": 0.0001, "loss": 0.1672, "step": 1458 }, { "epoch": 2.3785458102380175, "grad_norm": 0.12184431403875351, "learning_rate": 0.0001, "loss": 0.1691, "step": 1459 }, { "epoch": 2.3801760678187156, "grad_norm": 0.12170469015836716, "learning_rate": 0.0001, "loss": 0.1721, "step": 1460 }, { "epoch": 2.3818063253994133, "grad_norm": 0.13725103437900543, "learning_rate": 0.0001, "loss": 0.1846, "step": 1461 }, { "epoch": 2.383436582980111, "grad_norm": 0.137665793299675, "learning_rate": 0.0001, "loss": 0.1675, "step": 1462 }, { "epoch": 2.3850668405608086, "grad_norm": 0.11704001575708389, "learning_rate": 0.0001, "loss": 0.1695, "step": 1463 }, { "epoch": 2.3866970981415063, "grad_norm": 0.11113730818033218, "learning_rate": 0.0001, "loss": 0.1621, "step": 1464 }, { "epoch": 2.388327355722204, "grad_norm": 0.1406833976507187, "learning_rate": 0.0001, "loss": 0.179, "step": 1465 }, { "epoch": 2.3899576133029017, "grad_norm": 0.12707440555095673, "learning_rate": 0.0001, "loss": 0.1697, "step": 1466 }, { "epoch": 2.3915878708835994, "grad_norm": 0.14010672271251678, "learning_rate": 0.0001, "loss": 0.1871, "step": 1467 }, { "epoch": 2.3932181284642975, "grad_norm": 0.12332647293806076, "learning_rate": 0.0001, "loss": 0.1621, "step": 1468 }, { "epoch": 2.394848386044995, "grad_norm": 0.1202240139245987, "learning_rate": 0.0001, "loss": 0.1718, "step": 1469 }, { "epoch": 2.396478643625693, "grad_norm": 0.11759907752275467, "learning_rate": 0.0001, "loss": 0.1584, "step": 1470 }, { "epoch": 2.3981089012063905, "grad_norm": 0.12136384099721909, "learning_rate": 0.0001, "loss": 0.1681, "step": 1471 }, { "epoch": 2.399739158787088, "grad_norm": 0.1522362232208252, "learning_rate": 0.0001, "loss": 0.1706, "step": 1472 }, { "epoch": 2.4013694163677863, "grad_norm": 0.13118572533130646, "learning_rate": 0.0001, "loss": 0.1694, "step": 1473 }, { "epoch": 2.402999673948484, "grad_norm": 0.11596754938364029, "learning_rate": 0.0001, "loss": 0.1693, "step": 1474 }, { "epoch": 2.4046299315291817, "grad_norm": 0.13307926058769226, "learning_rate": 0.0001, "loss": 0.1749, "step": 1475 }, { "epoch": 2.4062601891098794, "grad_norm": 0.13675716519355774, "learning_rate": 0.0001, "loss": 0.1717, "step": 1476 }, { "epoch": 2.407890446690577, "grad_norm": 0.17052991688251495, "learning_rate": 0.0001, "loss": 0.1788, "step": 1477 }, { "epoch": 2.4095207042712747, "grad_norm": 0.13308259844779968, "learning_rate": 0.0001, "loss": 0.165, "step": 1478 }, { "epoch": 2.4111509618519724, "grad_norm": 0.13663795590400696, "learning_rate": 0.0001, "loss": 0.177, "step": 1479 }, { "epoch": 2.4127812194326705, "grad_norm": 0.1282254457473755, "learning_rate": 0.0001, "loss": 0.1809, "step": 1480 }, { "epoch": 2.4144114770133682, "grad_norm": 0.14260698854923248, "learning_rate": 0.0001, "loss": 0.1635, "step": 1481 }, { "epoch": 2.416041734594066, "grad_norm": 0.12570828199386597, "learning_rate": 0.0001, "loss": 0.1719, "step": 1482 }, { "epoch": 2.4176719921747636, "grad_norm": 0.13560189306735992, "learning_rate": 0.0001, "loss": 0.1699, "step": 1483 }, { "epoch": 2.4193022497554613, "grad_norm": 0.15842407941818237, "learning_rate": 0.0001, "loss": 0.1759, "step": 1484 }, { "epoch": 2.420932507336159, "grad_norm": 0.13507573306560516, "learning_rate": 0.0001, "loss": 0.1751, "step": 1485 }, { "epoch": 2.422562764916857, "grad_norm": 0.13392452895641327, "learning_rate": 0.0001, "loss": 0.1855, "step": 1486 }, { "epoch": 2.4241930224975548, "grad_norm": 0.14085431396961212, "learning_rate": 0.0001, "loss": 0.1565, "step": 1487 }, { "epoch": 2.4258232800782524, "grad_norm": 0.11966408044099808, "learning_rate": 0.0001, "loss": 0.1696, "step": 1488 }, { "epoch": 2.42745353765895, "grad_norm": 0.14620652794837952, "learning_rate": 0.0001, "loss": 0.179, "step": 1489 }, { "epoch": 2.429083795239648, "grad_norm": 0.15509895980358124, "learning_rate": 0.0001, "loss": 0.1641, "step": 1490 }, { "epoch": 2.4307140528203455, "grad_norm": 0.1289438009262085, "learning_rate": 0.0001, "loss": 0.1754, "step": 1491 }, { "epoch": 2.432344310401043, "grad_norm": 0.1589927077293396, "learning_rate": 0.0001, "loss": 0.1778, "step": 1492 }, { "epoch": 2.4339745679817413, "grad_norm": 0.13043618202209473, "learning_rate": 0.0001, "loss": 0.1866, "step": 1493 }, { "epoch": 2.435604825562439, "grad_norm": 0.12837867438793182, "learning_rate": 0.0001, "loss": 0.1644, "step": 1494 }, { "epoch": 2.4372350831431366, "grad_norm": 0.13602042198181152, "learning_rate": 0.0001, "loss": 0.18, "step": 1495 }, { "epoch": 2.4388653407238343, "grad_norm": 0.12518282234668732, "learning_rate": 0.0001, "loss": 0.1792, "step": 1496 }, { "epoch": 2.440495598304532, "grad_norm": 0.1458752602338791, "learning_rate": 0.0001, "loss": 0.174, "step": 1497 }, { "epoch": 2.4421258558852297, "grad_norm": 0.16408641636371613, "learning_rate": 0.0001, "loss": 0.1638, "step": 1498 }, { "epoch": 2.443756113465928, "grad_norm": 0.1250162422657013, "learning_rate": 0.0001, "loss": 0.1799, "step": 1499 }, { "epoch": 2.4453863710466255, "grad_norm": 0.14285162091255188, "learning_rate": 0.0001, "loss": 0.183, "step": 1500 }, { "epoch": 2.4453863710466255, "eval_loss": 0.19733810424804688, "eval_runtime": 2885.0678, "eval_samples_per_second": 0.654, "eval_steps_per_second": 0.164, "step": 1500 }, { "epoch": 2.447016628627323, "grad_norm": 0.1123531311750412, "learning_rate": 0.0001, "loss": 0.1653, "step": 1501 }, { "epoch": 2.448646886208021, "grad_norm": 0.1342422366142273, "learning_rate": 0.0001, "loss": 0.1652, "step": 1502 }, { "epoch": 2.4502771437887185, "grad_norm": 0.13005055487155914, "learning_rate": 0.0001, "loss": 0.1513, "step": 1503 }, { "epoch": 2.451907401369416, "grad_norm": 0.1291249841451645, "learning_rate": 0.0001, "loss": 0.1658, "step": 1504 }, { "epoch": 2.453537658950114, "grad_norm": 0.15009792149066925, "learning_rate": 0.0001, "loss": 0.1582, "step": 1505 }, { "epoch": 2.455167916530812, "grad_norm": 0.13734357059001923, "learning_rate": 0.0001, "loss": 0.1729, "step": 1506 }, { "epoch": 2.4567981741115097, "grad_norm": 0.14626893401145935, "learning_rate": 0.0001, "loss": 0.1753, "step": 1507 }, { "epoch": 2.4584284316922074, "grad_norm": 0.13441750407218933, "learning_rate": 0.0001, "loss": 0.168, "step": 1508 }, { "epoch": 2.460058689272905, "grad_norm": 0.17883312702178955, "learning_rate": 0.0001, "loss": 0.1724, "step": 1509 }, { "epoch": 2.4616889468536027, "grad_norm": 0.13286323845386505, "learning_rate": 0.0001, "loss": 0.1707, "step": 1510 }, { "epoch": 2.463319204434301, "grad_norm": 0.11390995234251022, "learning_rate": 0.0001, "loss": 0.152, "step": 1511 }, { "epoch": 2.4649494620149985, "grad_norm": 0.13794218003749847, "learning_rate": 0.0001, "loss": 0.1693, "step": 1512 }, { "epoch": 2.466579719595696, "grad_norm": 0.13029181957244873, "learning_rate": 0.0001, "loss": 0.1885, "step": 1513 }, { "epoch": 2.468209977176394, "grad_norm": 0.1590529978275299, "learning_rate": 0.0001, "loss": 0.1656, "step": 1514 }, { "epoch": 2.4698402347570916, "grad_norm": 0.129343181848526, "learning_rate": 0.0001, "loss": 0.1645, "step": 1515 }, { "epoch": 2.4714704923377893, "grad_norm": 0.13485710322856903, "learning_rate": 0.0001, "loss": 0.178, "step": 1516 }, { "epoch": 2.473100749918487, "grad_norm": 0.13252151012420654, "learning_rate": 0.0001, "loss": 0.1807, "step": 1517 }, { "epoch": 2.474731007499185, "grad_norm": 0.14138251543045044, "learning_rate": 0.0001, "loss": 0.1807, "step": 1518 }, { "epoch": 2.4763612650798827, "grad_norm": 0.12066026031970978, "learning_rate": 0.0001, "loss": 0.1579, "step": 1519 }, { "epoch": 2.4779915226605804, "grad_norm": 0.128167986869812, "learning_rate": 0.0001, "loss": 0.1765, "step": 1520 }, { "epoch": 2.479621780241278, "grad_norm": 0.11725836247205734, "learning_rate": 0.0001, "loss": 0.1584, "step": 1521 }, { "epoch": 2.481252037821976, "grad_norm": 0.12235096096992493, "learning_rate": 0.0001, "loss": 0.1752, "step": 1522 }, { "epoch": 2.4828822954026735, "grad_norm": 0.13664376735687256, "learning_rate": 0.0001, "loss": 0.1722, "step": 1523 }, { "epoch": 2.4845125529833716, "grad_norm": 0.12066584080457687, "learning_rate": 0.0001, "loss": 0.1805, "step": 1524 }, { "epoch": 2.4861428105640693, "grad_norm": 0.15553556382656097, "learning_rate": 0.0001, "loss": 0.1763, "step": 1525 }, { "epoch": 2.487773068144767, "grad_norm": 0.11850383877754211, "learning_rate": 0.0001, "loss": 0.1758, "step": 1526 }, { "epoch": 2.4894033257254646, "grad_norm": 0.1156458929181099, "learning_rate": 0.0001, "loss": 0.1723, "step": 1527 }, { "epoch": 2.4910335833061623, "grad_norm": 0.14494523406028748, "learning_rate": 0.0001, "loss": 0.1721, "step": 1528 }, { "epoch": 2.49266384088686, "grad_norm": 0.19197629392147064, "learning_rate": 0.0001, "loss": 0.1844, "step": 1529 }, { "epoch": 2.4942940984675577, "grad_norm": 0.13929398357868195, "learning_rate": 0.0001, "loss": 0.1605, "step": 1530 }, { "epoch": 2.495924356048256, "grad_norm": 0.14377978444099426, "learning_rate": 0.0001, "loss": 0.1757, "step": 1531 }, { "epoch": 2.4975546136289535, "grad_norm": 0.1260727494955063, "learning_rate": 0.0001, "loss": 0.1584, "step": 1532 }, { "epoch": 2.499184871209651, "grad_norm": 0.13934855163097382, "learning_rate": 0.0001, "loss": 0.1906, "step": 1533 }, { "epoch": 2.500815128790349, "grad_norm": 0.1342552751302719, "learning_rate": 0.0001, "loss": 0.1642, "step": 1534 }, { "epoch": 2.5024453863710465, "grad_norm": 0.14849533140659332, "learning_rate": 0.0001, "loss": 0.1774, "step": 1535 }, { "epoch": 2.5040756439517446, "grad_norm": 0.12595170736312866, "learning_rate": 0.0001, "loss": 0.1757, "step": 1536 }, { "epoch": 2.5057059015324423, "grad_norm": 0.15015050768852234, "learning_rate": 0.0001, "loss": 0.1598, "step": 1537 }, { "epoch": 2.50733615911314, "grad_norm": 0.1296335756778717, "learning_rate": 0.0001, "loss": 0.1725, "step": 1538 }, { "epoch": 2.5089664166938377, "grad_norm": 0.12165997922420502, "learning_rate": 0.0001, "loss": 0.1647, "step": 1539 }, { "epoch": 2.5105966742745354, "grad_norm": 0.14188778400421143, "learning_rate": 0.0001, "loss": 0.1675, "step": 1540 }, { "epoch": 2.512226931855233, "grad_norm": 0.12897147238254547, "learning_rate": 0.0001, "loss": 0.1661, "step": 1541 }, { "epoch": 2.5138571894359307, "grad_norm": 0.10982251167297363, "learning_rate": 0.0001, "loss": 0.1734, "step": 1542 }, { "epoch": 2.5154874470166284, "grad_norm": 0.11350741237401962, "learning_rate": 0.0001, "loss": 0.1587, "step": 1543 }, { "epoch": 2.5171177045973265, "grad_norm": 0.13737499713897705, "learning_rate": 0.0001, "loss": 0.1791, "step": 1544 }, { "epoch": 2.518747962178024, "grad_norm": 0.17301884293556213, "learning_rate": 0.0001, "loss": 0.17, "step": 1545 }, { "epoch": 2.520378219758722, "grad_norm": 0.12698844075202942, "learning_rate": 0.0001, "loss": 0.1803, "step": 1546 }, { "epoch": 2.5220084773394196, "grad_norm": 0.1362551897764206, "learning_rate": 0.0001, "loss": 0.1666, "step": 1547 }, { "epoch": 2.5236387349201173, "grad_norm": 0.13497772812843323, "learning_rate": 0.0001, "loss": 0.1757, "step": 1548 }, { "epoch": 2.5252689925008154, "grad_norm": 0.13246221840381622, "learning_rate": 0.0001, "loss": 0.1626, "step": 1549 }, { "epoch": 2.526899250081513, "grad_norm": 0.11976587027311325, "learning_rate": 0.0001, "loss": 0.1704, "step": 1550 }, { "epoch": 2.5285295076622107, "grad_norm": 0.17471329867839813, "learning_rate": 0.0001, "loss": 0.1743, "step": 1551 }, { "epoch": 2.5301597652429084, "grad_norm": 0.1331670731306076, "learning_rate": 0.0001, "loss": 0.1565, "step": 1552 }, { "epoch": 2.531790022823606, "grad_norm": 0.12313182651996613, "learning_rate": 0.0001, "loss": 0.1508, "step": 1553 }, { "epoch": 2.533420280404304, "grad_norm": 0.14300191402435303, "learning_rate": 0.0001, "loss": 0.1825, "step": 1554 }, { "epoch": 2.5350505379850015, "grad_norm": 0.14361177384853363, "learning_rate": 0.0001, "loss": 0.1751, "step": 1555 }, { "epoch": 2.536680795565699, "grad_norm": 0.1338524967432022, "learning_rate": 0.0001, "loss": 0.1714, "step": 1556 }, { "epoch": 2.5383110531463973, "grad_norm": 0.12020072340965271, "learning_rate": 0.0001, "loss": 0.1745, "step": 1557 }, { "epoch": 2.539941310727095, "grad_norm": 0.1332116425037384, "learning_rate": 0.0001, "loss": 0.1644, "step": 1558 }, { "epoch": 2.5415715683077926, "grad_norm": 0.1357092410326004, "learning_rate": 0.0001, "loss": 0.1667, "step": 1559 }, { "epoch": 2.5432018258884903, "grad_norm": 0.12332719564437866, "learning_rate": 0.0001, "loss": 0.178, "step": 1560 }, { "epoch": 2.544832083469188, "grad_norm": 0.11925667524337769, "learning_rate": 0.0001, "loss": 0.1736, "step": 1561 }, { "epoch": 2.546462341049886, "grad_norm": 0.14202888309955597, "learning_rate": 0.0001, "loss": 0.1795, "step": 1562 }, { "epoch": 2.548092598630584, "grad_norm": 0.18294650316238403, "learning_rate": 0.0001, "loss": 0.1859, "step": 1563 }, { "epoch": 2.5497228562112815, "grad_norm": 0.14184100925922394, "learning_rate": 0.0001, "loss": 0.1628, "step": 1564 }, { "epoch": 2.551353113791979, "grad_norm": 0.1371891051530838, "learning_rate": 0.0001, "loss": 0.18, "step": 1565 }, { "epoch": 2.552983371372677, "grad_norm": 0.20214851200580597, "learning_rate": 0.0001, "loss": 0.1813, "step": 1566 }, { "epoch": 2.5546136289533745, "grad_norm": 0.12881435453891754, "learning_rate": 0.0001, "loss": 0.1657, "step": 1567 }, { "epoch": 2.556243886534072, "grad_norm": 0.1161297932267189, "learning_rate": 0.0001, "loss": 0.1641, "step": 1568 }, { "epoch": 2.55787414411477, "grad_norm": 0.13545578718185425, "learning_rate": 0.0001, "loss": 0.17, "step": 1569 }, { "epoch": 2.559504401695468, "grad_norm": 0.13674761354923248, "learning_rate": 0.0001, "loss": 0.175, "step": 1570 }, { "epoch": 2.5611346592761657, "grad_norm": 0.12067017704248428, "learning_rate": 0.0001, "loss": 0.1545, "step": 1571 }, { "epoch": 2.5627649168568634, "grad_norm": 0.13512328267097473, "learning_rate": 0.0001, "loss": 0.1706, "step": 1572 }, { "epoch": 2.564395174437561, "grad_norm": 0.14814037084579468, "learning_rate": 0.0001, "loss": 0.1824, "step": 1573 }, { "epoch": 2.5660254320182587, "grad_norm": 0.13570277392864227, "learning_rate": 0.0001, "loss": 0.18, "step": 1574 }, { "epoch": 2.567655689598957, "grad_norm": 0.14428043365478516, "learning_rate": 0.0001, "loss": 0.176, "step": 1575 }, { "epoch": 2.5692859471796545, "grad_norm": 0.11816335469484329, "learning_rate": 0.0001, "loss": 0.1658, "step": 1576 }, { "epoch": 2.570916204760352, "grad_norm": 0.12405506521463394, "learning_rate": 0.0001, "loss": 0.1704, "step": 1577 }, { "epoch": 2.57254646234105, "grad_norm": 0.12209967523813248, "learning_rate": 0.0001, "loss": 0.1683, "step": 1578 }, { "epoch": 2.5741767199217476, "grad_norm": 0.12708422541618347, "learning_rate": 0.0001, "loss": 0.1846, "step": 1579 }, { "epoch": 2.5758069775024452, "grad_norm": 0.13518854975700378, "learning_rate": 0.0001, "loss": 0.1692, "step": 1580 }, { "epoch": 2.577437235083143, "grad_norm": 0.1394702047109604, "learning_rate": 0.0001, "loss": 0.1793, "step": 1581 }, { "epoch": 2.5790674926638406, "grad_norm": 0.1345827430486679, "learning_rate": 0.0001, "loss": 0.1578, "step": 1582 }, { "epoch": 2.5806977502445387, "grad_norm": 0.1303935945034027, "learning_rate": 0.0001, "loss": 0.1754, "step": 1583 }, { "epoch": 2.5823280078252364, "grad_norm": 0.1360052078962326, "learning_rate": 0.0001, "loss": 0.1825, "step": 1584 }, { "epoch": 2.583958265405934, "grad_norm": 0.1259981095790863, "learning_rate": 0.0001, "loss": 0.1651, "step": 1585 }, { "epoch": 2.5855885229866318, "grad_norm": 0.13449037075042725, "learning_rate": 0.0001, "loss": 0.1901, "step": 1586 }, { "epoch": 2.58721878056733, "grad_norm": 0.14254209399223328, "learning_rate": 0.0001, "loss": 0.187, "step": 1587 }, { "epoch": 2.5888490381480276, "grad_norm": 0.11680305004119873, "learning_rate": 0.0001, "loss": 0.1669, "step": 1588 }, { "epoch": 2.5904792957287253, "grad_norm": 0.12670142948627472, "learning_rate": 0.0001, "loss": 0.1659, "step": 1589 }, { "epoch": 2.592109553309423, "grad_norm": 0.11500969529151917, "learning_rate": 0.0001, "loss": 0.1523, "step": 1590 }, { "epoch": 2.5937398108901206, "grad_norm": 0.15863120555877686, "learning_rate": 0.0001, "loss": 0.1731, "step": 1591 }, { "epoch": 2.5953700684708183, "grad_norm": 0.12249530106782913, "learning_rate": 0.0001, "loss": 0.1722, "step": 1592 }, { "epoch": 2.597000326051516, "grad_norm": 0.1352062076330185, "learning_rate": 0.0001, "loss": 0.1787, "step": 1593 }, { "epoch": 2.5986305836322137, "grad_norm": 0.14207401871681213, "learning_rate": 0.0001, "loss": 0.1954, "step": 1594 }, { "epoch": 2.600260841212912, "grad_norm": 0.12589991092681885, "learning_rate": 0.0001, "loss": 0.161, "step": 1595 }, { "epoch": 2.6018910987936095, "grad_norm": 0.15374112129211426, "learning_rate": 0.0001, "loss": 0.1631, "step": 1596 }, { "epoch": 2.603521356374307, "grad_norm": 0.17806057631969452, "learning_rate": 0.0001, "loss": 0.1749, "step": 1597 }, { "epoch": 2.605151613955005, "grad_norm": 0.13044816255569458, "learning_rate": 0.0001, "loss": 0.1602, "step": 1598 }, { "epoch": 2.6067818715357025, "grad_norm": 0.13261814415454865, "learning_rate": 0.0001, "loss": 0.175, "step": 1599 }, { "epoch": 2.6084121291164006, "grad_norm": 0.16431686282157898, "learning_rate": 0.0001, "loss": 0.1711, "step": 1600 }, { "epoch": 2.6100423866970983, "grad_norm": 0.1591222733259201, "learning_rate": 0.0001, "loss": 0.1782, "step": 1601 }, { "epoch": 2.611672644277796, "grad_norm": 0.15045471489429474, "learning_rate": 0.0001, "loss": 0.1576, "step": 1602 }, { "epoch": 2.6133029018584937, "grad_norm": 0.14002764225006104, "learning_rate": 0.0001, "loss": 0.1731, "step": 1603 }, { "epoch": 2.6149331594391914, "grad_norm": 0.13808737695217133, "learning_rate": 0.0001, "loss": 0.1754, "step": 1604 }, { "epoch": 2.616563417019889, "grad_norm": 0.14910684525966644, "learning_rate": 0.0001, "loss": 0.1834, "step": 1605 }, { "epoch": 2.6181936746005867, "grad_norm": 0.1458890289068222, "learning_rate": 0.0001, "loss": 0.173, "step": 1606 }, { "epoch": 2.6198239321812844, "grad_norm": 0.13817395269870758, "learning_rate": 0.0001, "loss": 0.1745, "step": 1607 }, { "epoch": 2.6214541897619825, "grad_norm": 0.22925616800785065, "learning_rate": 0.0001, "loss": 0.1693, "step": 1608 }, { "epoch": 2.62308444734268, "grad_norm": 0.13849849998950958, "learning_rate": 0.0001, "loss": 0.1734, "step": 1609 }, { "epoch": 2.624714704923378, "grad_norm": 0.12482542544603348, "learning_rate": 0.0001, "loss": 0.1864, "step": 1610 }, { "epoch": 2.6263449625040756, "grad_norm": 0.12124403566122055, "learning_rate": 0.0001, "loss": 0.1803, "step": 1611 }, { "epoch": 2.6279752200847732, "grad_norm": 0.12327743321657181, "learning_rate": 0.0001, "loss": 0.1581, "step": 1612 }, { "epoch": 2.6296054776654714, "grad_norm": 0.17278893291950226, "learning_rate": 0.0001, "loss": 0.1637, "step": 1613 }, { "epoch": 2.631235735246169, "grad_norm": 0.14170318841934204, "learning_rate": 0.0001, "loss": 0.1793, "step": 1614 }, { "epoch": 2.6328659928268667, "grad_norm": 0.1279682219028473, "learning_rate": 0.0001, "loss": 0.1657, "step": 1615 }, { "epoch": 2.6344962504075644, "grad_norm": 0.1350080519914627, "learning_rate": 0.0001, "loss": 0.1854, "step": 1616 }, { "epoch": 2.636126507988262, "grad_norm": 0.1315418928861618, "learning_rate": 0.0001, "loss": 0.1732, "step": 1617 }, { "epoch": 2.6377567655689598, "grad_norm": 0.13384385406970978, "learning_rate": 0.0001, "loss": 0.1727, "step": 1618 }, { "epoch": 2.6393870231496575, "grad_norm": 0.4788396954536438, "learning_rate": 0.0001, "loss": 0.1705, "step": 1619 }, { "epoch": 2.641017280730355, "grad_norm": 0.15386568009853363, "learning_rate": 0.0001, "loss": 0.1653, "step": 1620 }, { "epoch": 2.6426475383110533, "grad_norm": 0.12126284837722778, "learning_rate": 0.0001, "loss": 0.1645, "step": 1621 }, { "epoch": 2.644277795891751, "grad_norm": 0.11877623945474625, "learning_rate": 0.0001, "loss": 0.1656, "step": 1622 }, { "epoch": 2.6459080534724486, "grad_norm": 0.13270951807498932, "learning_rate": 0.0001, "loss": 0.1861, "step": 1623 }, { "epoch": 2.6475383110531463, "grad_norm": 0.13268467783927917, "learning_rate": 0.0001, "loss": 0.1803, "step": 1624 }, { "epoch": 2.6491685686338444, "grad_norm": 0.11460261046886444, "learning_rate": 0.0001, "loss": 0.1849, "step": 1625 }, { "epoch": 2.650798826214542, "grad_norm": 0.1381942331790924, "learning_rate": 0.0001, "loss": 0.1726, "step": 1626 }, { "epoch": 2.65242908379524, "grad_norm": 0.11194758862257004, "learning_rate": 0.0001, "loss": 0.174, "step": 1627 }, { "epoch": 2.6540593413759375, "grad_norm": 0.13582143187522888, "learning_rate": 0.0001, "loss": 0.188, "step": 1628 }, { "epoch": 2.655689598956635, "grad_norm": 0.13221515715122223, "learning_rate": 0.0001, "loss": 0.1784, "step": 1629 }, { "epoch": 2.657319856537333, "grad_norm": 0.13104896247386932, "learning_rate": 0.0001, "loss": 0.1644, "step": 1630 }, { "epoch": 2.6589501141180305, "grad_norm": 0.15535131096839905, "learning_rate": 0.0001, "loss": 0.1723, "step": 1631 }, { "epoch": 2.660580371698728, "grad_norm": 0.1257065385580063, "learning_rate": 0.0001, "loss": 0.1437, "step": 1632 }, { "epoch": 2.6622106292794263, "grad_norm": 0.12850813567638397, "learning_rate": 0.0001, "loss": 0.1735, "step": 1633 }, { "epoch": 2.663840886860124, "grad_norm": 0.12646383047103882, "learning_rate": 0.0001, "loss": 0.1771, "step": 1634 }, { "epoch": 2.6654711444408217, "grad_norm": 0.2852911651134491, "learning_rate": 0.0001, "loss": 0.1851, "step": 1635 }, { "epoch": 2.6671014020215194, "grad_norm": 0.11714810878038406, "learning_rate": 0.0001, "loss": 0.1549, "step": 1636 }, { "epoch": 2.668731659602217, "grad_norm": 0.14019465446472168, "learning_rate": 0.0001, "loss": 0.185, "step": 1637 }, { "epoch": 2.670361917182915, "grad_norm": 0.12677006423473358, "learning_rate": 0.0001, "loss": 0.1657, "step": 1638 }, { "epoch": 2.671992174763613, "grad_norm": 0.12617303431034088, "learning_rate": 0.0001, "loss": 0.1729, "step": 1639 }, { "epoch": 2.6736224323443105, "grad_norm": 0.1309848427772522, "learning_rate": 0.0001, "loss": 0.1661, "step": 1640 }, { "epoch": 2.675252689925008, "grad_norm": 0.1385383903980255, "learning_rate": 0.0001, "loss": 0.1794, "step": 1641 }, { "epoch": 2.676882947505706, "grad_norm": 0.1279730498790741, "learning_rate": 0.0001, "loss": 0.1661, "step": 1642 }, { "epoch": 2.6785132050864036, "grad_norm": 0.12799672782421112, "learning_rate": 0.0001, "loss": 0.1628, "step": 1643 }, { "epoch": 2.6801434626671012, "grad_norm": 0.16915291547775269, "learning_rate": 0.0001, "loss": 0.1707, "step": 1644 }, { "epoch": 2.681773720247799, "grad_norm": 0.12088014930486679, "learning_rate": 0.0001, "loss": 0.1671, "step": 1645 }, { "epoch": 2.683403977828497, "grad_norm": 0.13347816467285156, "learning_rate": 0.0001, "loss": 0.1715, "step": 1646 }, { "epoch": 2.6850342354091947, "grad_norm": 0.12426477670669556, "learning_rate": 0.0001, "loss": 0.1917, "step": 1647 }, { "epoch": 2.6866644929898924, "grad_norm": 0.16994018852710724, "learning_rate": 0.0001, "loss": 0.1929, "step": 1648 }, { "epoch": 2.68829475057059, "grad_norm": 0.1453407108783722, "learning_rate": 0.0001, "loss": 0.1711, "step": 1649 }, { "epoch": 2.6899250081512878, "grad_norm": 0.15362900495529175, "learning_rate": 0.0001, "loss": 0.1829, "step": 1650 }, { "epoch": 2.691555265731986, "grad_norm": 0.15922309458255768, "learning_rate": 0.0001, "loss": 0.1655, "step": 1651 }, { "epoch": 2.6931855233126836, "grad_norm": 0.153437077999115, "learning_rate": 0.0001, "loss": 0.1776, "step": 1652 }, { "epoch": 2.6948157808933813, "grad_norm": 0.15213799476623535, "learning_rate": 0.0001, "loss": 0.1835, "step": 1653 }, { "epoch": 2.696446038474079, "grad_norm": 0.10505233705043793, "learning_rate": 0.0001, "loss": 0.1501, "step": 1654 }, { "epoch": 2.6980762960547766, "grad_norm": 0.13109369575977325, "learning_rate": 0.0001, "loss": 0.1601, "step": 1655 }, { "epoch": 2.6997065536354743, "grad_norm": 0.1327008157968521, "learning_rate": 0.0001, "loss": 0.1781, "step": 1656 }, { "epoch": 2.701336811216172, "grad_norm": 0.12708407640457153, "learning_rate": 0.0001, "loss": 0.1599, "step": 1657 }, { "epoch": 2.7029670687968697, "grad_norm": 0.12450224906206131, "learning_rate": 0.0001, "loss": 0.1711, "step": 1658 }, { "epoch": 2.704597326377568, "grad_norm": 0.13624098896980286, "learning_rate": 0.0001, "loss": 0.1636, "step": 1659 }, { "epoch": 2.7062275839582655, "grad_norm": 0.1352638155221939, "learning_rate": 0.0001, "loss": 0.1732, "step": 1660 }, { "epoch": 2.707857841538963, "grad_norm": 0.1240679994225502, "learning_rate": 0.0001, "loss": 0.1715, "step": 1661 }, { "epoch": 2.709488099119661, "grad_norm": 0.131019726395607, "learning_rate": 0.0001, "loss": 0.1743, "step": 1662 }, { "epoch": 2.7111183567003585, "grad_norm": 0.11371717602014542, "learning_rate": 0.0001, "loss": 0.1653, "step": 1663 }, { "epoch": 2.7127486142810566, "grad_norm": 0.1314827799797058, "learning_rate": 0.0001, "loss": 0.1788, "step": 1664 }, { "epoch": 2.7143788718617543, "grad_norm": 0.11662380397319794, "learning_rate": 0.0001, "loss": 0.1596, "step": 1665 }, { "epoch": 2.716009129442452, "grad_norm": 0.18013271689414978, "learning_rate": 0.0001, "loss": 0.1539, "step": 1666 }, { "epoch": 2.7176393870231497, "grad_norm": 0.1361352503299713, "learning_rate": 0.0001, "loss": 0.1683, "step": 1667 }, { "epoch": 2.7192696446038473, "grad_norm": 0.12468544393777847, "learning_rate": 0.0001, "loss": 0.1651, "step": 1668 }, { "epoch": 2.720899902184545, "grad_norm": 0.14304213225841522, "learning_rate": 0.0001, "loss": 0.1766, "step": 1669 }, { "epoch": 2.7225301597652427, "grad_norm": 0.13735635578632355, "learning_rate": 0.0001, "loss": 0.1828, "step": 1670 }, { "epoch": 2.7241604173459404, "grad_norm": 0.1301272064447403, "learning_rate": 0.0001, "loss": 0.1694, "step": 1671 }, { "epoch": 2.7257906749266385, "grad_norm": 0.13478688895702362, "learning_rate": 0.0001, "loss": 0.1809, "step": 1672 }, { "epoch": 2.727420932507336, "grad_norm": 0.1462695151567459, "learning_rate": 0.0001, "loss": 0.1605, "step": 1673 }, { "epoch": 2.729051190088034, "grad_norm": 0.13804949820041656, "learning_rate": 0.0001, "loss": 0.1672, "step": 1674 }, { "epoch": 2.7306814476687316, "grad_norm": 0.12185340374708176, "learning_rate": 0.0001, "loss": 0.1665, "step": 1675 }, { "epoch": 2.7323117052494297, "grad_norm": 0.13339075446128845, "learning_rate": 0.0001, "loss": 0.1788, "step": 1676 }, { "epoch": 2.7339419628301274, "grad_norm": 0.12658098340034485, "learning_rate": 0.0001, "loss": 0.1549, "step": 1677 }, { "epoch": 2.735572220410825, "grad_norm": 0.12869013845920563, "learning_rate": 0.0001, "loss": 0.1788, "step": 1678 }, { "epoch": 2.7372024779915227, "grad_norm": 0.12980249524116516, "learning_rate": 0.0001, "loss": 0.1783, "step": 1679 }, { "epoch": 2.7388327355722204, "grad_norm": 0.14288075268268585, "learning_rate": 0.0001, "loss": 0.1708, "step": 1680 }, { "epoch": 2.740462993152918, "grad_norm": 0.15478843450546265, "learning_rate": 0.0001, "loss": 0.1803, "step": 1681 }, { "epoch": 2.7420932507336158, "grad_norm": 0.13585714995861053, "learning_rate": 0.0001, "loss": 0.185, "step": 1682 }, { "epoch": 2.7437235083143134, "grad_norm": 0.1448957920074463, "learning_rate": 0.0001, "loss": 0.1716, "step": 1683 }, { "epoch": 2.7453537658950116, "grad_norm": 0.14890708029270172, "learning_rate": 0.0001, "loss": 0.181, "step": 1684 }, { "epoch": 2.7469840234757092, "grad_norm": 0.13350339233875275, "learning_rate": 0.0001, "loss": 0.1757, "step": 1685 }, { "epoch": 2.748614281056407, "grad_norm": 0.1049543023109436, "learning_rate": 0.0001, "loss": 0.1668, "step": 1686 }, { "epoch": 2.7502445386371046, "grad_norm": 3.6135470867156982, "learning_rate": 0.0001, "loss": 0.2121, "step": 1687 }, { "epoch": 2.7518747962178023, "grad_norm": 0.1557306945323944, "learning_rate": 0.0001, "loss": 0.1742, "step": 1688 }, { "epoch": 2.7535050537985004, "grad_norm": 0.1619507372379303, "learning_rate": 0.0001, "loss": 0.1586, "step": 1689 }, { "epoch": 2.755135311379198, "grad_norm": 0.12189088016748428, "learning_rate": 0.0001, "loss": 0.1789, "step": 1690 }, { "epoch": 2.7567655689598958, "grad_norm": 0.15941926836967468, "learning_rate": 0.0001, "loss": 0.1727, "step": 1691 }, { "epoch": 2.7583958265405935, "grad_norm": 0.11842834204435349, "learning_rate": 0.0001, "loss": 0.1592, "step": 1692 }, { "epoch": 2.760026084121291, "grad_norm": 0.1222245842218399, "learning_rate": 0.0001, "loss": 0.1593, "step": 1693 }, { "epoch": 2.761656341701989, "grad_norm": 0.1369628608226776, "learning_rate": 0.0001, "loss": 0.155, "step": 1694 }, { "epoch": 2.7632865992826865, "grad_norm": 0.14212529361248016, "learning_rate": 0.0001, "loss": 0.1681, "step": 1695 }, { "epoch": 2.764916856863384, "grad_norm": 0.14328838884830475, "learning_rate": 0.0001, "loss": 0.1844, "step": 1696 }, { "epoch": 2.7665471144440823, "grad_norm": 0.14077410101890564, "learning_rate": 0.0001, "loss": 0.1856, "step": 1697 }, { "epoch": 2.76817737202478, "grad_norm": 0.13928896188735962, "learning_rate": 0.0001, "loss": 0.1718, "step": 1698 }, { "epoch": 2.7698076296054777, "grad_norm": 0.13314402103424072, "learning_rate": 0.0001, "loss": 0.1705, "step": 1699 }, { "epoch": 2.7714378871861753, "grad_norm": 0.1279967725276947, "learning_rate": 0.0001, "loss": 0.1758, "step": 1700 }, { "epoch": 2.773068144766873, "grad_norm": 0.11493542045354843, "learning_rate": 0.0001, "loss": 0.1539, "step": 1701 }, { "epoch": 2.774698402347571, "grad_norm": 0.12252707779407501, "learning_rate": 0.0001, "loss": 0.1752, "step": 1702 }, { "epoch": 2.776328659928269, "grad_norm": 0.19633330404758453, "learning_rate": 0.0001, "loss": 0.1582, "step": 1703 }, { "epoch": 2.7779589175089665, "grad_norm": 0.1423460990190506, "learning_rate": 0.0001, "loss": 0.1668, "step": 1704 }, { "epoch": 2.779589175089664, "grad_norm": 0.13523077964782715, "learning_rate": 0.0001, "loss": 0.1843, "step": 1705 }, { "epoch": 2.781219432670362, "grad_norm": 0.13212746381759644, "learning_rate": 0.0001, "loss": 0.1783, "step": 1706 }, { "epoch": 2.7828496902510595, "grad_norm": 0.1263880431652069, "learning_rate": 0.0001, "loss": 0.1579, "step": 1707 }, { "epoch": 2.7844799478317572, "grad_norm": 0.12073154002428055, "learning_rate": 0.0001, "loss": 0.1569, "step": 1708 }, { "epoch": 2.786110205412455, "grad_norm": 0.14019356667995453, "learning_rate": 0.0001, "loss": 0.1809, "step": 1709 }, { "epoch": 2.787740462993153, "grad_norm": 0.11406267434358597, "learning_rate": 0.0001, "loss": 0.1814, "step": 1710 }, { "epoch": 2.7893707205738507, "grad_norm": 0.11895663291215897, "learning_rate": 0.0001, "loss": 0.1783, "step": 1711 }, { "epoch": 2.7910009781545484, "grad_norm": 0.14709694683551788, "learning_rate": 0.0001, "loss": 0.1686, "step": 1712 }, { "epoch": 2.792631235735246, "grad_norm": 0.13388898968696594, "learning_rate": 0.0001, "loss": 0.1654, "step": 1713 }, { "epoch": 2.7942614933159438, "grad_norm": 0.1226249486207962, "learning_rate": 0.0001, "loss": 0.1739, "step": 1714 }, { "epoch": 2.795891750896642, "grad_norm": 0.12996350228786469, "learning_rate": 0.0001, "loss": 0.1699, "step": 1715 }, { "epoch": 2.7975220084773396, "grad_norm": 0.13227185606956482, "learning_rate": 0.0001, "loss": 0.1774, "step": 1716 }, { "epoch": 2.7991522660580372, "grad_norm": 0.12332453578710556, "learning_rate": 0.0001, "loss": 0.1602, "step": 1717 }, { "epoch": 2.800782523638735, "grad_norm": 0.1187395229935646, "learning_rate": 0.0001, "loss": 0.1578, "step": 1718 }, { "epoch": 2.8024127812194326, "grad_norm": 0.13751424849033356, "learning_rate": 0.0001, "loss": 0.1734, "step": 1719 }, { "epoch": 2.8040430388001303, "grad_norm": 0.13629640638828278, "learning_rate": 0.0001, "loss": 0.173, "step": 1720 }, { "epoch": 2.805673296380828, "grad_norm": 0.11772475391626358, "learning_rate": 0.0001, "loss": 0.1677, "step": 1721 }, { "epoch": 2.8073035539615256, "grad_norm": 0.13224674761295319, "learning_rate": 0.0001, "loss": 0.1757, "step": 1722 }, { "epoch": 2.8089338115422238, "grad_norm": 0.1200365424156189, "learning_rate": 0.0001, "loss": 0.1792, "step": 1723 }, { "epoch": 2.8105640691229214, "grad_norm": 0.14300167560577393, "learning_rate": 0.0001, "loss": 0.1748, "step": 1724 }, { "epoch": 2.812194326703619, "grad_norm": 0.10958956182003021, "learning_rate": 0.0001, "loss": 0.1633, "step": 1725 }, { "epoch": 2.813824584284317, "grad_norm": 0.13673417270183563, "learning_rate": 0.0001, "loss": 0.1618, "step": 1726 }, { "epoch": 2.815454841865015, "grad_norm": 0.1306380033493042, "learning_rate": 0.0001, "loss": 0.183, "step": 1727 }, { "epoch": 2.8170850994457126, "grad_norm": 0.12033358216285706, "learning_rate": 0.0001, "loss": 0.1616, "step": 1728 }, { "epoch": 2.8187153570264103, "grad_norm": 0.14117878675460815, "learning_rate": 0.0001, "loss": 0.1715, "step": 1729 }, { "epoch": 2.820345614607108, "grad_norm": 0.13309049606323242, "learning_rate": 0.0001, "loss": 0.1773, "step": 1730 }, { "epoch": 2.8219758721878057, "grad_norm": 0.12040352821350098, "learning_rate": 0.0001, "loss": 0.1656, "step": 1731 }, { "epoch": 2.8236061297685033, "grad_norm": 0.12827961146831512, "learning_rate": 0.0001, "loss": 0.1661, "step": 1732 }, { "epoch": 2.825236387349201, "grad_norm": 0.1280909925699234, "learning_rate": 0.0001, "loss": 0.1806, "step": 1733 }, { "epoch": 2.8268666449298987, "grad_norm": 0.12497354298830032, "learning_rate": 0.0001, "loss": 0.1788, "step": 1734 }, { "epoch": 2.828496902510597, "grad_norm": 0.1294706016778946, "learning_rate": 0.0001, "loss": 0.1743, "step": 1735 }, { "epoch": 2.8301271600912945, "grad_norm": 0.14673174917697906, "learning_rate": 0.0001, "loss": 0.168, "step": 1736 }, { "epoch": 2.831757417671992, "grad_norm": 0.14964553713798523, "learning_rate": 0.0001, "loss": 0.1751, "step": 1737 }, { "epoch": 2.83338767525269, "grad_norm": 0.17080165445804596, "learning_rate": 0.0001, "loss": 0.1651, "step": 1738 }, { "epoch": 2.8350179328333875, "grad_norm": 0.12748171389102936, "learning_rate": 0.0001, "loss": 0.1565, "step": 1739 }, { "epoch": 2.8366481904140857, "grad_norm": 0.1372041255235672, "learning_rate": 0.0001, "loss": 0.1816, "step": 1740 }, { "epoch": 2.8382784479947833, "grad_norm": 0.13018721342086792, "learning_rate": 0.0001, "loss": 0.1758, "step": 1741 }, { "epoch": 2.839908705575481, "grad_norm": 0.13896982371807098, "learning_rate": 0.0001, "loss": 0.1713, "step": 1742 }, { "epoch": 2.8415389631561787, "grad_norm": 0.13519124686717987, "learning_rate": 0.0001, "loss": 0.1826, "step": 1743 }, { "epoch": 2.8431692207368764, "grad_norm": 0.12529662251472473, "learning_rate": 0.0001, "loss": 0.1657, "step": 1744 }, { "epoch": 2.844799478317574, "grad_norm": 0.1385057270526886, "learning_rate": 0.0001, "loss": 0.1742, "step": 1745 }, { "epoch": 2.8464297358982718, "grad_norm": 0.13610967993736267, "learning_rate": 0.0001, "loss": 0.1729, "step": 1746 }, { "epoch": 2.8480599934789694, "grad_norm": 0.1311800330877304, "learning_rate": 0.0001, "loss": 0.1829, "step": 1747 }, { "epoch": 2.8496902510596676, "grad_norm": 0.13023614883422852, "learning_rate": 0.0001, "loss": 0.1679, "step": 1748 }, { "epoch": 2.8513205086403652, "grad_norm": 0.12245821207761765, "learning_rate": 0.0001, "loss": 0.1699, "step": 1749 }, { "epoch": 2.852950766221063, "grad_norm": 0.1297629326581955, "learning_rate": 0.0001, "loss": 0.172, "step": 1750 }, { "epoch": 2.8545810238017606, "grad_norm": 0.12192010134458542, "learning_rate": 0.0001, "loss": 0.1727, "step": 1751 }, { "epoch": 2.8562112813824583, "grad_norm": 0.12483784556388855, "learning_rate": 0.0001, "loss": 0.1658, "step": 1752 }, { "epoch": 2.8578415389631564, "grad_norm": 0.1716589629650116, "learning_rate": 0.0001, "loss": 0.1795, "step": 1753 }, { "epoch": 2.859471796543854, "grad_norm": 0.1266416311264038, "learning_rate": 0.0001, "loss": 0.1572, "step": 1754 }, { "epoch": 2.8611020541245518, "grad_norm": 0.15566131472587585, "learning_rate": 0.0001, "loss": 0.1812, "step": 1755 }, { "epoch": 2.8627323117052494, "grad_norm": 0.1435212343931198, "learning_rate": 0.0001, "loss": 0.1682, "step": 1756 }, { "epoch": 2.864362569285947, "grad_norm": 0.13166531920433044, "learning_rate": 0.0001, "loss": 0.17, "step": 1757 }, { "epoch": 2.865992826866645, "grad_norm": 0.1274840533733368, "learning_rate": 0.0001, "loss": 0.1661, "step": 1758 }, { "epoch": 2.8676230844473425, "grad_norm": 0.1092241182923317, "learning_rate": 0.0001, "loss": 0.1655, "step": 1759 }, { "epoch": 2.86925334202804, "grad_norm": 0.13460928201675415, "learning_rate": 0.0001, "loss": 0.1627, "step": 1760 }, { "epoch": 2.8708835996087383, "grad_norm": 0.12913478910923004, "learning_rate": 0.0001, "loss": 0.1618, "step": 1761 }, { "epoch": 2.872513857189436, "grad_norm": 0.11661716550588608, "learning_rate": 0.0001, "loss": 0.169, "step": 1762 }, { "epoch": 2.8741441147701337, "grad_norm": 0.11179400235414505, "learning_rate": 0.0001, "loss": 0.1725, "step": 1763 }, { "epoch": 2.8757743723508313, "grad_norm": 0.1252732276916504, "learning_rate": 0.0001, "loss": 0.1827, "step": 1764 }, { "epoch": 2.8774046299315295, "grad_norm": 0.3473842144012451, "learning_rate": 0.0001, "loss": 0.1693, "step": 1765 }, { "epoch": 2.879034887512227, "grad_norm": 0.1262052208185196, "learning_rate": 0.0001, "loss": 0.1728, "step": 1766 }, { "epoch": 2.880665145092925, "grad_norm": 0.1326300948858261, "learning_rate": 0.0001, "loss": 0.1744, "step": 1767 }, { "epoch": 2.8822954026736225, "grad_norm": 0.1303003877401352, "learning_rate": 0.0001, "loss": 0.1714, "step": 1768 }, { "epoch": 2.88392566025432, "grad_norm": 0.12167539447546005, "learning_rate": 0.0001, "loss": 0.1677, "step": 1769 }, { "epoch": 2.885555917835018, "grad_norm": 0.13617132604122162, "learning_rate": 0.0001, "loss": 0.1646, "step": 1770 }, { "epoch": 2.8871861754157155, "grad_norm": 0.13883374631404877, "learning_rate": 0.0001, "loss": 0.173, "step": 1771 }, { "epoch": 2.888816432996413, "grad_norm": 0.12531040608882904, "learning_rate": 0.0001, "loss": 0.1678, "step": 1772 }, { "epoch": 2.8904466905771113, "grad_norm": 0.13892248272895813, "learning_rate": 0.0001, "loss": 0.1677, "step": 1773 }, { "epoch": 2.892076948157809, "grad_norm": 0.14446908235549927, "learning_rate": 0.0001, "loss": 0.1666, "step": 1774 }, { "epoch": 2.8937072057385067, "grad_norm": 0.14126956462860107, "learning_rate": 0.0001, "loss": 0.1626, "step": 1775 }, { "epoch": 2.8953374633192044, "grad_norm": 0.1542399823665619, "learning_rate": 0.0001, "loss": 0.1741, "step": 1776 }, { "epoch": 2.896967720899902, "grad_norm": 0.14131003618240356, "learning_rate": 0.0001, "loss": 0.174, "step": 1777 }, { "epoch": 2.8985979784806, "grad_norm": 0.15592537820339203, "learning_rate": 0.0001, "loss": 0.1847, "step": 1778 }, { "epoch": 2.900228236061298, "grad_norm": 0.14722147583961487, "learning_rate": 0.0001, "loss": 0.1804, "step": 1779 }, { "epoch": 2.9018584936419956, "grad_norm": 0.13757076859474182, "learning_rate": 0.0001, "loss": 0.1856, "step": 1780 }, { "epoch": 2.9034887512226932, "grad_norm": 0.2536503076553345, "learning_rate": 0.0001, "loss": 0.1513, "step": 1781 }, { "epoch": 2.905119008803391, "grad_norm": 0.11684080958366394, "learning_rate": 0.0001, "loss": 0.1723, "step": 1782 }, { "epoch": 2.9067492663840886, "grad_norm": 0.1301044374704361, "learning_rate": 0.0001, "loss": 0.1756, "step": 1783 }, { "epoch": 2.9083795239647863, "grad_norm": 0.11158990114927292, "learning_rate": 0.0001, "loss": 0.1627, "step": 1784 }, { "epoch": 2.910009781545484, "grad_norm": 0.14631851017475128, "learning_rate": 0.0001, "loss": 0.1763, "step": 1785 }, { "epoch": 2.911640039126182, "grad_norm": 0.12340549379587173, "learning_rate": 0.0001, "loss": 0.1826, "step": 1786 }, { "epoch": 2.9132702967068798, "grad_norm": 0.15119358897209167, "learning_rate": 0.0001, "loss": 0.1517, "step": 1787 }, { "epoch": 2.9149005542875774, "grad_norm": 0.10862945020198822, "learning_rate": 0.0001, "loss": 0.1636, "step": 1788 }, { "epoch": 2.916530811868275, "grad_norm": 0.12316179275512695, "learning_rate": 0.0001, "loss": 0.1726, "step": 1789 }, { "epoch": 2.918161069448973, "grad_norm": 0.14677046239376068, "learning_rate": 0.0001, "loss": 0.1705, "step": 1790 }, { "epoch": 2.919791327029671, "grad_norm": 0.1339065581560135, "learning_rate": 0.0001, "loss": 0.1771, "step": 1791 }, { "epoch": 2.9214215846103686, "grad_norm": 0.1291988492012024, "learning_rate": 0.0001, "loss": 0.1741, "step": 1792 }, { "epoch": 2.9230518421910663, "grad_norm": 0.13565215468406677, "learning_rate": 0.0001, "loss": 0.1692, "step": 1793 }, { "epoch": 2.924682099771764, "grad_norm": 0.13081243634223938, "learning_rate": 0.0001, "loss": 0.1622, "step": 1794 }, { "epoch": 2.9263123573524616, "grad_norm": 0.12821637094020844, "learning_rate": 0.0001, "loss": 0.1592, "step": 1795 }, { "epoch": 2.9279426149331593, "grad_norm": 0.17648915946483612, "learning_rate": 0.0001, "loss": 0.1875, "step": 1796 }, { "epoch": 2.929572872513857, "grad_norm": 0.11781003326177597, "learning_rate": 0.0001, "loss": 0.1591, "step": 1797 }, { "epoch": 2.9312031300945547, "grad_norm": 0.13282893598079681, "learning_rate": 0.0001, "loss": 0.1734, "step": 1798 }, { "epoch": 2.932833387675253, "grad_norm": 0.14549997448921204, "learning_rate": 0.0001, "loss": 0.1792, "step": 1799 }, { "epoch": 2.9344636452559505, "grad_norm": 0.12919341027736664, "learning_rate": 0.0001, "loss": 0.1682, "step": 1800 }, { "epoch": 2.936093902836648, "grad_norm": 0.12719695270061493, "learning_rate": 0.0001, "loss": 0.1824, "step": 1801 }, { "epoch": 2.937724160417346, "grad_norm": 0.14096450805664062, "learning_rate": 0.0001, "loss": 0.1609, "step": 1802 }, { "epoch": 2.9393544179980435, "grad_norm": 0.11929788440465927, "learning_rate": 0.0001, "loss": 0.1602, "step": 1803 }, { "epoch": 2.9409846755787417, "grad_norm": 0.16668562591075897, "learning_rate": 0.0001, "loss": 0.1767, "step": 1804 }, { "epoch": 2.9426149331594393, "grad_norm": 0.14096197485923767, "learning_rate": 0.0001, "loss": 0.1696, "step": 1805 }, { "epoch": 2.944245190740137, "grad_norm": 0.1540384292602539, "learning_rate": 0.0001, "loss": 0.1849, "step": 1806 }, { "epoch": 2.9458754483208347, "grad_norm": 0.11548353731632233, "learning_rate": 0.0001, "loss": 0.1676, "step": 1807 }, { "epoch": 2.9475057059015324, "grad_norm": 0.12311840057373047, "learning_rate": 0.0001, "loss": 0.1664, "step": 1808 }, { "epoch": 2.94913596348223, "grad_norm": 0.1358504742383957, "learning_rate": 0.0001, "loss": 0.1752, "step": 1809 }, { "epoch": 2.9507662210629277, "grad_norm": 0.12379997223615646, "learning_rate": 0.0001, "loss": 0.1656, "step": 1810 }, { "epoch": 2.9523964786436254, "grad_norm": 0.13933683931827545, "learning_rate": 0.0001, "loss": 0.1797, "step": 1811 }, { "epoch": 2.9540267362243235, "grad_norm": 0.1396905779838562, "learning_rate": 0.0001, "loss": 0.1761, "step": 1812 }, { "epoch": 2.9556569938050212, "grad_norm": 0.12376896291971207, "learning_rate": 0.0001, "loss": 0.1608, "step": 1813 }, { "epoch": 2.957287251385719, "grad_norm": 0.12583288550376892, "learning_rate": 0.0001, "loss": 0.1684, "step": 1814 }, { "epoch": 2.9589175089664166, "grad_norm": 0.13755297660827637, "learning_rate": 0.0001, "loss": 0.144, "step": 1815 }, { "epoch": 2.9605477665471147, "grad_norm": 0.24326035380363464, "learning_rate": 0.0001, "loss": 0.1885, "step": 1816 }, { "epoch": 2.9621780241278124, "grad_norm": 0.14443641901016235, "learning_rate": 0.0001, "loss": 0.1885, "step": 1817 }, { "epoch": 2.96380828170851, "grad_norm": 0.12700864672660828, "learning_rate": 0.0001, "loss": 0.1641, "step": 1818 }, { "epoch": 2.9654385392892078, "grad_norm": 0.148148775100708, "learning_rate": 0.0001, "loss": 0.1792, "step": 1819 }, { "epoch": 2.9670687968699054, "grad_norm": 0.14224189519882202, "learning_rate": 0.0001, "loss": 0.1692, "step": 1820 }, { "epoch": 2.968699054450603, "grad_norm": 0.15919244289398193, "learning_rate": 0.0001, "loss": 0.174, "step": 1821 }, { "epoch": 2.970329312031301, "grad_norm": 0.2009061574935913, "learning_rate": 0.0001, "loss": 0.1737, "step": 1822 }, { "epoch": 2.9719595696119985, "grad_norm": 0.12979143857955933, "learning_rate": 0.0001, "loss": 0.1668, "step": 1823 }, { "epoch": 2.9735898271926966, "grad_norm": 0.12879891693592072, "learning_rate": 0.0001, "loss": 0.1608, "step": 1824 }, { "epoch": 2.9752200847733943, "grad_norm": 0.11508017033338547, "learning_rate": 0.0001, "loss": 0.1705, "step": 1825 }, { "epoch": 2.976850342354092, "grad_norm": 0.12390004843473434, "learning_rate": 0.0001, "loss": 0.1697, "step": 1826 }, { "epoch": 2.9784805999347896, "grad_norm": 0.13983500003814697, "learning_rate": 0.0001, "loss": 0.1744, "step": 1827 }, { "epoch": 2.9801108575154873, "grad_norm": 0.12721027433872223, "learning_rate": 0.0001, "loss": 0.1799, "step": 1828 }, { "epoch": 2.9817411150961854, "grad_norm": 0.12273556739091873, "learning_rate": 0.0001, "loss": 0.1664, "step": 1829 }, { "epoch": 2.983371372676883, "grad_norm": 0.12730666995048523, "learning_rate": 0.0001, "loss": 0.1716, "step": 1830 }, { "epoch": 2.985001630257581, "grad_norm": 0.12331203371286392, "learning_rate": 0.0001, "loss": 0.1641, "step": 1831 }, { "epoch": 2.9866318878382785, "grad_norm": 0.1258230060338974, "learning_rate": 0.0001, "loss": 0.1778, "step": 1832 }, { "epoch": 2.988262145418976, "grad_norm": 0.14592202007770538, "learning_rate": 0.0001, "loss": 0.1888, "step": 1833 }, { "epoch": 2.989892402999674, "grad_norm": 0.12880602478981018, "learning_rate": 0.0001, "loss": 0.1731, "step": 1834 }, { "epoch": 2.9915226605803715, "grad_norm": 0.14615651965141296, "learning_rate": 0.0001, "loss": 0.1853, "step": 1835 }, { "epoch": 2.993152918161069, "grad_norm": 0.12081672251224518, "learning_rate": 0.0001, "loss": 0.1837, "step": 1836 }, { "epoch": 2.9947831757417673, "grad_norm": 0.11534418165683746, "learning_rate": 0.0001, "loss": 0.1773, "step": 1837 }, { "epoch": 2.996413433322465, "grad_norm": 0.12042343616485596, "learning_rate": 0.0001, "loss": 0.1594, "step": 1838 }, { "epoch": 2.9980436909031627, "grad_norm": 0.13987913727760315, "learning_rate": 0.0001, "loss": 0.1785, "step": 1839 }, { "epoch": 2.9996739484838604, "grad_norm": 0.12604045867919922, "learning_rate": 0.0001, "loss": 0.1708, "step": 1840 }, { "epoch": 3.001304206064558, "grad_norm": 0.13217991590499878, "learning_rate": 0.0001, "loss": 0.1621, "step": 1841 }, { "epoch": 3.0029344636452557, "grad_norm": 0.1318402737379074, "learning_rate": 0.0001, "loss": 0.1565, "step": 1842 }, { "epoch": 3.004564721225954, "grad_norm": 0.12177060544490814, "learning_rate": 0.0001, "loss": 0.162, "step": 1843 }, { "epoch": 3.0061949788066515, "grad_norm": 0.11984521895647049, "learning_rate": 0.0001, "loss": 0.1441, "step": 1844 }, { "epoch": 3.007825236387349, "grad_norm": 0.13043449819087982, "learning_rate": 0.0001, "loss": 0.1419, "step": 1845 }, { "epoch": 3.009455493968047, "grad_norm": 0.12092608213424683, "learning_rate": 0.0001, "loss": 0.1503, "step": 1846 }, { "epoch": 3.0110857515487446, "grad_norm": 0.15196233987808228, "learning_rate": 0.0001, "loss": 0.161, "step": 1847 }, { "epoch": 3.0127160091294423, "grad_norm": 0.14589709043502808, "learning_rate": 0.0001, "loss": 0.1564, "step": 1848 }, { "epoch": 3.0143462667101404, "grad_norm": 0.1587577909231186, "learning_rate": 0.0001, "loss": 0.1673, "step": 1849 }, { "epoch": 3.015976524290838, "grad_norm": 0.14549551904201508, "learning_rate": 0.0001, "loss": 0.1573, "step": 1850 }, { "epoch": 3.0176067818715357, "grad_norm": 0.1457890272140503, "learning_rate": 0.0001, "loss": 0.1552, "step": 1851 }, { "epoch": 3.0192370394522334, "grad_norm": 0.1330796182155609, "learning_rate": 0.0001, "loss": 0.1522, "step": 1852 }, { "epoch": 3.020867297032931, "grad_norm": 0.16735856235027313, "learning_rate": 0.0001, "loss": 0.1618, "step": 1853 }, { "epoch": 3.022497554613629, "grad_norm": 0.13368208706378937, "learning_rate": 0.0001, "loss": 0.1522, "step": 1854 }, { "epoch": 3.024127812194327, "grad_norm": 0.14208734035491943, "learning_rate": 0.0001, "loss": 0.1537, "step": 1855 }, { "epoch": 3.0257580697750246, "grad_norm": 0.17149995267391205, "learning_rate": 0.0001, "loss": 0.1688, "step": 1856 }, { "epoch": 3.0273883273557223, "grad_norm": 0.16402693092823029, "learning_rate": 0.0001, "loss": 0.153, "step": 1857 }, { "epoch": 3.02901858493642, "grad_norm": 0.14791880548000336, "learning_rate": 0.0001, "loss": 0.1415, "step": 1858 }, { "epoch": 3.0306488425171176, "grad_norm": 0.13278983533382416, "learning_rate": 0.0001, "loss": 0.1624, "step": 1859 }, { "epoch": 3.0322791000978153, "grad_norm": 0.1486644297838211, "learning_rate": 0.0001, "loss": 0.1489, "step": 1860 }, { "epoch": 3.033909357678513, "grad_norm": 0.15381312370300293, "learning_rate": 0.0001, "loss": 0.1546, "step": 1861 }, { "epoch": 3.035539615259211, "grad_norm": 0.1434403508901596, "learning_rate": 0.0001, "loss": 0.1595, "step": 1862 }, { "epoch": 3.037169872839909, "grad_norm": 0.14210708439350128, "learning_rate": 0.0001, "loss": 0.1533, "step": 1863 }, { "epoch": 3.0388001304206065, "grad_norm": 0.1316472291946411, "learning_rate": 0.0001, "loss": 0.1406, "step": 1864 }, { "epoch": 3.040430388001304, "grad_norm": 0.17375947535037994, "learning_rate": 0.0001, "loss": 0.1727, "step": 1865 }, { "epoch": 3.042060645582002, "grad_norm": 0.13698440790176392, "learning_rate": 0.0001, "loss": 0.1458, "step": 1866 }, { "epoch": 3.0436909031626995, "grad_norm": 0.14209020137786865, "learning_rate": 0.0001, "loss": 0.1467, "step": 1867 }, { "epoch": 3.0453211607433976, "grad_norm": 0.14422006905078888, "learning_rate": 0.0001, "loss": 0.1502, "step": 1868 }, { "epoch": 3.0469514183240953, "grad_norm": 0.13612805306911469, "learning_rate": 0.0001, "loss": 0.1519, "step": 1869 }, { "epoch": 3.048581675904793, "grad_norm": 0.16070745885372162, "learning_rate": 0.0001, "loss": 0.1606, "step": 1870 }, { "epoch": 3.0502119334854907, "grad_norm": 0.1366911679506302, "learning_rate": 0.0001, "loss": 0.1532, "step": 1871 }, { "epoch": 3.0518421910661884, "grad_norm": 0.1536417007446289, "learning_rate": 0.0001, "loss": 0.1624, "step": 1872 }, { "epoch": 3.053472448646886, "grad_norm": 0.14232809841632843, "learning_rate": 0.0001, "loss": 0.161, "step": 1873 }, { "epoch": 3.055102706227584, "grad_norm": 0.13991610705852509, "learning_rate": 0.0001, "loss": 0.1451, "step": 1874 }, { "epoch": 3.056732963808282, "grad_norm": 0.13531944155693054, "learning_rate": 0.0001, "loss": 0.1551, "step": 1875 }, { "epoch": 3.0583632213889795, "grad_norm": 0.1320919394493103, "learning_rate": 0.0001, "loss": 0.1637, "step": 1876 }, { "epoch": 3.059993478969677, "grad_norm": 0.1661989539861679, "learning_rate": 0.0001, "loss": 0.168, "step": 1877 }, { "epoch": 3.061623736550375, "grad_norm": 0.14109432697296143, "learning_rate": 0.0001, "loss": 0.1594, "step": 1878 }, { "epoch": 3.0632539941310726, "grad_norm": 0.12099778652191162, "learning_rate": 0.0001, "loss": 0.1361, "step": 1879 }, { "epoch": 3.0648842517117703, "grad_norm": 0.15160542726516724, "learning_rate": 0.0001, "loss": 0.1442, "step": 1880 }, { "epoch": 3.0665145092924684, "grad_norm": 0.15511579811573029, "learning_rate": 0.0001, "loss": 0.1517, "step": 1881 }, { "epoch": 3.068144766873166, "grad_norm": 0.15297721326351166, "learning_rate": 0.0001, "loss": 0.1484, "step": 1882 }, { "epoch": 3.0697750244538637, "grad_norm": 0.15919755399227142, "learning_rate": 0.0001, "loss": 0.1538, "step": 1883 }, { "epoch": 3.0714052820345614, "grad_norm": 0.16629868745803833, "learning_rate": 0.0001, "loss": 0.1727, "step": 1884 }, { "epoch": 3.073035539615259, "grad_norm": 0.15279151499271393, "learning_rate": 0.0001, "loss": 0.167, "step": 1885 }, { "epoch": 3.074665797195957, "grad_norm": 0.13714101910591125, "learning_rate": 0.0001, "loss": 0.1564, "step": 1886 }, { "epoch": 3.076296054776655, "grad_norm": 0.13693886995315552, "learning_rate": 0.0001, "loss": 0.1528, "step": 1887 }, { "epoch": 3.0779263123573526, "grad_norm": 0.1455615609884262, "learning_rate": 0.0001, "loss": 0.1606, "step": 1888 }, { "epoch": 3.0795565699380503, "grad_norm": 0.12297336757183075, "learning_rate": 0.0001, "loss": 0.1513, "step": 1889 }, { "epoch": 3.081186827518748, "grad_norm": 0.13729313015937805, "learning_rate": 0.0001, "loss": 0.1594, "step": 1890 }, { "epoch": 3.0828170850994456, "grad_norm": 0.15476392209529877, "learning_rate": 0.0001, "loss": 0.1646, "step": 1891 }, { "epoch": 3.0844473426801433, "grad_norm": 0.13134579360485077, "learning_rate": 0.0001, "loss": 0.145, "step": 1892 }, { "epoch": 3.0860776002608414, "grad_norm": 0.12954765558242798, "learning_rate": 0.0001, "loss": 0.1463, "step": 1893 }, { "epoch": 3.087707857841539, "grad_norm": 0.13961279392242432, "learning_rate": 0.0001, "loss": 0.1502, "step": 1894 }, { "epoch": 3.089338115422237, "grad_norm": 0.1431739330291748, "learning_rate": 0.0001, "loss": 0.1542, "step": 1895 }, { "epoch": 3.0909683730029345, "grad_norm": 0.13088122010231018, "learning_rate": 0.0001, "loss": 0.1418, "step": 1896 }, { "epoch": 3.092598630583632, "grad_norm": 0.14733296632766724, "learning_rate": 0.0001, "loss": 0.1506, "step": 1897 }, { "epoch": 3.09422888816433, "grad_norm": 0.14778561890125275, "learning_rate": 0.0001, "loss": 0.1492, "step": 1898 }, { "epoch": 3.0958591457450275, "grad_norm": 0.17354664206504822, "learning_rate": 0.0001, "loss": 0.1725, "step": 1899 }, { "epoch": 3.0974894033257256, "grad_norm": 0.17014777660369873, "learning_rate": 0.0001, "loss": 0.1653, "step": 1900 }, { "epoch": 3.0991196609064233, "grad_norm": 0.1588321030139923, "learning_rate": 0.0001, "loss": 0.1618, "step": 1901 }, { "epoch": 3.100749918487121, "grad_norm": 0.1591597944498062, "learning_rate": 0.0001, "loss": 0.1574, "step": 1902 }, { "epoch": 3.1023801760678187, "grad_norm": 0.1335812509059906, "learning_rate": 0.0001, "loss": 0.1508, "step": 1903 }, { "epoch": 3.1040104336485164, "grad_norm": 0.13163426518440247, "learning_rate": 0.0001, "loss": 0.134, "step": 1904 }, { "epoch": 3.105640691229214, "grad_norm": 0.17262452840805054, "learning_rate": 0.0001, "loss": 0.1662, "step": 1905 }, { "epoch": 3.107270948809912, "grad_norm": 0.1346106082201004, "learning_rate": 0.0001, "loss": 0.1519, "step": 1906 }, { "epoch": 3.10890120639061, "grad_norm": 0.15132804214954376, "learning_rate": 0.0001, "loss": 0.1567, "step": 1907 }, { "epoch": 3.1105314639713075, "grad_norm": 0.14656689763069153, "learning_rate": 0.0001, "loss": 0.174, "step": 1908 }, { "epoch": 3.112161721552005, "grad_norm": 0.12101754546165466, "learning_rate": 0.0001, "loss": 0.1313, "step": 1909 }, { "epoch": 3.113791979132703, "grad_norm": 0.13873204588890076, "learning_rate": 0.0001, "loss": 0.1685, "step": 1910 }, { "epoch": 3.1154222367134006, "grad_norm": 0.14690999686717987, "learning_rate": 0.0001, "loss": 0.1522, "step": 1911 }, { "epoch": 3.1170524942940983, "grad_norm": 0.14875850081443787, "learning_rate": 0.0001, "loss": 0.1669, "step": 1912 }, { "epoch": 3.1186827518747964, "grad_norm": 0.12188220769166946, "learning_rate": 0.0001, "loss": 0.1423, "step": 1913 }, { "epoch": 3.120313009455494, "grad_norm": 0.14481259882450104, "learning_rate": 0.0001, "loss": 0.1567, "step": 1914 }, { "epoch": 3.1219432670361917, "grad_norm": 0.1382434368133545, "learning_rate": 0.0001, "loss": 0.1495, "step": 1915 }, { "epoch": 3.1235735246168894, "grad_norm": 0.14566384255886078, "learning_rate": 0.0001, "loss": 0.1476, "step": 1916 }, { "epoch": 3.125203782197587, "grad_norm": 0.14900517463684082, "learning_rate": 0.0001, "loss": 0.1642, "step": 1917 }, { "epoch": 3.126834039778285, "grad_norm": 0.1237114742398262, "learning_rate": 0.0001, "loss": 0.1387, "step": 1918 }, { "epoch": 3.128464297358983, "grad_norm": 0.13117025792598724, "learning_rate": 0.0001, "loss": 0.1515, "step": 1919 }, { "epoch": 3.1300945549396806, "grad_norm": 0.13853514194488525, "learning_rate": 0.0001, "loss": 0.1496, "step": 1920 }, { "epoch": 3.1317248125203783, "grad_norm": 0.17825017869472504, "learning_rate": 0.0001, "loss": 0.1578, "step": 1921 }, { "epoch": 3.133355070101076, "grad_norm": 0.1505771279335022, "learning_rate": 0.0001, "loss": 0.1535, "step": 1922 }, { "epoch": 3.1349853276817736, "grad_norm": 0.1433846801519394, "learning_rate": 0.0001, "loss": 0.1572, "step": 1923 }, { "epoch": 3.1366155852624713, "grad_norm": 0.1596456617116928, "learning_rate": 0.0001, "loss": 0.1604, "step": 1924 }, { "epoch": 3.138245842843169, "grad_norm": 0.14370723068714142, "learning_rate": 0.0001, "loss": 0.1608, "step": 1925 }, { "epoch": 3.139876100423867, "grad_norm": 0.16248880326747894, "learning_rate": 0.0001, "loss": 0.171, "step": 1926 }, { "epoch": 3.141506358004565, "grad_norm": 0.1393871307373047, "learning_rate": 0.0001, "loss": 0.1605, "step": 1927 }, { "epoch": 3.1431366155852625, "grad_norm": 0.12134724855422974, "learning_rate": 0.0001, "loss": 0.1594, "step": 1928 }, { "epoch": 3.14476687316596, "grad_norm": 0.16671213507652283, "learning_rate": 0.0001, "loss": 0.1675, "step": 1929 }, { "epoch": 3.146397130746658, "grad_norm": 0.7389642596244812, "learning_rate": 0.0001, "loss": 0.1587, "step": 1930 }, { "epoch": 3.148027388327356, "grad_norm": 0.1277475357055664, "learning_rate": 0.0001, "loss": 0.1467, "step": 1931 }, { "epoch": 3.1496576459080536, "grad_norm": 0.14921234548091888, "learning_rate": 0.0001, "loss": 0.1521, "step": 1932 }, { "epoch": 3.1512879034887513, "grad_norm": 0.14741483330726624, "learning_rate": 0.0001, "loss": 0.158, "step": 1933 }, { "epoch": 3.152918161069449, "grad_norm": 0.14746753871440887, "learning_rate": 0.0001, "loss": 0.1479, "step": 1934 }, { "epoch": 3.1545484186501467, "grad_norm": 0.13620176911354065, "learning_rate": 0.0001, "loss": 0.1499, "step": 1935 }, { "epoch": 3.1561786762308444, "grad_norm": 1.9854735136032104, "learning_rate": 0.0001, "loss": 0.1762, "step": 1936 }, { "epoch": 3.157808933811542, "grad_norm": 0.1485620141029358, "learning_rate": 0.0001, "loss": 0.1523, "step": 1937 }, { "epoch": 3.15943919139224, "grad_norm": 0.1689678281545639, "learning_rate": 0.0001, "loss": 0.1563, "step": 1938 }, { "epoch": 3.161069448972938, "grad_norm": 0.14907783269882202, "learning_rate": 0.0001, "loss": 0.1649, "step": 1939 }, { "epoch": 3.1626997065536355, "grad_norm": 0.15304021537303925, "learning_rate": 0.0001, "loss": 0.1689, "step": 1940 }, { "epoch": 3.164329964134333, "grad_norm": 0.16715513169765472, "learning_rate": 0.0001, "loss": 0.1601, "step": 1941 }, { "epoch": 3.165960221715031, "grad_norm": 0.15324808657169342, "learning_rate": 0.0001, "loss": 0.1612, "step": 1942 }, { "epoch": 3.1675904792957286, "grad_norm": 0.14634795486927032, "learning_rate": 0.0001, "loss": 0.1568, "step": 1943 }, { "epoch": 3.1692207368764267, "grad_norm": 0.15744540095329285, "learning_rate": 0.0001, "loss": 0.1552, "step": 1944 }, { "epoch": 3.1708509944571244, "grad_norm": 0.15544916689395905, "learning_rate": 0.0001, "loss": 0.16, "step": 1945 }, { "epoch": 3.172481252037822, "grad_norm": 0.14175128936767578, "learning_rate": 0.0001, "loss": 0.1639, "step": 1946 }, { "epoch": 3.1741115096185197, "grad_norm": 0.14051491022109985, "learning_rate": 0.0001, "loss": 0.1379, "step": 1947 }, { "epoch": 3.1757417671992174, "grad_norm": 0.14642100036144257, "learning_rate": 0.0001, "loss": 0.1535, "step": 1948 }, { "epoch": 3.177372024779915, "grad_norm": 0.1421380490064621, "learning_rate": 0.0001, "loss": 0.1599, "step": 1949 }, { "epoch": 3.1790022823606128, "grad_norm": 0.15691342949867249, "learning_rate": 0.0001, "loss": 0.1589, "step": 1950 }, { "epoch": 3.180632539941311, "grad_norm": 0.14704185724258423, "learning_rate": 0.0001, "loss": 0.1508, "step": 1951 }, { "epoch": 3.1822627975220086, "grad_norm": 0.15139469504356384, "learning_rate": 0.0001, "loss": 0.151, "step": 1952 }, { "epoch": 3.1838930551027063, "grad_norm": 0.15881513059139252, "learning_rate": 0.0001, "loss": 0.1604, "step": 1953 }, { "epoch": 3.185523312683404, "grad_norm": 0.16441747546195984, "learning_rate": 0.0001, "loss": 0.1677, "step": 1954 }, { "epoch": 3.1871535702641016, "grad_norm": 0.17274102568626404, "learning_rate": 0.0001, "loss": 0.1674, "step": 1955 }, { "epoch": 3.1887838278447993, "grad_norm": 0.13821010291576385, "learning_rate": 0.0001, "loss": 0.1478, "step": 1956 }, { "epoch": 3.1904140854254974, "grad_norm": 0.14471560716629028, "learning_rate": 0.0001, "loss": 0.1524, "step": 1957 }, { "epoch": 3.192044343006195, "grad_norm": 0.14315104484558105, "learning_rate": 0.0001, "loss": 0.1528, "step": 1958 }, { "epoch": 3.193674600586893, "grad_norm": 0.15579240024089813, "learning_rate": 0.0001, "loss": 0.1575, "step": 1959 }, { "epoch": 3.1953048581675905, "grad_norm": 0.16074621677398682, "learning_rate": 0.0001, "loss": 0.1498, "step": 1960 }, { "epoch": 3.196935115748288, "grad_norm": 0.14559932053089142, "learning_rate": 0.0001, "loss": 0.1525, "step": 1961 }, { "epoch": 3.198565373328986, "grad_norm": 0.146773099899292, "learning_rate": 0.0001, "loss": 0.1568, "step": 1962 }, { "epoch": 3.2001956309096835, "grad_norm": 0.1405738741159439, "learning_rate": 0.0001, "loss": 0.1553, "step": 1963 }, { "epoch": 3.2018258884903816, "grad_norm": 0.1669120341539383, "learning_rate": 0.0001, "loss": 0.1652, "step": 1964 }, { "epoch": 3.2034561460710793, "grad_norm": 0.14730647206306458, "learning_rate": 0.0001, "loss": 0.1532, "step": 1965 }, { "epoch": 3.205086403651777, "grad_norm": 0.15380671620368958, "learning_rate": 0.0001, "loss": 0.1585, "step": 1966 }, { "epoch": 3.2067166612324747, "grad_norm": 0.1475609540939331, "learning_rate": 0.0001, "loss": 0.1533, "step": 1967 }, { "epoch": 3.2083469188131724, "grad_norm": 0.15261127054691315, "learning_rate": 0.0001, "loss": 0.1528, "step": 1968 }, { "epoch": 3.20997717639387, "grad_norm": 0.13426612317562103, "learning_rate": 0.0001, "loss": 0.1459, "step": 1969 }, { "epoch": 3.211607433974568, "grad_norm": 0.16383317112922668, "learning_rate": 0.0001, "loss": 0.1457, "step": 1970 }, { "epoch": 3.213237691555266, "grad_norm": 0.12917453050613403, "learning_rate": 0.0001, "loss": 0.1447, "step": 1971 }, { "epoch": 3.2148679491359635, "grad_norm": 0.14621089398860931, "learning_rate": 0.0001, "loss": 0.1425, "step": 1972 }, { "epoch": 3.216498206716661, "grad_norm": 0.14736516773700714, "learning_rate": 0.0001, "loss": 0.1542, "step": 1973 }, { "epoch": 3.218128464297359, "grad_norm": 0.18349121510982513, "learning_rate": 0.0001, "loss": 0.1581, "step": 1974 }, { "epoch": 3.2197587218780566, "grad_norm": 0.13845403492450714, "learning_rate": 0.0001, "loss": 0.1531, "step": 1975 }, { "epoch": 3.2213889794587547, "grad_norm": 0.17356093227863312, "learning_rate": 0.0001, "loss": 0.1499, "step": 1976 }, { "epoch": 3.2230192370394524, "grad_norm": 0.13243253529071808, "learning_rate": 0.0001, "loss": 0.1551, "step": 1977 }, { "epoch": 3.22464949462015, "grad_norm": 0.16005612909793854, "learning_rate": 0.0001, "loss": 0.1485, "step": 1978 }, { "epoch": 3.2262797522008477, "grad_norm": 0.150725856423378, "learning_rate": 0.0001, "loss": 0.1598, "step": 1979 }, { "epoch": 3.2279100097815454, "grad_norm": 0.1280834674835205, "learning_rate": 0.0001, "loss": 0.1508, "step": 1980 }, { "epoch": 3.229540267362243, "grad_norm": 0.1567755490541458, "learning_rate": 0.0001, "loss": 0.1561, "step": 1981 }, { "epoch": 3.231170524942941, "grad_norm": 0.13241910934448242, "learning_rate": 0.0001, "loss": 0.1483, "step": 1982 }, { "epoch": 3.232800782523639, "grad_norm": 0.1435912549495697, "learning_rate": 0.0001, "loss": 0.1466, "step": 1983 }, { "epoch": 3.2344310401043366, "grad_norm": 0.1577858179807663, "learning_rate": 0.0001, "loss": 0.1617, "step": 1984 }, { "epoch": 3.2360612976850343, "grad_norm": 0.14068260788917542, "learning_rate": 0.0001, "loss": 0.1511, "step": 1985 }, { "epoch": 3.237691555265732, "grad_norm": 0.1652795672416687, "learning_rate": 0.0001, "loss": 0.1744, "step": 1986 }, { "epoch": 3.2393218128464296, "grad_norm": 0.14117799699306488, "learning_rate": 0.0001, "loss": 0.1525, "step": 1987 }, { "epoch": 3.2409520704271273, "grad_norm": 0.18951740860939026, "learning_rate": 0.0001, "loss": 0.1491, "step": 1988 }, { "epoch": 3.2425823280078254, "grad_norm": 0.17044013738632202, "learning_rate": 0.0001, "loss": 0.1416, "step": 1989 }, { "epoch": 3.244212585588523, "grad_norm": 0.16022269427776337, "learning_rate": 0.0001, "loss": 0.1707, "step": 1990 }, { "epoch": 3.245842843169221, "grad_norm": 0.17762519419193268, "learning_rate": 0.0001, "loss": 0.1641, "step": 1991 }, { "epoch": 3.2474731007499185, "grad_norm": 0.15027785301208496, "learning_rate": 0.0001, "loss": 0.1665, "step": 1992 }, { "epoch": 3.249103358330616, "grad_norm": 0.12501652538776398, "learning_rate": 0.0001, "loss": 0.1388, "step": 1993 }, { "epoch": 3.250733615911314, "grad_norm": 0.14507539570331573, "learning_rate": 0.0001, "loss": 0.1667, "step": 1994 }, { "epoch": 3.252363873492012, "grad_norm": 0.12410466372966766, "learning_rate": 0.0001, "loss": 0.152, "step": 1995 }, { "epoch": 3.2539941310727096, "grad_norm": 0.1520231068134308, "learning_rate": 0.0001, "loss": 0.165, "step": 1996 }, { "epoch": 3.2556243886534073, "grad_norm": 0.15757714211940765, "learning_rate": 0.0001, "loss": 0.1722, "step": 1997 }, { "epoch": 3.257254646234105, "grad_norm": 0.19692480564117432, "learning_rate": 0.0001, "loss": 0.1586, "step": 1998 }, { "epoch": 3.2588849038148027, "grad_norm": 0.1453189253807068, "learning_rate": 0.0001, "loss": 0.1524, "step": 1999 }, { "epoch": 3.2605151613955003, "grad_norm": 0.14371079206466675, "learning_rate": 0.0001, "loss": 0.1596, "step": 2000 }, { "epoch": 3.2605151613955003, "eval_loss": 0.19923283159732819, "eval_runtime": 2814.5584, "eval_samples_per_second": 0.67, "eval_steps_per_second": 0.168, "step": 2000 }, { "epoch": 3.262145418976198, "grad_norm": 0.15101583302021027, "learning_rate": 0.0001, "loss": 0.1628, "step": 2001 }, { "epoch": 3.263775676556896, "grad_norm": 0.14068691432476044, "learning_rate": 0.0001, "loss": 0.1411, "step": 2002 }, { "epoch": 3.265405934137594, "grad_norm": 0.1584228277206421, "learning_rate": 0.0001, "loss": 0.1623, "step": 2003 }, { "epoch": 3.2670361917182915, "grad_norm": 0.14234709739685059, "learning_rate": 0.0001, "loss": 0.1547, "step": 2004 }, { "epoch": 3.268666449298989, "grad_norm": 0.13787250220775604, "learning_rate": 0.0001, "loss": 0.1578, "step": 2005 }, { "epoch": 3.270296706879687, "grad_norm": 0.13636542856693268, "learning_rate": 0.0001, "loss": 0.1501, "step": 2006 }, { "epoch": 3.2719269644603846, "grad_norm": 0.1498252898454666, "learning_rate": 0.0001, "loss": 0.1628, "step": 2007 }, { "epoch": 3.2735572220410827, "grad_norm": 0.15761011838912964, "learning_rate": 0.0001, "loss": 0.1533, "step": 2008 }, { "epoch": 3.2751874796217804, "grad_norm": 0.13434697687625885, "learning_rate": 0.0001, "loss": 0.1681, "step": 2009 }, { "epoch": 3.276817737202478, "grad_norm": 0.1439303308725357, "learning_rate": 0.0001, "loss": 0.1577, "step": 2010 }, { "epoch": 3.2784479947831757, "grad_norm": 0.1279498040676117, "learning_rate": 0.0001, "loss": 0.15, "step": 2011 }, { "epoch": 3.2800782523638734, "grad_norm": 0.17215445637702942, "learning_rate": 0.0001, "loss": 0.1632, "step": 2012 }, { "epoch": 3.281708509944571, "grad_norm": 0.17374494671821594, "learning_rate": 0.0001, "loss": 0.1626, "step": 2013 }, { "epoch": 3.2833387675252688, "grad_norm": 0.14224795997142792, "learning_rate": 0.0001, "loss": 0.1463, "step": 2014 }, { "epoch": 3.284969025105967, "grad_norm": 0.13347376883029938, "learning_rate": 0.0001, "loss": 0.1594, "step": 2015 }, { "epoch": 3.2865992826866646, "grad_norm": 0.12881265580654144, "learning_rate": 0.0001, "loss": 0.1569, "step": 2016 }, { "epoch": 3.2882295402673622, "grad_norm": 0.14080087840557098, "learning_rate": 0.0001, "loss": 0.1522, "step": 2017 }, { "epoch": 3.28985979784806, "grad_norm": 0.16095654666423798, "learning_rate": 0.0001, "loss": 0.1608, "step": 2018 }, { "epoch": 3.2914900554287576, "grad_norm": 0.11454547941684723, "learning_rate": 0.0001, "loss": 0.1356, "step": 2019 }, { "epoch": 3.2931203130094557, "grad_norm": 0.13725461065769196, "learning_rate": 0.0001, "loss": 0.1502, "step": 2020 }, { "epoch": 3.2947505705901534, "grad_norm": 0.22002211213111877, "learning_rate": 0.0001, "loss": 0.1513, "step": 2021 }, { "epoch": 3.296380828170851, "grad_norm": 0.15117506682872772, "learning_rate": 0.0001, "loss": 0.1673, "step": 2022 }, { "epoch": 3.2980110857515488, "grad_norm": 0.14475668966770172, "learning_rate": 0.0001, "loss": 0.157, "step": 2023 }, { "epoch": 3.2996413433322465, "grad_norm": 0.13530389964580536, "learning_rate": 0.0001, "loss": 0.1447, "step": 2024 }, { "epoch": 3.301271600912944, "grad_norm": 0.14106786251068115, "learning_rate": 0.0001, "loss": 0.1565, "step": 2025 }, { "epoch": 3.302901858493642, "grad_norm": 0.16226093471050262, "learning_rate": 0.0001, "loss": 0.1623, "step": 2026 }, { "epoch": 3.3045321160743395, "grad_norm": 0.1750541478395462, "learning_rate": 0.0001, "loss": 0.1672, "step": 2027 }, { "epoch": 3.3061623736550376, "grad_norm": 0.15967412292957306, "learning_rate": 0.0001, "loss": 0.1487, "step": 2028 }, { "epoch": 3.3077926312357353, "grad_norm": 0.18519139289855957, "learning_rate": 0.0001, "loss": 0.1666, "step": 2029 }, { "epoch": 3.309422888816433, "grad_norm": 0.14711232483386993, "learning_rate": 0.0001, "loss": 0.1617, "step": 2030 }, { "epoch": 3.3110531463971307, "grad_norm": 0.14961479604244232, "learning_rate": 0.0001, "loss": 0.149, "step": 2031 }, { "epoch": 3.3126834039778283, "grad_norm": 0.16532696783542633, "learning_rate": 0.0001, "loss": 0.157, "step": 2032 }, { "epoch": 3.3143136615585265, "grad_norm": 0.1479751318693161, "learning_rate": 0.0001, "loss": 0.164, "step": 2033 }, { "epoch": 3.315943919139224, "grad_norm": 0.16249758005142212, "learning_rate": 0.0001, "loss": 0.1607, "step": 2034 }, { "epoch": 3.317574176719922, "grad_norm": 0.15464435517787933, "learning_rate": 0.0001, "loss": 0.1514, "step": 2035 }, { "epoch": 3.3192044343006195, "grad_norm": 0.12793324887752533, "learning_rate": 0.0001, "loss": 0.1558, "step": 2036 }, { "epoch": 3.320834691881317, "grad_norm": 0.1387512981891632, "learning_rate": 0.0001, "loss": 0.1568, "step": 2037 }, { "epoch": 3.322464949462015, "grad_norm": 0.14236465096473694, "learning_rate": 0.0001, "loss": 0.1571, "step": 2038 }, { "epoch": 3.3240952070427126, "grad_norm": 0.12923693656921387, "learning_rate": 0.0001, "loss": 0.1368, "step": 2039 }, { "epoch": 3.3257254646234107, "grad_norm": 0.14453841745853424, "learning_rate": 0.0001, "loss": 0.1476, "step": 2040 }, { "epoch": 3.3273557222041084, "grad_norm": 0.13822996616363525, "learning_rate": 0.0001, "loss": 0.1458, "step": 2041 }, { "epoch": 3.328985979784806, "grad_norm": 0.1528143286705017, "learning_rate": 0.0001, "loss": 0.1694, "step": 2042 }, { "epoch": 3.3306162373655037, "grad_norm": 0.14497984945774078, "learning_rate": 0.0001, "loss": 0.1446, "step": 2043 }, { "epoch": 3.3322464949462014, "grad_norm": 0.16992256045341492, "learning_rate": 0.0001, "loss": 0.1561, "step": 2044 }, { "epoch": 3.333876752526899, "grad_norm": 0.16492198407649994, "learning_rate": 0.0001, "loss": 0.1653, "step": 2045 }, { "epoch": 3.335507010107597, "grad_norm": 0.14697159826755524, "learning_rate": 0.0001, "loss": 0.145, "step": 2046 }, { "epoch": 3.337137267688295, "grad_norm": 0.1527179777622223, "learning_rate": 0.0001, "loss": 0.154, "step": 2047 }, { "epoch": 3.3387675252689926, "grad_norm": 0.14120477437973022, "learning_rate": 0.0001, "loss": 0.1583, "step": 2048 }, { "epoch": 3.3403977828496902, "grad_norm": 0.13726529479026794, "learning_rate": 0.0001, "loss": 0.1491, "step": 2049 }, { "epoch": 3.342028040430388, "grad_norm": 0.13249808549880981, "learning_rate": 0.0001, "loss": 0.1615, "step": 2050 }, { "epoch": 3.3436582980110856, "grad_norm": 0.14583012461662292, "learning_rate": 0.0001, "loss": 0.1515, "step": 2051 }, { "epoch": 3.3452885555917833, "grad_norm": 0.12669849395751953, "learning_rate": 0.0001, "loss": 0.136, "step": 2052 }, { "epoch": 3.3469188131724814, "grad_norm": 0.15459409356117249, "learning_rate": 0.0001, "loss": 0.1408, "step": 2053 }, { "epoch": 3.348549070753179, "grad_norm": 0.15946894884109497, "learning_rate": 0.0001, "loss": 0.1532, "step": 2054 }, { "epoch": 3.3501793283338768, "grad_norm": 0.15201634168624878, "learning_rate": 0.0001, "loss": 0.1593, "step": 2055 }, { "epoch": 3.3518095859145745, "grad_norm": 0.15227296948432922, "learning_rate": 0.0001, "loss": 0.1444, "step": 2056 }, { "epoch": 3.353439843495272, "grad_norm": 0.15703243017196655, "learning_rate": 0.0001, "loss": 0.1547, "step": 2057 }, { "epoch": 3.35507010107597, "grad_norm": 0.14738823473453522, "learning_rate": 0.0001, "loss": 0.1587, "step": 2058 }, { "epoch": 3.356700358656668, "grad_norm": 0.1499371975660324, "learning_rate": 0.0001, "loss": 0.1602, "step": 2059 }, { "epoch": 3.3583306162373656, "grad_norm": 0.15711835026741028, "learning_rate": 0.0001, "loss": 0.1648, "step": 2060 }, { "epoch": 3.3599608738180633, "grad_norm": 0.13238079845905304, "learning_rate": 0.0001, "loss": 0.1369, "step": 2061 }, { "epoch": 3.361591131398761, "grad_norm": 0.13998901844024658, "learning_rate": 0.0001, "loss": 0.1578, "step": 2062 }, { "epoch": 3.3632213889794587, "grad_norm": 0.13403183221817017, "learning_rate": 0.0001, "loss": 0.1491, "step": 2063 }, { "epoch": 3.3648516465601563, "grad_norm": 0.12701204419136047, "learning_rate": 0.0001, "loss": 0.1428, "step": 2064 }, { "epoch": 3.366481904140854, "grad_norm": 0.13955272734165192, "learning_rate": 0.0001, "loss": 0.1618, "step": 2065 }, { "epoch": 3.368112161721552, "grad_norm": 0.1396985650062561, "learning_rate": 0.0001, "loss": 0.1558, "step": 2066 }, { "epoch": 3.36974241930225, "grad_norm": 0.1466887891292572, "learning_rate": 0.0001, "loss": 0.1565, "step": 2067 }, { "epoch": 3.3713726768829475, "grad_norm": 0.148866206407547, "learning_rate": 0.0001, "loss": 0.1525, "step": 2068 }, { "epoch": 3.373002934463645, "grad_norm": 0.16650554537773132, "learning_rate": 0.0001, "loss": 0.1728, "step": 2069 }, { "epoch": 3.374633192044343, "grad_norm": 0.14089016616344452, "learning_rate": 0.0001, "loss": 0.1527, "step": 2070 }, { "epoch": 3.376263449625041, "grad_norm": 0.15425576269626617, "learning_rate": 0.0001, "loss": 0.1612, "step": 2071 }, { "epoch": 3.3778937072057387, "grad_norm": 0.2497289627790451, "learning_rate": 0.0001, "loss": 0.1661, "step": 2072 }, { "epoch": 3.3795239647864364, "grad_norm": 0.15727926790714264, "learning_rate": 0.0001, "loss": 0.1641, "step": 2073 }, { "epoch": 3.381154222367134, "grad_norm": 0.13486291468143463, "learning_rate": 0.0001, "loss": 0.1435, "step": 2074 }, { "epoch": 3.3827844799478317, "grad_norm": 0.13273045420646667, "learning_rate": 0.0001, "loss": 0.1599, "step": 2075 }, { "epoch": 3.3844147375285294, "grad_norm": 0.14401297271251678, "learning_rate": 0.0001, "loss": 0.162, "step": 2076 }, { "epoch": 3.386044995109227, "grad_norm": 0.1639464646577835, "learning_rate": 0.0001, "loss": 0.1749, "step": 2077 }, { "epoch": 3.387675252689925, "grad_norm": 0.14354901015758514, "learning_rate": 0.0001, "loss": 0.1571, "step": 2078 }, { "epoch": 3.389305510270623, "grad_norm": 0.1772812008857727, "learning_rate": 0.0001, "loss": 0.164, "step": 2079 }, { "epoch": 3.3909357678513206, "grad_norm": 0.1357707381248474, "learning_rate": 0.0001, "loss": 0.1572, "step": 2080 }, { "epoch": 3.3925660254320182, "grad_norm": 0.15610982477664948, "learning_rate": 0.0001, "loss": 0.1718, "step": 2081 }, { "epoch": 3.394196283012716, "grad_norm": 0.1462925523519516, "learning_rate": 0.0001, "loss": 0.1458, "step": 2082 }, { "epoch": 3.3958265405934136, "grad_norm": 0.1445331573486328, "learning_rate": 0.0001, "loss": 0.1614, "step": 2083 }, { "epoch": 3.3974567981741117, "grad_norm": 0.13011384010314941, "learning_rate": 0.0001, "loss": 0.1629, "step": 2084 }, { "epoch": 3.3990870557548094, "grad_norm": 0.13652506470680237, "learning_rate": 0.0001, "loss": 0.145, "step": 2085 }, { "epoch": 3.400717313335507, "grad_norm": 0.15374945104122162, "learning_rate": 0.0001, "loss": 0.1607, "step": 2086 }, { "epoch": 3.4023475709162048, "grad_norm": 0.17759419977664948, "learning_rate": 0.0001, "loss": 0.1831, "step": 2087 }, { "epoch": 3.4039778284969024, "grad_norm": 0.13576067984104156, "learning_rate": 0.0001, "loss": 0.1609, "step": 2088 }, { "epoch": 3.4056080860776, "grad_norm": 0.14758118987083435, "learning_rate": 0.0001, "loss": 0.1648, "step": 2089 }, { "epoch": 3.407238343658298, "grad_norm": 0.14185144007205963, "learning_rate": 0.0001, "loss": 0.1553, "step": 2090 }, { "epoch": 3.408868601238996, "grad_norm": 0.15567590296268463, "learning_rate": 0.0001, "loss": 0.162, "step": 2091 }, { "epoch": 3.4104988588196936, "grad_norm": 0.14208033680915833, "learning_rate": 0.0001, "loss": 0.1648, "step": 2092 }, { "epoch": 3.4121291164003913, "grad_norm": 0.150962695479393, "learning_rate": 0.0001, "loss": 0.1659, "step": 2093 }, { "epoch": 3.413759373981089, "grad_norm": 0.18408554792404175, "learning_rate": 0.0001, "loss": 0.1564, "step": 2094 }, { "epoch": 3.4153896315617867, "grad_norm": 0.1329030841588974, "learning_rate": 0.0001, "loss": 0.1434, "step": 2095 }, { "epoch": 3.4170198891424843, "grad_norm": 0.19760195910930634, "learning_rate": 0.0001, "loss": 0.1616, "step": 2096 }, { "epoch": 3.4186501467231825, "grad_norm": 0.13045673072338104, "learning_rate": 0.0001, "loss": 0.1482, "step": 2097 }, { "epoch": 3.42028040430388, "grad_norm": 0.13792401552200317, "learning_rate": 0.0001, "loss": 0.1491, "step": 2098 }, { "epoch": 3.421910661884578, "grad_norm": 0.15530133247375488, "learning_rate": 0.0001, "loss": 0.1569, "step": 2099 }, { "epoch": 3.4235409194652755, "grad_norm": 0.16320976614952087, "learning_rate": 0.0001, "loss": 0.1504, "step": 2100 }, { "epoch": 3.425171177045973, "grad_norm": 0.16057944297790527, "learning_rate": 0.0001, "loss": 0.1575, "step": 2101 }, { "epoch": 3.426801434626671, "grad_norm": 0.16377048194408417, "learning_rate": 0.0001, "loss": 0.1587, "step": 2102 }, { "epoch": 3.4284316922073685, "grad_norm": 0.14860613644123077, "learning_rate": 0.0001, "loss": 0.1666, "step": 2103 }, { "epoch": 3.4300619497880667, "grad_norm": 0.1717861294746399, "learning_rate": 0.0001, "loss": 0.1727, "step": 2104 }, { "epoch": 3.4316922073687643, "grad_norm": 0.18878856301307678, "learning_rate": 0.0001, "loss": 0.1744, "step": 2105 }, { "epoch": 3.433322464949462, "grad_norm": 0.22134870290756226, "learning_rate": 0.0001, "loss": 0.1502, "step": 2106 }, { "epoch": 3.4349527225301597, "grad_norm": 0.12796226143836975, "learning_rate": 0.0001, "loss": 0.1472, "step": 2107 }, { "epoch": 3.4365829801108574, "grad_norm": 0.17766544222831726, "learning_rate": 0.0001, "loss": 0.1631, "step": 2108 }, { "epoch": 3.4382132376915555, "grad_norm": 0.16302259266376495, "learning_rate": 0.0001, "loss": 0.154, "step": 2109 }, { "epoch": 3.439843495272253, "grad_norm": 0.12639376521110535, "learning_rate": 0.0001, "loss": 0.1595, "step": 2110 }, { "epoch": 3.441473752852951, "grad_norm": 0.13385580480098724, "learning_rate": 0.0001, "loss": 0.148, "step": 2111 }, { "epoch": 3.4431040104336486, "grad_norm": 0.16364267468452454, "learning_rate": 0.0001, "loss": 0.1521, "step": 2112 }, { "epoch": 3.4447342680143462, "grad_norm": 0.1307305246591568, "learning_rate": 0.0001, "loss": 0.1623, "step": 2113 }, { "epoch": 3.446364525595044, "grad_norm": 0.1305004358291626, "learning_rate": 0.0001, "loss": 0.1428, "step": 2114 }, { "epoch": 3.4479947831757416, "grad_norm": 0.1550890952348709, "learning_rate": 0.0001, "loss": 0.1521, "step": 2115 }, { "epoch": 3.4496250407564393, "grad_norm": 0.14810912311077118, "learning_rate": 0.0001, "loss": 0.1635, "step": 2116 }, { "epoch": 3.4512552983371374, "grad_norm": 0.15610523521900177, "learning_rate": 0.0001, "loss": 0.1619, "step": 2117 }, { "epoch": 3.452885555917835, "grad_norm": 0.1532841920852661, "learning_rate": 0.0001, "loss": 0.1609, "step": 2118 }, { "epoch": 3.4545158134985328, "grad_norm": 0.14741961658000946, "learning_rate": 0.0001, "loss": 0.167, "step": 2119 }, { "epoch": 3.4561460710792304, "grad_norm": 0.13078215718269348, "learning_rate": 0.0001, "loss": 0.1428, "step": 2120 }, { "epoch": 3.457776328659928, "grad_norm": 0.18994919955730438, "learning_rate": 0.0001, "loss": 0.1548, "step": 2121 }, { "epoch": 3.4594065862406262, "grad_norm": 0.12827709317207336, "learning_rate": 0.0001, "loss": 0.1586, "step": 2122 }, { "epoch": 3.461036843821324, "grad_norm": 0.15168218314647675, "learning_rate": 0.0001, "loss": 0.1641, "step": 2123 }, { "epoch": 3.4626671014020216, "grad_norm": 0.13048698008060455, "learning_rate": 0.0001, "loss": 0.1566, "step": 2124 }, { "epoch": 3.4642973589827193, "grad_norm": 0.13012981414794922, "learning_rate": 0.0001, "loss": 0.1523, "step": 2125 }, { "epoch": 3.465927616563417, "grad_norm": 0.14694590866565704, "learning_rate": 0.0001, "loss": 0.1473, "step": 2126 }, { "epoch": 3.4675578741441146, "grad_norm": 0.13174067437648773, "learning_rate": 0.0001, "loss": 0.1531, "step": 2127 }, { "epoch": 3.4691881317248123, "grad_norm": 0.18024425208568573, "learning_rate": 0.0001, "loss": 0.1624, "step": 2128 }, { "epoch": 3.4708183893055105, "grad_norm": 0.14632080495357513, "learning_rate": 0.0001, "loss": 0.1709, "step": 2129 }, { "epoch": 3.472448646886208, "grad_norm": 0.13840989768505096, "learning_rate": 0.0001, "loss": 0.1576, "step": 2130 }, { "epoch": 3.474078904466906, "grad_norm": 0.1256123036146164, "learning_rate": 0.0001, "loss": 0.1486, "step": 2131 }, { "epoch": 3.4757091620476035, "grad_norm": 0.19333600997924805, "learning_rate": 0.0001, "loss": 0.1526, "step": 2132 }, { "epoch": 3.477339419628301, "grad_norm": 0.1740863025188446, "learning_rate": 0.0001, "loss": 0.162, "step": 2133 }, { "epoch": 3.478969677208999, "grad_norm": 0.17544031143188477, "learning_rate": 0.0001, "loss": 0.1743, "step": 2134 }, { "epoch": 3.480599934789697, "grad_norm": 0.13726355135440826, "learning_rate": 0.0001, "loss": 0.146, "step": 2135 }, { "epoch": 3.4822301923703947, "grad_norm": 0.13387274742126465, "learning_rate": 0.0001, "loss": 0.1444, "step": 2136 }, { "epoch": 3.4838604499510923, "grad_norm": 0.1480950564146042, "learning_rate": 0.0001, "loss": 0.1656, "step": 2137 }, { "epoch": 3.48549070753179, "grad_norm": 0.14710885286331177, "learning_rate": 0.0001, "loss": 0.1501, "step": 2138 }, { "epoch": 3.4871209651124877, "grad_norm": 0.14011640846729279, "learning_rate": 0.0001, "loss": 0.1571, "step": 2139 }, { "epoch": 3.4887512226931854, "grad_norm": 0.15596067905426025, "learning_rate": 0.0001, "loss": 0.1686, "step": 2140 }, { "epoch": 3.490381480273883, "grad_norm": 0.14129222929477692, "learning_rate": 0.0001, "loss": 0.1597, "step": 2141 }, { "epoch": 3.492011737854581, "grad_norm": 0.1319301277399063, "learning_rate": 0.0001, "loss": 0.1619, "step": 2142 }, { "epoch": 3.493641995435279, "grad_norm": 0.16085407137870789, "learning_rate": 0.0001, "loss": 0.1542, "step": 2143 }, { "epoch": 3.4952722530159765, "grad_norm": 0.1328543722629547, "learning_rate": 0.0001, "loss": 0.1501, "step": 2144 }, { "epoch": 3.4969025105966742, "grad_norm": 0.14981688559055328, "learning_rate": 0.0001, "loss": 0.1601, "step": 2145 }, { "epoch": 3.498532768177372, "grad_norm": 0.14664553105831146, "learning_rate": 0.0001, "loss": 0.1583, "step": 2146 }, { "epoch": 3.50016302575807, "grad_norm": 0.13246874511241913, "learning_rate": 0.0001, "loss": 0.1491, "step": 2147 }, { "epoch": 3.5017932833387677, "grad_norm": 0.15695396065711975, "learning_rate": 0.0001, "loss": 0.1793, "step": 2148 }, { "epoch": 3.5034235409194654, "grad_norm": 0.12087570875883102, "learning_rate": 0.0001, "loss": 0.1396, "step": 2149 }, { "epoch": 3.505053798500163, "grad_norm": 0.13022390007972717, "learning_rate": 0.0001, "loss": 0.1577, "step": 2150 }, { "epoch": 3.5066840560808608, "grad_norm": 0.15572933852672577, "learning_rate": 0.0001, "loss": 0.1532, "step": 2151 }, { "epoch": 3.5083143136615584, "grad_norm": 0.17626436054706573, "learning_rate": 0.0001, "loss": 0.1604, "step": 2152 }, { "epoch": 3.509944571242256, "grad_norm": 0.14553359150886536, "learning_rate": 0.0001, "loss": 0.163, "step": 2153 }, { "epoch": 3.511574828822954, "grad_norm": 0.17382773756980896, "learning_rate": 0.0001, "loss": 0.1641, "step": 2154 }, { "epoch": 3.513205086403652, "grad_norm": 0.12575078010559082, "learning_rate": 0.0001, "loss": 0.1493, "step": 2155 }, { "epoch": 3.5148353439843496, "grad_norm": 0.1463012993335724, "learning_rate": 0.0001, "loss": 0.159, "step": 2156 }, { "epoch": 3.5164656015650473, "grad_norm": 0.16834314167499542, "learning_rate": 0.0001, "loss": 0.1659, "step": 2157 }, { "epoch": 3.518095859145745, "grad_norm": 0.15499548614025116, "learning_rate": 0.0001, "loss": 0.1584, "step": 2158 }, { "epoch": 3.5197261167264426, "grad_norm": 0.13314367830753326, "learning_rate": 0.0001, "loss": 0.1515, "step": 2159 }, { "epoch": 3.5213563743071408, "grad_norm": 0.1413407176733017, "learning_rate": 0.0001, "loss": 0.1614, "step": 2160 }, { "epoch": 3.5229866318878384, "grad_norm": 0.14445054531097412, "learning_rate": 0.0001, "loss": 0.1595, "step": 2161 }, { "epoch": 3.524616889468536, "grad_norm": 0.14501655101776123, "learning_rate": 0.0001, "loss": 0.1607, "step": 2162 }, { "epoch": 3.526247147049234, "grad_norm": 0.14677855372428894, "learning_rate": 0.0001, "loss": 0.1588, "step": 2163 }, { "epoch": 3.5278774046299315, "grad_norm": 0.13915497064590454, "learning_rate": 0.0001, "loss": 0.164, "step": 2164 }, { "epoch": 3.529507662210629, "grad_norm": 0.12632952630519867, "learning_rate": 0.0001, "loss": 0.1442, "step": 2165 }, { "epoch": 3.531137919791327, "grad_norm": 0.13267867267131805, "learning_rate": 0.0001, "loss": 0.159, "step": 2166 }, { "epoch": 3.5327681773720245, "grad_norm": 0.12538422644138336, "learning_rate": 0.0001, "loss": 0.1409, "step": 2167 }, { "epoch": 3.5343984349527227, "grad_norm": 0.13974052667617798, "learning_rate": 0.0001, "loss": 0.1461, "step": 2168 }, { "epoch": 3.5360286925334203, "grad_norm": 0.1575179398059845, "learning_rate": 0.0001, "loss": 0.1596, "step": 2169 }, { "epoch": 3.537658950114118, "grad_norm": 0.20956085622310638, "learning_rate": 0.0001, "loss": 0.1583, "step": 2170 }, { "epoch": 3.5392892076948157, "grad_norm": 0.13932162523269653, "learning_rate": 0.0001, "loss": 0.1493, "step": 2171 }, { "epoch": 3.5409194652755134, "grad_norm": 0.16261732578277588, "learning_rate": 0.0001, "loss": 0.1603, "step": 2172 }, { "epoch": 3.5425497228562115, "grad_norm": 0.15178941190242767, "learning_rate": 0.0001, "loss": 0.1601, "step": 2173 }, { "epoch": 3.544179980436909, "grad_norm": 0.17579567432403564, "learning_rate": 0.0001, "loss": 0.1531, "step": 2174 }, { "epoch": 3.545810238017607, "grad_norm": 0.1470760852098465, "learning_rate": 0.0001, "loss": 0.1498, "step": 2175 }, { "epoch": 3.5474404955983045, "grad_norm": 0.1457168012857437, "learning_rate": 0.0001, "loss": 0.1667, "step": 2176 }, { "epoch": 3.5490707531790022, "grad_norm": 0.1363757997751236, "learning_rate": 0.0001, "loss": 0.1538, "step": 2177 }, { "epoch": 3.5507010107597, "grad_norm": 0.15826475620269775, "learning_rate": 0.0001, "loss": 0.1643, "step": 2178 }, { "epoch": 3.5523312683403976, "grad_norm": 0.13432374596595764, "learning_rate": 0.0001, "loss": 0.1487, "step": 2179 }, { "epoch": 3.5539615259210953, "grad_norm": 0.1316751092672348, "learning_rate": 0.0001, "loss": 0.1515, "step": 2180 }, { "epoch": 3.5555917835017934, "grad_norm": 0.14341945946216583, "learning_rate": 0.0001, "loss": 0.1507, "step": 2181 }, { "epoch": 3.557222041082491, "grad_norm": 0.15055926144123077, "learning_rate": 0.0001, "loss": 0.164, "step": 2182 }, { "epoch": 3.5588522986631888, "grad_norm": 0.18501447141170502, "learning_rate": 0.0001, "loss": 0.1526, "step": 2183 }, { "epoch": 3.5604825562438864, "grad_norm": 0.13240905106067657, "learning_rate": 0.0001, "loss": 0.1487, "step": 2184 }, { "epoch": 3.5621128138245846, "grad_norm": 0.15811198949813843, "learning_rate": 0.0001, "loss": 0.1659, "step": 2185 }, { "epoch": 3.5637430714052822, "grad_norm": 0.17472024261951447, "learning_rate": 0.0001, "loss": 0.1605, "step": 2186 }, { "epoch": 3.56537332898598, "grad_norm": 0.1416904181241989, "learning_rate": 0.0001, "loss": 0.1563, "step": 2187 }, { "epoch": 3.5670035865666776, "grad_norm": 0.18020367622375488, "learning_rate": 0.0001, "loss": 0.1733, "step": 2188 }, { "epoch": 3.5686338441473753, "grad_norm": 0.1482088267803192, "learning_rate": 0.0001, "loss": 0.1619, "step": 2189 }, { "epoch": 3.570264101728073, "grad_norm": 0.2286827713251114, "learning_rate": 0.0001, "loss": 0.1745, "step": 2190 }, { "epoch": 3.5718943593087706, "grad_norm": 0.15454155206680298, "learning_rate": 0.0001, "loss": 0.1571, "step": 2191 }, { "epoch": 3.5735246168894683, "grad_norm": 0.15183959901332855, "learning_rate": 0.0001, "loss": 0.1604, "step": 2192 }, { "epoch": 3.5751548744701664, "grad_norm": 0.1492152214050293, "learning_rate": 0.0001, "loss": 0.1735, "step": 2193 }, { "epoch": 3.576785132050864, "grad_norm": 0.15264061093330383, "learning_rate": 0.0001, "loss": 0.1541, "step": 2194 }, { "epoch": 3.578415389631562, "grad_norm": 0.13549228012561798, "learning_rate": 0.0001, "loss": 0.1521, "step": 2195 }, { "epoch": 3.5800456472122595, "grad_norm": 0.1467984914779663, "learning_rate": 0.0001, "loss": 0.1607, "step": 2196 }, { "epoch": 3.581675904792957, "grad_norm": 0.13193894922733307, "learning_rate": 0.0001, "loss": 0.158, "step": 2197 }, { "epoch": 3.5833061623736553, "grad_norm": 0.1380847692489624, "learning_rate": 0.0001, "loss": 0.1567, "step": 2198 }, { "epoch": 3.584936419954353, "grad_norm": 0.12376043945550919, "learning_rate": 0.0001, "loss": 0.1642, "step": 2199 }, { "epoch": 3.5865666775350507, "grad_norm": 0.1520168036222458, "learning_rate": 0.0001, "loss": 0.1668, "step": 2200 }, { "epoch": 3.5881969351157483, "grad_norm": 0.15130695700645447, "learning_rate": 0.0001, "loss": 0.1607, "step": 2201 }, { "epoch": 3.589827192696446, "grad_norm": 0.21414104104042053, "learning_rate": 0.0001, "loss": 0.1748, "step": 2202 }, { "epoch": 3.5914574502771437, "grad_norm": 0.13339370489120483, "learning_rate": 0.0001, "loss": 0.1569, "step": 2203 }, { "epoch": 3.5930877078578414, "grad_norm": 0.14717626571655273, "learning_rate": 0.0001, "loss": 0.1485, "step": 2204 }, { "epoch": 3.594717965438539, "grad_norm": 0.14454171061515808, "learning_rate": 0.0001, "loss": 0.1561, "step": 2205 }, { "epoch": 3.596348223019237, "grad_norm": 0.1337466984987259, "learning_rate": 0.0001, "loss": 0.1548, "step": 2206 }, { "epoch": 3.597978480599935, "grad_norm": 0.13074032962322235, "learning_rate": 0.0001, "loss": 0.1602, "step": 2207 }, { "epoch": 3.5996087381806325, "grad_norm": 0.1627005934715271, "learning_rate": 0.0001, "loss": 0.1582, "step": 2208 }, { "epoch": 3.60123899576133, "grad_norm": 0.16543006896972656, "learning_rate": 0.0001, "loss": 0.1699, "step": 2209 }, { "epoch": 3.602869253342028, "grad_norm": 0.15218278765678406, "learning_rate": 0.0001, "loss": 0.164, "step": 2210 }, { "epoch": 3.604499510922726, "grad_norm": 0.15008680522441864, "learning_rate": 0.0001, "loss": 0.1567, "step": 2211 }, { "epoch": 3.6061297685034237, "grad_norm": 0.1402311772108078, "learning_rate": 0.0001, "loss": 0.149, "step": 2212 }, { "epoch": 3.6077600260841214, "grad_norm": 0.12671758234500885, "learning_rate": 0.0001, "loss": 0.1452, "step": 2213 }, { "epoch": 3.609390283664819, "grad_norm": 0.13454151153564453, "learning_rate": 0.0001, "loss": 0.1565, "step": 2214 }, { "epoch": 3.6110205412455167, "grad_norm": 0.13921399414539337, "learning_rate": 0.0001, "loss": 0.1686, "step": 2215 }, { "epoch": 3.6126507988262144, "grad_norm": 0.15764585137367249, "learning_rate": 0.0001, "loss": 0.1696, "step": 2216 }, { "epoch": 3.614281056406912, "grad_norm": 0.13245287537574768, "learning_rate": 0.0001, "loss": 0.1497, "step": 2217 }, { "epoch": 3.61591131398761, "grad_norm": 0.15295889973640442, "learning_rate": 0.0001, "loss": 0.1548, "step": 2218 }, { "epoch": 3.617541571568308, "grad_norm": 0.1387680172920227, "learning_rate": 0.0001, "loss": 0.1566, "step": 2219 }, { "epoch": 3.6191718291490056, "grad_norm": 0.16183353960514069, "learning_rate": 0.0001, "loss": 0.1713, "step": 2220 }, { "epoch": 3.6208020867297033, "grad_norm": 0.15998132526874542, "learning_rate": 0.0001, "loss": 0.1716, "step": 2221 }, { "epoch": 3.622432344310401, "grad_norm": 0.14948931336402893, "learning_rate": 0.0001, "loss": 0.1558, "step": 2222 }, { "epoch": 3.6240626018910986, "grad_norm": 0.13298356533050537, "learning_rate": 0.0001, "loss": 0.1497, "step": 2223 }, { "epoch": 3.6256928594717968, "grad_norm": 0.1536460816860199, "learning_rate": 0.0001, "loss": 0.1637, "step": 2224 }, { "epoch": 3.6273231170524944, "grad_norm": 0.13881844282150269, "learning_rate": 0.0001, "loss": 0.1637, "step": 2225 }, { "epoch": 3.628953374633192, "grad_norm": 0.14646394550800323, "learning_rate": 0.0001, "loss": 0.1608, "step": 2226 }, { "epoch": 3.63058363221389, "grad_norm": 0.16166602075099945, "learning_rate": 0.0001, "loss": 0.1587, "step": 2227 }, { "epoch": 3.6322138897945875, "grad_norm": 0.1409720927476883, "learning_rate": 0.0001, "loss": 0.1553, "step": 2228 }, { "epoch": 3.633844147375285, "grad_norm": 0.14227084815502167, "learning_rate": 0.0001, "loss": 0.164, "step": 2229 }, { "epoch": 3.635474404955983, "grad_norm": 0.13686221837997437, "learning_rate": 0.0001, "loss": 0.1513, "step": 2230 }, { "epoch": 3.6371046625366805, "grad_norm": 0.1413184404373169, "learning_rate": 0.0001, "loss": 0.1614, "step": 2231 }, { "epoch": 3.6387349201173786, "grad_norm": 0.15510554611682892, "learning_rate": 0.0001, "loss": 0.1584, "step": 2232 }, { "epoch": 3.6403651776980763, "grad_norm": 0.17040590941905975, "learning_rate": 0.0001, "loss": 0.1584, "step": 2233 }, { "epoch": 3.641995435278774, "grad_norm": 0.14824065566062927, "learning_rate": 0.0001, "loss": 0.1626, "step": 2234 }, { "epoch": 3.6436256928594717, "grad_norm": 0.13972316682338715, "learning_rate": 0.0001, "loss": 0.1615, "step": 2235 }, { "epoch": 3.64525595044017, "grad_norm": 0.13162623345851898, "learning_rate": 0.0001, "loss": 0.1767, "step": 2236 }, { "epoch": 3.6468862080208675, "grad_norm": 0.1307874321937561, "learning_rate": 0.0001, "loss": 0.1464, "step": 2237 }, { "epoch": 3.648516465601565, "grad_norm": 0.13576972484588623, "learning_rate": 0.0001, "loss": 0.1629, "step": 2238 }, { "epoch": 3.650146723182263, "grad_norm": 0.17965367436408997, "learning_rate": 0.0001, "loss": 0.1658, "step": 2239 }, { "epoch": 3.6517769807629605, "grad_norm": 0.1429440826177597, "learning_rate": 0.0001, "loss": 0.1532, "step": 2240 }, { "epoch": 3.653407238343658, "grad_norm": 0.1440856158733368, "learning_rate": 0.0001, "loss": 0.1483, "step": 2241 }, { "epoch": 3.655037495924356, "grad_norm": 0.15915343165397644, "learning_rate": 0.0001, "loss": 0.1551, "step": 2242 }, { "epoch": 3.6566677535050536, "grad_norm": 0.16080951690673828, "learning_rate": 0.0001, "loss": 0.1573, "step": 2243 }, { "epoch": 3.6582980110857517, "grad_norm": 0.166608989238739, "learning_rate": 0.0001, "loss": 0.1501, "step": 2244 }, { "epoch": 3.6599282686664494, "grad_norm": 0.16346345841884613, "learning_rate": 0.0001, "loss": 0.1637, "step": 2245 }, { "epoch": 3.661558526247147, "grad_norm": 0.1675470620393753, "learning_rate": 0.0001, "loss": 0.1635, "step": 2246 }, { "epoch": 3.6631887838278447, "grad_norm": 0.15890179574489594, "learning_rate": 0.0001, "loss": 0.1633, "step": 2247 }, { "epoch": 3.6648190414085424, "grad_norm": 0.1385503113269806, "learning_rate": 0.0001, "loss": 0.1517, "step": 2248 }, { "epoch": 3.6664492989892405, "grad_norm": 0.14586155116558075, "learning_rate": 0.0001, "loss": 0.1647, "step": 2249 }, { "epoch": 3.6680795565699382, "grad_norm": 0.15654848515987396, "learning_rate": 0.0001, "loss": 0.1782, "step": 2250 }, { "epoch": 3.669709814150636, "grad_norm": 0.14317664504051208, "learning_rate": 0.0001, "loss": 0.1527, "step": 2251 }, { "epoch": 3.6713400717313336, "grad_norm": 0.1403859406709671, "learning_rate": 0.0001, "loss": 0.1611, "step": 2252 }, { "epoch": 3.6729703293120313, "grad_norm": 0.16431018710136414, "learning_rate": 0.0001, "loss": 0.1633, "step": 2253 }, { "epoch": 3.674600586892729, "grad_norm": 0.14151932299137115, "learning_rate": 0.0001, "loss": 0.1465, "step": 2254 }, { "epoch": 3.6762308444734266, "grad_norm": 0.1487053781747818, "learning_rate": 0.0001, "loss": 0.1595, "step": 2255 }, { "epoch": 3.6778611020541243, "grad_norm": 0.15442033112049103, "learning_rate": 0.0001, "loss": 0.1564, "step": 2256 }, { "epoch": 3.6794913596348224, "grad_norm": 0.13494384288787842, "learning_rate": 0.0001, "loss": 0.1544, "step": 2257 }, { "epoch": 3.68112161721552, "grad_norm": 0.1306290328502655, "learning_rate": 0.0001, "loss": 0.1472, "step": 2258 }, { "epoch": 3.682751874796218, "grad_norm": 0.15780360996723175, "learning_rate": 0.0001, "loss": 0.1579, "step": 2259 }, { "epoch": 3.6843821323769155, "grad_norm": 0.14508086442947388, "learning_rate": 0.0001, "loss": 0.1634, "step": 2260 }, { "epoch": 3.686012389957613, "grad_norm": 0.21266472339630127, "learning_rate": 0.0001, "loss": 0.1639, "step": 2261 }, { "epoch": 3.6876426475383113, "grad_norm": 0.14544187486171722, "learning_rate": 0.0001, "loss": 0.1665, "step": 2262 }, { "epoch": 3.689272905119009, "grad_norm": 0.1321001499891281, "learning_rate": 0.0001, "loss": 0.1571, "step": 2263 }, { "epoch": 3.6909031626997066, "grad_norm": 0.15363967418670654, "learning_rate": 0.0001, "loss": 0.1452, "step": 2264 }, { "epoch": 3.6925334202804043, "grad_norm": 0.149633526802063, "learning_rate": 0.0001, "loss": 0.1603, "step": 2265 }, { "epoch": 3.694163677861102, "grad_norm": 0.15210863947868347, "learning_rate": 0.0001, "loss": 0.1528, "step": 2266 }, { "epoch": 3.6957939354417997, "grad_norm": 0.1460522711277008, "learning_rate": 0.0001, "loss": 0.1634, "step": 2267 }, { "epoch": 3.6974241930224974, "grad_norm": 0.15007948875427246, "learning_rate": 0.0001, "loss": 0.1468, "step": 2268 }, { "epoch": 3.699054450603195, "grad_norm": 0.15326817333698273, "learning_rate": 0.0001, "loss": 0.1586, "step": 2269 }, { "epoch": 3.700684708183893, "grad_norm": 0.14278486371040344, "learning_rate": 0.0001, "loss": 0.1694, "step": 2270 }, { "epoch": 3.702314965764591, "grad_norm": 0.14537227153778076, "learning_rate": 0.0001, "loss": 0.1559, "step": 2271 }, { "epoch": 3.7039452233452885, "grad_norm": 0.1516200751066208, "learning_rate": 0.0001, "loss": 0.1577, "step": 2272 }, { "epoch": 3.705575480925986, "grad_norm": 0.14554426074028015, "learning_rate": 0.0001, "loss": 0.1645, "step": 2273 }, { "epoch": 3.707205738506684, "grad_norm": 0.1325369030237198, "learning_rate": 0.0001, "loss": 0.1614, "step": 2274 }, { "epoch": 3.708835996087382, "grad_norm": 0.1257917732000351, "learning_rate": 0.0001, "loss": 0.1445, "step": 2275 }, { "epoch": 3.7104662536680797, "grad_norm": 0.1275888830423355, "learning_rate": 0.0001, "loss": 0.16, "step": 2276 }, { "epoch": 3.7120965112487774, "grad_norm": 0.14174294471740723, "learning_rate": 0.0001, "loss": 0.1551, "step": 2277 }, { "epoch": 3.713726768829475, "grad_norm": 0.16611164808273315, "learning_rate": 0.0001, "loss": 0.1635, "step": 2278 }, { "epoch": 3.7153570264101727, "grad_norm": 0.1329105794429779, "learning_rate": 0.0001, "loss": 0.153, "step": 2279 }, { "epoch": 3.7169872839908704, "grad_norm": 0.13968057930469513, "learning_rate": 0.0001, "loss": 0.158, "step": 2280 }, { "epoch": 3.718617541571568, "grad_norm": 0.1441374272108078, "learning_rate": 0.0001, "loss": 0.1452, "step": 2281 }, { "epoch": 3.720247799152266, "grad_norm": 0.14315244555473328, "learning_rate": 0.0001, "loss": 0.1561, "step": 2282 }, { "epoch": 3.721878056732964, "grad_norm": 0.12441976368427277, "learning_rate": 0.0001, "loss": 0.1544, "step": 2283 }, { "epoch": 3.7235083143136616, "grad_norm": 0.14892461895942688, "learning_rate": 0.0001, "loss": 0.1622, "step": 2284 }, { "epoch": 3.7251385718943593, "grad_norm": 0.11621859669685364, "learning_rate": 0.0001, "loss": 0.1441, "step": 2285 }, { "epoch": 3.726768829475057, "grad_norm": 0.1611883044242859, "learning_rate": 0.0001, "loss": 0.1531, "step": 2286 }, { "epoch": 3.728399087055755, "grad_norm": 0.15095670521259308, "learning_rate": 0.0001, "loss": 0.1671, "step": 2287 }, { "epoch": 3.7300293446364527, "grad_norm": 0.1477982997894287, "learning_rate": 0.0001, "loss": 0.1529, "step": 2288 }, { "epoch": 3.7316596022171504, "grad_norm": 0.17801976203918457, "learning_rate": 0.0001, "loss": 0.1698, "step": 2289 }, { "epoch": 3.733289859797848, "grad_norm": 0.12449807673692703, "learning_rate": 0.0001, "loss": 0.1411, "step": 2290 }, { "epoch": 3.734920117378546, "grad_norm": 0.14423514902591705, "learning_rate": 0.0001, "loss": 0.1507, "step": 2291 }, { "epoch": 3.7365503749592435, "grad_norm": 0.15219520032405853, "learning_rate": 0.0001, "loss": 0.1746, "step": 2292 }, { "epoch": 3.738180632539941, "grad_norm": 0.13563816249370575, "learning_rate": 0.0001, "loss": 0.1446, "step": 2293 }, { "epoch": 3.739810890120639, "grad_norm": 0.13731209933757782, "learning_rate": 0.0001, "loss": 0.1467, "step": 2294 }, { "epoch": 3.741441147701337, "grad_norm": 0.14237910509109497, "learning_rate": 0.0001, "loss": 0.1635, "step": 2295 }, { "epoch": 3.7430714052820346, "grad_norm": 0.13505598902702332, "learning_rate": 0.0001, "loss": 0.148, "step": 2296 }, { "epoch": 3.7447016628627323, "grad_norm": 0.17628848552703857, "learning_rate": 0.0001, "loss": 0.1539, "step": 2297 }, { "epoch": 3.74633192044343, "grad_norm": 0.14530231058597565, "learning_rate": 0.0001, "loss": 0.1572, "step": 2298 }, { "epoch": 3.7479621780241277, "grad_norm": 0.14856889843940735, "learning_rate": 0.0001, "loss": 0.1623, "step": 2299 }, { "epoch": 3.749592435604826, "grad_norm": 0.170349583029747, "learning_rate": 0.0001, "loss": 0.1598, "step": 2300 }, { "epoch": 3.7512226931855235, "grad_norm": 0.14849968254566193, "learning_rate": 0.0001, "loss": 0.1751, "step": 2301 }, { "epoch": 3.752852950766221, "grad_norm": 0.1442185491323471, "learning_rate": 0.0001, "loss": 0.1454, "step": 2302 }, { "epoch": 3.754483208346919, "grad_norm": 0.14942088723182678, "learning_rate": 0.0001, "loss": 0.151, "step": 2303 }, { "epoch": 3.7561134659276165, "grad_norm": 0.15279610455036163, "learning_rate": 0.0001, "loss": 0.1574, "step": 2304 }, { "epoch": 3.757743723508314, "grad_norm": 0.1706325262784958, "learning_rate": 0.0001, "loss": 0.1537, "step": 2305 }, { "epoch": 3.759373981089012, "grad_norm": 0.15288381278514862, "learning_rate": 0.0001, "loss": 0.1618, "step": 2306 }, { "epoch": 3.7610042386697096, "grad_norm": 0.1183413565158844, "learning_rate": 0.0001, "loss": 0.1478, "step": 2307 }, { "epoch": 3.7626344962504077, "grad_norm": 0.13402055203914642, "learning_rate": 0.0001, "loss": 0.1464, "step": 2308 }, { "epoch": 3.7642647538311054, "grad_norm": 0.14990192651748657, "learning_rate": 0.0001, "loss": 0.1568, "step": 2309 }, { "epoch": 3.765895011411803, "grad_norm": 0.15893998742103577, "learning_rate": 0.0001, "loss": 0.1541, "step": 2310 }, { "epoch": 3.7675252689925007, "grad_norm": 0.15546587109565735, "learning_rate": 0.0001, "loss": 0.1541, "step": 2311 }, { "epoch": 3.7691555265731984, "grad_norm": 0.16592715680599213, "learning_rate": 0.0001, "loss": 0.1632, "step": 2312 }, { "epoch": 3.7707857841538965, "grad_norm": 0.15784431993961334, "learning_rate": 0.0001, "loss": 0.1627, "step": 2313 }, { "epoch": 3.772416041734594, "grad_norm": 0.16422522068023682, "learning_rate": 0.0001, "loss": 0.1589, "step": 2314 }, { "epoch": 3.774046299315292, "grad_norm": 0.14868198335170746, "learning_rate": 0.0001, "loss": 0.168, "step": 2315 }, { "epoch": 3.7756765568959896, "grad_norm": 0.1566770076751709, "learning_rate": 0.0001, "loss": 0.1657, "step": 2316 }, { "epoch": 3.7773068144766873, "grad_norm": 0.16978543996810913, "learning_rate": 0.0001, "loss": 0.1442, "step": 2317 }, { "epoch": 3.778937072057385, "grad_norm": 0.15802276134490967, "learning_rate": 0.0001, "loss": 0.1564, "step": 2318 }, { "epoch": 3.7805673296380826, "grad_norm": 0.15518346428871155, "learning_rate": 0.0001, "loss": 0.156, "step": 2319 }, { "epoch": 3.7821975872187803, "grad_norm": 0.1578715294599533, "learning_rate": 0.0001, "loss": 0.1687, "step": 2320 }, { "epoch": 3.7838278447994784, "grad_norm": 0.14465771615505219, "learning_rate": 0.0001, "loss": 0.1474, "step": 2321 }, { "epoch": 3.785458102380176, "grad_norm": 0.1761205941438675, "learning_rate": 0.0001, "loss": 0.1566, "step": 2322 }, { "epoch": 3.787088359960874, "grad_norm": 0.1283109188079834, "learning_rate": 0.0001, "loss": 0.1473, "step": 2323 }, { "epoch": 3.7887186175415715, "grad_norm": 0.14071565866470337, "learning_rate": 0.0001, "loss": 0.151, "step": 2324 }, { "epoch": 3.7903488751222696, "grad_norm": 0.17781133949756622, "learning_rate": 0.0001, "loss": 0.1635, "step": 2325 }, { "epoch": 3.7919791327029673, "grad_norm": 0.1420576274394989, "learning_rate": 0.0001, "loss": 0.1668, "step": 2326 }, { "epoch": 3.793609390283665, "grad_norm": 0.1406291425228119, "learning_rate": 0.0001, "loss": 0.1454, "step": 2327 }, { "epoch": 3.7952396478643626, "grad_norm": 0.14093580842018127, "learning_rate": 0.0001, "loss": 0.1495, "step": 2328 }, { "epoch": 3.7968699054450603, "grad_norm": 0.12997625768184662, "learning_rate": 0.0001, "loss": 0.1512, "step": 2329 }, { "epoch": 3.798500163025758, "grad_norm": 0.1375696063041687, "learning_rate": 0.0001, "loss": 0.1515, "step": 2330 }, { "epoch": 3.8001304206064557, "grad_norm": 0.15136592090129852, "learning_rate": 0.0001, "loss": 0.1583, "step": 2331 }, { "epoch": 3.8017606781871534, "grad_norm": 0.1388722062110901, "learning_rate": 0.0001, "loss": 0.1516, "step": 2332 }, { "epoch": 3.8033909357678515, "grad_norm": 0.15683171153068542, "learning_rate": 0.0001, "loss": 0.1726, "step": 2333 }, { "epoch": 3.805021193348549, "grad_norm": 0.13569128513336182, "learning_rate": 0.0001, "loss": 0.1585, "step": 2334 }, { "epoch": 3.806651450929247, "grad_norm": 0.14862379431724548, "learning_rate": 0.0001, "loss": 0.1553, "step": 2335 }, { "epoch": 3.8082817085099445, "grad_norm": 0.14512869715690613, "learning_rate": 0.0001, "loss": 0.1635, "step": 2336 }, { "epoch": 3.809911966090642, "grad_norm": 0.13408806920051575, "learning_rate": 0.0001, "loss": 0.1732, "step": 2337 }, { "epoch": 3.8115422236713403, "grad_norm": 0.1384328156709671, "learning_rate": 0.0001, "loss": 0.1631, "step": 2338 }, { "epoch": 3.813172481252038, "grad_norm": 0.1299862563610077, "learning_rate": 0.0001, "loss": 0.16, "step": 2339 }, { "epoch": 3.8148027388327357, "grad_norm": 0.13506759703159332, "learning_rate": 0.0001, "loss": 0.1719, "step": 2340 }, { "epoch": 3.8164329964134334, "grad_norm": 0.1495598405599594, "learning_rate": 0.0001, "loss": 0.1501, "step": 2341 }, { "epoch": 3.818063253994131, "grad_norm": 0.1325817108154297, "learning_rate": 0.0001, "loss": 0.1525, "step": 2342 }, { "epoch": 3.8196935115748287, "grad_norm": 0.13198593258857727, "learning_rate": 0.0001, "loss": 0.1731, "step": 2343 }, { "epoch": 3.8213237691555264, "grad_norm": 0.1505766659975052, "learning_rate": 0.0001, "loss": 0.1608, "step": 2344 }, { "epoch": 3.822954026736224, "grad_norm": 0.13334894180297852, "learning_rate": 0.0001, "loss": 0.1497, "step": 2345 }, { "epoch": 3.824584284316922, "grad_norm": 0.15418575704097748, "learning_rate": 0.0001, "loss": 0.1588, "step": 2346 }, { "epoch": 3.82621454189762, "grad_norm": 0.1457456350326538, "learning_rate": 0.0001, "loss": 0.1482, "step": 2347 }, { "epoch": 3.8278447994783176, "grad_norm": 0.20289213955402374, "learning_rate": 0.0001, "loss": 0.1491, "step": 2348 }, { "epoch": 3.8294750570590153, "grad_norm": 0.15711665153503418, "learning_rate": 0.0001, "loss": 0.16, "step": 2349 }, { "epoch": 3.831105314639713, "grad_norm": 0.17610590159893036, "learning_rate": 0.0001, "loss": 0.1741, "step": 2350 }, { "epoch": 3.832735572220411, "grad_norm": 0.15204782783985138, "learning_rate": 0.0001, "loss": 0.1443, "step": 2351 }, { "epoch": 3.8343658298011087, "grad_norm": 0.18176928162574768, "learning_rate": 0.0001, "loss": 0.1685, "step": 2352 }, { "epoch": 3.8359960873818064, "grad_norm": 0.16083329916000366, "learning_rate": 0.0001, "loss": 0.1564, "step": 2353 }, { "epoch": 3.837626344962504, "grad_norm": 0.1513577103614807, "learning_rate": 0.0001, "loss": 0.1556, "step": 2354 }, { "epoch": 3.839256602543202, "grad_norm": 0.15226048231124878, "learning_rate": 0.0001, "loss": 0.1565, "step": 2355 }, { "epoch": 3.8408868601238995, "grad_norm": 0.16025280952453613, "learning_rate": 0.0001, "loss": 0.1512, "step": 2356 }, { "epoch": 3.842517117704597, "grad_norm": 0.16616910696029663, "learning_rate": 0.0001, "loss": 0.1675, "step": 2357 }, { "epoch": 3.844147375285295, "grad_norm": 0.14858278632164001, "learning_rate": 0.0001, "loss": 0.1332, "step": 2358 }, { "epoch": 3.845777632865993, "grad_norm": 0.15278182923793793, "learning_rate": 0.0001, "loss": 0.154, "step": 2359 }, { "epoch": 3.8474078904466906, "grad_norm": 0.15966159105300903, "learning_rate": 0.0001, "loss": 0.1541, "step": 2360 }, { "epoch": 3.8490381480273883, "grad_norm": 0.12534458935260773, "learning_rate": 0.0001, "loss": 0.1547, "step": 2361 }, { "epoch": 3.850668405608086, "grad_norm": 0.14975754916667938, "learning_rate": 0.0001, "loss": 0.179, "step": 2362 }, { "epoch": 3.8522986631887837, "grad_norm": 0.130054771900177, "learning_rate": 0.0001, "loss": 0.1596, "step": 2363 }, { "epoch": 3.853928920769482, "grad_norm": 0.1336638629436493, "learning_rate": 0.0001, "loss": 0.1566, "step": 2364 }, { "epoch": 3.8555591783501795, "grad_norm": 0.14911198616027832, "learning_rate": 0.0001, "loss": 0.1617, "step": 2365 }, { "epoch": 3.857189435930877, "grad_norm": 0.1424766629934311, "learning_rate": 0.0001, "loss": 0.164, "step": 2366 }, { "epoch": 3.858819693511575, "grad_norm": 0.13621950149536133, "learning_rate": 0.0001, "loss": 0.1584, "step": 2367 }, { "epoch": 3.8604499510922725, "grad_norm": 0.13983507454395294, "learning_rate": 0.0001, "loss": 0.1441, "step": 2368 }, { "epoch": 3.86208020867297, "grad_norm": 0.14439599215984344, "learning_rate": 0.0001, "loss": 0.1568, "step": 2369 }, { "epoch": 3.863710466253668, "grad_norm": 0.14129005372524261, "learning_rate": 0.0001, "loss": 0.1675, "step": 2370 }, { "epoch": 3.8653407238343656, "grad_norm": 0.14672988653182983, "learning_rate": 0.0001, "loss": 0.1706, "step": 2371 }, { "epoch": 3.8669709814150637, "grad_norm": 0.14792685210704803, "learning_rate": 0.0001, "loss": 0.1692, "step": 2372 }, { "epoch": 3.8686012389957614, "grad_norm": 0.1503150761127472, "learning_rate": 0.0001, "loss": 0.1619, "step": 2373 }, { "epoch": 3.870231496576459, "grad_norm": 0.15507511794567108, "learning_rate": 0.0001, "loss": 0.1681, "step": 2374 }, { "epoch": 3.8718617541571567, "grad_norm": 0.1679387390613556, "learning_rate": 0.0001, "loss": 0.1451, "step": 2375 }, { "epoch": 3.873492011737855, "grad_norm": 0.15510492026805878, "learning_rate": 0.0001, "loss": 0.1719, "step": 2376 }, { "epoch": 3.8751222693185525, "grad_norm": 0.13814286887645721, "learning_rate": 0.0001, "loss": 0.1529, "step": 2377 }, { "epoch": 3.87675252689925, "grad_norm": 0.12971526384353638, "learning_rate": 0.0001, "loss": 0.1413, "step": 2378 }, { "epoch": 3.878382784479948, "grad_norm": 0.13721339404582977, "learning_rate": 0.0001, "loss": 0.1527, "step": 2379 }, { "epoch": 3.8800130420606456, "grad_norm": 0.1391274333000183, "learning_rate": 0.0001, "loss": 0.1544, "step": 2380 }, { "epoch": 3.8816432996413432, "grad_norm": 0.12956668436527252, "learning_rate": 0.0001, "loss": 0.1391, "step": 2381 }, { "epoch": 3.883273557222041, "grad_norm": 0.12296263873577118, "learning_rate": 0.0001, "loss": 0.1668, "step": 2382 }, { "epoch": 3.8849038148027386, "grad_norm": 0.14773519337177277, "learning_rate": 0.0001, "loss": 0.1704, "step": 2383 }, { "epoch": 3.8865340723834367, "grad_norm": 0.14175285398960114, "learning_rate": 0.0001, "loss": 0.1571, "step": 2384 }, { "epoch": 3.8881643299641344, "grad_norm": 0.1267097443342209, "learning_rate": 0.0001, "loss": 0.1431, "step": 2385 }, { "epoch": 3.889794587544832, "grad_norm": 0.29268741607666016, "learning_rate": 0.0001, "loss": 0.155, "step": 2386 }, { "epoch": 3.8914248451255298, "grad_norm": 0.13753896951675415, "learning_rate": 0.0001, "loss": 0.1561, "step": 2387 }, { "epoch": 3.8930551027062275, "grad_norm": 0.1538880318403244, "learning_rate": 0.0001, "loss": 0.1545, "step": 2388 }, { "epoch": 3.8946853602869256, "grad_norm": 0.15996767580509186, "learning_rate": 0.0001, "loss": 0.1551, "step": 2389 }, { "epoch": 3.8963156178676233, "grad_norm": 0.1439359188079834, "learning_rate": 0.0001, "loss": 0.1504, "step": 2390 }, { "epoch": 3.897945875448321, "grad_norm": 0.23938068747520447, "learning_rate": 0.0001, "loss": 0.1591, "step": 2391 }, { "epoch": 3.8995761330290186, "grad_norm": 0.1339603215456009, "learning_rate": 0.0001, "loss": 0.1541, "step": 2392 }, { "epoch": 3.9012063906097163, "grad_norm": 0.146190345287323, "learning_rate": 0.0001, "loss": 0.1639, "step": 2393 }, { "epoch": 3.902836648190414, "grad_norm": 0.14381268620491028, "learning_rate": 0.0001, "loss": 0.1553, "step": 2394 }, { "epoch": 3.9044669057711117, "grad_norm": 0.1367824673652649, "learning_rate": 0.0001, "loss": 0.1551, "step": 2395 }, { "epoch": 3.9060971633518093, "grad_norm": 0.15872205793857574, "learning_rate": 0.0001, "loss": 0.1591, "step": 2396 }, { "epoch": 3.9077274209325075, "grad_norm": 0.12379555404186249, "learning_rate": 0.0001, "loss": 0.1388, "step": 2397 }, { "epoch": 3.909357678513205, "grad_norm": 0.14201846718788147, "learning_rate": 0.0001, "loss": 0.1549, "step": 2398 }, { "epoch": 3.910987936093903, "grad_norm": 0.1701783388853073, "learning_rate": 0.0001, "loss": 0.1743, "step": 2399 }, { "epoch": 3.9126181936746005, "grad_norm": 0.13417696952819824, "learning_rate": 0.0001, "loss": 0.1526, "step": 2400 }, { "epoch": 3.914248451255298, "grad_norm": 0.1509416103363037, "learning_rate": 0.0001, "loss": 0.1607, "step": 2401 }, { "epoch": 3.9158787088359963, "grad_norm": 0.1474318504333496, "learning_rate": 0.0001, "loss": 0.1509, "step": 2402 }, { "epoch": 3.917508966416694, "grad_norm": 0.18277962505817413, "learning_rate": 0.0001, "loss": 0.1673, "step": 2403 }, { "epoch": 3.9191392239973917, "grad_norm": 0.14905720949172974, "learning_rate": 0.0001, "loss": 0.1513, "step": 2404 }, { "epoch": 3.9207694815780894, "grad_norm": 0.13981392979621887, "learning_rate": 0.0001, "loss": 0.1615, "step": 2405 }, { "epoch": 3.922399739158787, "grad_norm": 0.13442060351371765, "learning_rate": 0.0001, "loss": 0.1514, "step": 2406 }, { "epoch": 3.9240299967394847, "grad_norm": 0.13714581727981567, "learning_rate": 0.0001, "loss": 0.1563, "step": 2407 }, { "epoch": 3.9256602543201824, "grad_norm": 0.15870222449302673, "learning_rate": 0.0001, "loss": 0.166, "step": 2408 }, { "epoch": 3.92729051190088, "grad_norm": 0.14761915802955627, "learning_rate": 0.0001, "loss": 0.1587, "step": 2409 }, { "epoch": 3.928920769481578, "grad_norm": 0.13917234539985657, "learning_rate": 0.0001, "loss": 0.1609, "step": 2410 }, { "epoch": 3.930551027062276, "grad_norm": 0.13690310716629028, "learning_rate": 0.0001, "loss": 0.1478, "step": 2411 }, { "epoch": 3.9321812846429736, "grad_norm": 0.1417568325996399, "learning_rate": 0.0001, "loss": 0.1648, "step": 2412 }, { "epoch": 3.9338115422236712, "grad_norm": 0.13829518854618073, "learning_rate": 0.0001, "loss": 0.1589, "step": 2413 }, { "epoch": 3.9354417998043694, "grad_norm": 0.11972015351057053, "learning_rate": 0.0001, "loss": 0.125, "step": 2414 }, { "epoch": 3.937072057385067, "grad_norm": 0.13998998701572418, "learning_rate": 0.0001, "loss": 0.1721, "step": 2415 }, { "epoch": 3.9387023149657647, "grad_norm": 0.13850805163383484, "learning_rate": 0.0001, "loss": 0.1451, "step": 2416 }, { "epoch": 3.9403325725464624, "grad_norm": 0.13838806748390198, "learning_rate": 0.0001, "loss": 0.1509, "step": 2417 }, { "epoch": 3.94196283012716, "grad_norm": 0.13861410319805145, "learning_rate": 0.0001, "loss": 0.1512, "step": 2418 }, { "epoch": 3.9435930877078578, "grad_norm": 0.14857284724712372, "learning_rate": 0.0001, "loss": 0.145, "step": 2419 }, { "epoch": 3.9452233452885554, "grad_norm": 0.13676993548870087, "learning_rate": 0.0001, "loss": 0.1464, "step": 2420 }, { "epoch": 3.946853602869253, "grad_norm": 0.1672215759754181, "learning_rate": 0.0001, "loss": 0.1506, "step": 2421 }, { "epoch": 3.9484838604499513, "grad_norm": 0.1676868051290512, "learning_rate": 0.0001, "loss": 0.1714, "step": 2422 }, { "epoch": 3.950114118030649, "grad_norm": 0.15942254662513733, "learning_rate": 0.0001, "loss": 0.1543, "step": 2423 }, { "epoch": 3.9517443756113466, "grad_norm": 0.1640825718641281, "learning_rate": 0.0001, "loss": 0.1555, "step": 2424 }, { "epoch": 3.9533746331920443, "grad_norm": 0.13515083491802216, "learning_rate": 0.0001, "loss": 0.1591, "step": 2425 }, { "epoch": 3.955004890772742, "grad_norm": 0.13103432953357697, "learning_rate": 0.0001, "loss": 0.1465, "step": 2426 }, { "epoch": 3.95663514835344, "grad_norm": 0.13040922582149506, "learning_rate": 0.0001, "loss": 0.1504, "step": 2427 }, { "epoch": 3.958265405934138, "grad_norm": 0.14233194291591644, "learning_rate": 0.0001, "loss": 0.1623, "step": 2428 }, { "epoch": 3.9598956635148355, "grad_norm": 0.14986193180084229, "learning_rate": 0.0001, "loss": 0.154, "step": 2429 }, { "epoch": 3.961525921095533, "grad_norm": 0.14771242439746857, "learning_rate": 0.0001, "loss": 0.1595, "step": 2430 }, { "epoch": 3.963156178676231, "grad_norm": 0.12477671355009079, "learning_rate": 0.0001, "loss": 0.1552, "step": 2431 }, { "epoch": 3.9647864362569285, "grad_norm": 0.12812167406082153, "learning_rate": 0.0001, "loss": 0.1576, "step": 2432 }, { "epoch": 3.966416693837626, "grad_norm": 0.12061511725187302, "learning_rate": 0.0001, "loss": 0.1474, "step": 2433 }, { "epoch": 3.968046951418324, "grad_norm": 0.15570174157619476, "learning_rate": 0.0001, "loss": 0.1528, "step": 2434 }, { "epoch": 3.969677208999022, "grad_norm": 0.12087506800889969, "learning_rate": 0.0001, "loss": 0.1376, "step": 2435 }, { "epoch": 3.9713074665797197, "grad_norm": 0.16076244413852692, "learning_rate": 0.0001, "loss": 0.159, "step": 2436 }, { "epoch": 3.9729377241604173, "grad_norm": 0.17178180813789368, "learning_rate": 0.0001, "loss": 0.1613, "step": 2437 }, { "epoch": 3.974567981741115, "grad_norm": 0.15928059816360474, "learning_rate": 0.0001, "loss": 0.1714, "step": 2438 }, { "epoch": 3.9761982393218127, "grad_norm": 0.14910367131233215, "learning_rate": 0.0001, "loss": 0.1522, "step": 2439 }, { "epoch": 3.977828496902511, "grad_norm": 0.158916637301445, "learning_rate": 0.0001, "loss": 0.1715, "step": 2440 }, { "epoch": 3.9794587544832085, "grad_norm": 0.13345277309417725, "learning_rate": 0.0001, "loss": 0.1463, "step": 2441 }, { "epoch": 3.981089012063906, "grad_norm": 0.1387554258108139, "learning_rate": 0.0001, "loss": 0.1705, "step": 2442 }, { "epoch": 3.982719269644604, "grad_norm": 0.1317049264907837, "learning_rate": 0.0001, "loss": 0.1444, "step": 2443 }, { "epoch": 3.9843495272253016, "grad_norm": 0.1379990130662918, "learning_rate": 0.0001, "loss": 0.1636, "step": 2444 }, { "epoch": 3.9859797848059992, "grad_norm": 0.14041666686534882, "learning_rate": 0.0001, "loss": 0.1543, "step": 2445 }, { "epoch": 3.987610042386697, "grad_norm": 0.1239699274301529, "learning_rate": 0.0001, "loss": 0.1477, "step": 2446 }, { "epoch": 3.9892402999673946, "grad_norm": 0.16149182617664337, "learning_rate": 0.0001, "loss": 0.1662, "step": 2447 }, { "epoch": 3.9908705575480927, "grad_norm": 0.13410444557666779, "learning_rate": 0.0001, "loss": 0.1662, "step": 2448 }, { "epoch": 3.9925008151287904, "grad_norm": 0.13568072021007538, "learning_rate": 0.0001, "loss": 0.1534, "step": 2449 }, { "epoch": 3.994131072709488, "grad_norm": 0.15375611186027527, "learning_rate": 0.0001, "loss": 0.1758, "step": 2450 }, { "epoch": 3.9957613302901858, "grad_norm": 0.13414491713047028, "learning_rate": 0.0001, "loss": 0.1674, "step": 2451 }, { "epoch": 3.9973915878708834, "grad_norm": 0.15805280208587646, "learning_rate": 0.0001, "loss": 0.1665, "step": 2452 }, { "epoch": 3.9973915878708834, "step": 2452, "total_flos": 7.122222383569568e+18, "train_loss": 0.188311614755535, "train_runtime": 156391.4769, "train_samples_per_second": 0.471, "train_steps_per_second": 0.016 } ], "logging_steps": 1.0, "max_steps": 2452, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "total_flos": 7.122222383569568e+18, "train_batch_size": 6, "trial_name": null, "trial_params": null }