{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.99992916341999, "eval_steps": 500, "global_step": 3529, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 5.6118285147712355, "learning_rate": 1.8867924528301887e-08, "loss": 0.846, "step": 1 }, { "epoch": 0.0, "grad_norm": 5.713571723217765, "learning_rate": 3.7735849056603774e-08, "loss": 0.813, "step": 2 }, { "epoch": 0.0, "grad_norm": 6.189090904191137, "learning_rate": 5.660377358490566e-08, "loss": 0.8901, "step": 3 }, { "epoch": 0.0, "grad_norm": 7.135987059921496, "learning_rate": 7.547169811320755e-08, "loss": 0.9084, "step": 4 }, { "epoch": 0.0, "grad_norm": 6.9487575503764125, "learning_rate": 9.433962264150943e-08, "loss": 0.868, "step": 5 }, { "epoch": 0.0, "grad_norm": 6.531405547529674, "learning_rate": 1.1320754716981131e-07, "loss": 0.8339, "step": 6 }, { "epoch": 0.0, "grad_norm": 5.794428405177802, "learning_rate": 1.320754716981132e-07, "loss": 0.8867, "step": 7 }, { "epoch": 0.0, "grad_norm": 6.383676757195207, "learning_rate": 1.509433962264151e-07, "loss": 0.8464, "step": 8 }, { "epoch": 0.0, "grad_norm": 6.3067158349421915, "learning_rate": 1.6981132075471695e-07, "loss": 0.876, "step": 9 }, { "epoch": 0.0, "grad_norm": 5.879065954925436, "learning_rate": 1.8867924528301886e-07, "loss": 0.8613, "step": 10 }, { "epoch": 0.0, "grad_norm": 7.289965054557051, "learning_rate": 2.0754716981132074e-07, "loss": 0.8581, "step": 11 }, { "epoch": 0.0, "grad_norm": 6.250628393462324, "learning_rate": 2.2641509433962263e-07, "loss": 0.8659, "step": 12 }, { "epoch": 0.0, "grad_norm": 4.897669106714332, "learning_rate": 2.452830188679245e-07, "loss": 0.8234, "step": 13 }, { "epoch": 0.0, "grad_norm": 6.899340907296569, "learning_rate": 2.641509433962264e-07, "loss": 0.9095, "step": 14 }, { "epoch": 0.0, "grad_norm": 6.991040839353428, "learning_rate": 2.830188679245283e-07, "loss": 0.8476, "step": 15 }, { "epoch": 0.0, "grad_norm": 5.18007982067284, "learning_rate": 3.018867924528302e-07, "loss": 0.7656, "step": 16 }, { "epoch": 0.0, "grad_norm": 4.900634108194426, "learning_rate": 3.2075471698113204e-07, "loss": 0.797, "step": 17 }, { "epoch": 0.01, "grad_norm": 5.70859024316582, "learning_rate": 3.396226415094339e-07, "loss": 0.8428, "step": 18 }, { "epoch": 0.01, "grad_norm": 5.980094728579612, "learning_rate": 3.584905660377358e-07, "loss": 0.8573, "step": 19 }, { "epoch": 0.01, "grad_norm": 5.852389477273713, "learning_rate": 3.773584905660377e-07, "loss": 0.7921, "step": 20 }, { "epoch": 0.01, "grad_norm": 4.661373849976143, "learning_rate": 3.9622641509433963e-07, "loss": 0.718, "step": 21 }, { "epoch": 0.01, "grad_norm": 6.23274723855891, "learning_rate": 4.150943396226415e-07, "loss": 0.8623, "step": 22 }, { "epoch": 0.01, "grad_norm": 6.0704700228127635, "learning_rate": 4.339622641509434e-07, "loss": 0.8782, "step": 23 }, { "epoch": 0.01, "grad_norm": 5.700893588615956, "learning_rate": 4.5283018867924526e-07, "loss": 0.8101, "step": 24 }, { "epoch": 0.01, "grad_norm": 4.998135432510591, "learning_rate": 4.7169811320754717e-07, "loss": 0.7511, "step": 25 }, { "epoch": 0.01, "grad_norm": 5.217486426924381, "learning_rate": 4.90566037735849e-07, "loss": 0.8268, "step": 26 }, { "epoch": 0.01, "grad_norm": 5.0541443638950305, "learning_rate": 5.094339622641509e-07, "loss": 0.7494, "step": 27 }, { "epoch": 0.01, "grad_norm": 5.205266954508169, "learning_rate": 5.283018867924528e-07, "loss": 0.8054, "step": 28 }, { "epoch": 0.01, "grad_norm": 11.080559809465251, "learning_rate": 5.471698113207546e-07, "loss": 0.7088, "step": 29 }, { "epoch": 0.01, "grad_norm": 5.230197181619572, "learning_rate": 5.660377358490566e-07, "loss": 0.7552, "step": 30 }, { "epoch": 0.01, "grad_norm": 4.988538877448871, "learning_rate": 5.849056603773585e-07, "loss": 0.7455, "step": 31 }, { "epoch": 0.01, "grad_norm": 4.321562944292788, "learning_rate": 6.037735849056604e-07, "loss": 0.7366, "step": 32 }, { "epoch": 0.01, "grad_norm": 4.520956670013252, "learning_rate": 6.226415094339622e-07, "loss": 0.7305, "step": 33 }, { "epoch": 0.01, "grad_norm": 4.677218814857416, "learning_rate": 6.415094339622641e-07, "loss": 0.6962, "step": 34 }, { "epoch": 0.01, "grad_norm": 4.393544219908874, "learning_rate": 6.60377358490566e-07, "loss": 0.7401, "step": 35 }, { "epoch": 0.01, "grad_norm": 4.64954547771705, "learning_rate": 6.792452830188678e-07, "loss": 0.706, "step": 36 }, { "epoch": 0.01, "grad_norm": 4.274230930931085, "learning_rate": 6.981132075471697e-07, "loss": 0.7262, "step": 37 }, { "epoch": 0.01, "grad_norm": 3.907454657653952, "learning_rate": 7.169811320754716e-07, "loss": 0.6775, "step": 38 }, { "epoch": 0.01, "grad_norm": 4.169754686936093, "learning_rate": 7.358490566037735e-07, "loss": 0.6674, "step": 39 }, { "epoch": 0.01, "grad_norm": 4.464499900362431, "learning_rate": 7.547169811320754e-07, "loss": 0.6806, "step": 40 }, { "epoch": 0.01, "grad_norm": 4.097582890478711, "learning_rate": 7.735849056603774e-07, "loss": 0.7274, "step": 41 }, { "epoch": 0.01, "grad_norm": 3.697964008533405, "learning_rate": 7.924528301886793e-07, "loss": 0.6486, "step": 42 }, { "epoch": 0.01, "grad_norm": 3.7930403767612777, "learning_rate": 8.113207547169812e-07, "loss": 0.6157, "step": 43 }, { "epoch": 0.01, "grad_norm": 3.6371272191211172, "learning_rate": 8.30188679245283e-07, "loss": 0.5929, "step": 44 }, { "epoch": 0.01, "grad_norm": 3.801258780572625, "learning_rate": 8.490566037735849e-07, "loss": 0.5322, "step": 45 }, { "epoch": 0.01, "grad_norm": 3.301139979984435, "learning_rate": 8.679245283018868e-07, "loss": 0.5699, "step": 46 }, { "epoch": 0.01, "grad_norm": 3.498909620163726, "learning_rate": 8.867924528301887e-07, "loss": 0.5538, "step": 47 }, { "epoch": 0.01, "grad_norm": 3.994048255712527, "learning_rate": 9.056603773584905e-07, "loss": 0.5087, "step": 48 }, { "epoch": 0.01, "grad_norm": 3.5927147782012816, "learning_rate": 9.245283018867924e-07, "loss": 0.6009, "step": 49 }, { "epoch": 0.01, "grad_norm": 4.500724906785562, "learning_rate": 9.433962264150943e-07, "loss": 0.5618, "step": 50 }, { "epoch": 0.01, "grad_norm": 3.356984138931124, "learning_rate": 9.622641509433961e-07, "loss": 0.5287, "step": 51 }, { "epoch": 0.01, "grad_norm": 3.3735668806204133, "learning_rate": 9.81132075471698e-07, "loss": 0.5447, "step": 52 }, { "epoch": 0.02, "grad_norm": 3.3326495713070416, "learning_rate": 1e-06, "loss": 0.5317, "step": 53 }, { "epoch": 0.02, "grad_norm": 3.489263896611546, "learning_rate": 1.0188679245283019e-06, "loss": 0.5549, "step": 54 }, { "epoch": 0.02, "grad_norm": 3.1415448527151857, "learning_rate": 1.0377358490566038e-06, "loss": 0.5329, "step": 55 }, { "epoch": 0.02, "grad_norm": 3.1401388087716526, "learning_rate": 1.0566037735849057e-06, "loss": 0.5074, "step": 56 }, { "epoch": 0.02, "grad_norm": 3.2267416931558985, "learning_rate": 1.0754716981132074e-06, "loss": 0.5431, "step": 57 }, { "epoch": 0.02, "grad_norm": 2.978898769392214, "learning_rate": 1.0943396226415093e-06, "loss": 0.5433, "step": 58 }, { "epoch": 0.02, "grad_norm": 3.0011201623997574, "learning_rate": 1.1132075471698112e-06, "loss": 0.5155, "step": 59 }, { "epoch": 0.02, "grad_norm": 3.0273145674095665, "learning_rate": 1.1320754716981131e-06, "loss": 0.4837, "step": 60 }, { "epoch": 0.02, "grad_norm": 4.213734749879202, "learning_rate": 1.150943396226415e-06, "loss": 0.5608, "step": 61 }, { "epoch": 0.02, "grad_norm": 5.958909244082287, "learning_rate": 1.169811320754717e-06, "loss": 0.4858, "step": 62 }, { "epoch": 0.02, "grad_norm": 2.8152544386137444, "learning_rate": 1.1886792452830188e-06, "loss": 0.5165, "step": 63 }, { "epoch": 0.02, "grad_norm": 3.6384403894331143, "learning_rate": 1.2075471698113208e-06, "loss": 0.5016, "step": 64 }, { "epoch": 0.02, "grad_norm": 2.87730885494883, "learning_rate": 1.2264150943396225e-06, "loss": 0.4738, "step": 65 }, { "epoch": 0.02, "grad_norm": 3.113820837217724, "learning_rate": 1.2452830188679244e-06, "loss": 0.4554, "step": 66 }, { "epoch": 0.02, "grad_norm": 2.7669377573492886, "learning_rate": 1.2641509433962263e-06, "loss": 0.466, "step": 67 }, { "epoch": 0.02, "grad_norm": 3.2823683218193014, "learning_rate": 1.2830188679245282e-06, "loss": 0.4613, "step": 68 }, { "epoch": 0.02, "grad_norm": 4.673456593571675, "learning_rate": 1.30188679245283e-06, "loss": 0.4792, "step": 69 }, { "epoch": 0.02, "grad_norm": 2.792640744843451, "learning_rate": 1.320754716981132e-06, "loss": 0.4595, "step": 70 }, { "epoch": 0.02, "grad_norm": 3.8347185566760507, "learning_rate": 1.339622641509434e-06, "loss": 0.4462, "step": 71 }, { "epoch": 0.02, "grad_norm": 3.089668904583872, "learning_rate": 1.3584905660377356e-06, "loss": 0.5026, "step": 72 }, { "epoch": 0.02, "grad_norm": 3.2532898025993413, "learning_rate": 1.3773584905660375e-06, "loss": 0.4678, "step": 73 }, { "epoch": 0.02, "grad_norm": 3.9484661367267835, "learning_rate": 1.3962264150943394e-06, "loss": 0.488, "step": 74 }, { "epoch": 0.02, "grad_norm": 3.1818768846061185, "learning_rate": 1.4150943396226413e-06, "loss": 0.5166, "step": 75 }, { "epoch": 0.02, "grad_norm": 3.1426225216317043, "learning_rate": 1.4339622641509432e-06, "loss": 0.4996, "step": 76 }, { "epoch": 0.02, "grad_norm": 2.9238328814899393, "learning_rate": 1.4528301886792452e-06, "loss": 0.4289, "step": 77 }, { "epoch": 0.02, "grad_norm": 2.8360366336829568, "learning_rate": 1.471698113207547e-06, "loss": 0.4543, "step": 78 }, { "epoch": 0.02, "grad_norm": 3.6249196520915064, "learning_rate": 1.490566037735849e-06, "loss": 0.5045, "step": 79 }, { "epoch": 0.02, "grad_norm": 5.299329223688592, "learning_rate": 1.5094339622641509e-06, "loss": 0.4843, "step": 80 }, { "epoch": 0.02, "grad_norm": 2.787829361897964, "learning_rate": 1.5283018867924528e-06, "loss": 0.4455, "step": 81 }, { "epoch": 0.02, "grad_norm": 2.7277354760339874, "learning_rate": 1.5471698113207547e-06, "loss": 0.4194, "step": 82 }, { "epoch": 0.02, "grad_norm": 2.6600416644163922, "learning_rate": 1.5660377358490566e-06, "loss": 0.4048, "step": 83 }, { "epoch": 0.02, "grad_norm": 3.1774992527477286, "learning_rate": 1.5849056603773585e-06, "loss": 0.4497, "step": 84 }, { "epoch": 0.02, "grad_norm": 2.9439067645554107, "learning_rate": 1.6037735849056604e-06, "loss": 0.474, "step": 85 }, { "epoch": 0.02, "grad_norm": 2.8642919262683084, "learning_rate": 1.6226415094339623e-06, "loss": 0.4249, "step": 86 }, { "epoch": 0.02, "grad_norm": 2.774887911245674, "learning_rate": 1.6415094339622643e-06, "loss": 0.4294, "step": 87 }, { "epoch": 0.02, "grad_norm": 2.9695757117046395, "learning_rate": 1.660377358490566e-06, "loss": 0.4764, "step": 88 }, { "epoch": 0.03, "grad_norm": 2.945368803651741, "learning_rate": 1.6792452830188679e-06, "loss": 0.4517, "step": 89 }, { "epoch": 0.03, "grad_norm": 2.857389843927476, "learning_rate": 1.6981132075471698e-06, "loss": 0.4651, "step": 90 }, { "epoch": 0.03, "grad_norm": 3.0764711362725805, "learning_rate": 1.7169811320754717e-06, "loss": 0.4735, "step": 91 }, { "epoch": 0.03, "grad_norm": 2.887559820554778, "learning_rate": 1.7358490566037736e-06, "loss": 0.408, "step": 92 }, { "epoch": 0.03, "grad_norm": 3.7339438704382357, "learning_rate": 1.7547169811320755e-06, "loss": 0.4428, "step": 93 }, { "epoch": 0.03, "grad_norm": 2.725029573154123, "learning_rate": 1.7735849056603774e-06, "loss": 0.4066, "step": 94 }, { "epoch": 0.03, "grad_norm": 2.785369107959244, "learning_rate": 1.7924528301886791e-06, "loss": 0.4325, "step": 95 }, { "epoch": 0.03, "grad_norm": 2.693163705078504, "learning_rate": 1.811320754716981e-06, "loss": 0.4303, "step": 96 }, { "epoch": 0.03, "grad_norm": 2.815163383579835, "learning_rate": 1.830188679245283e-06, "loss": 0.409, "step": 97 }, { "epoch": 0.03, "grad_norm": 3.1482120663466073, "learning_rate": 1.8490566037735848e-06, "loss": 0.4413, "step": 98 }, { "epoch": 0.03, "grad_norm": 2.7451340037459535, "learning_rate": 1.8679245283018868e-06, "loss": 0.3909, "step": 99 }, { "epoch": 0.03, "grad_norm": 2.668547433739357, "learning_rate": 1.8867924528301887e-06, "loss": 0.4245, "step": 100 }, { "epoch": 0.03, "grad_norm": 3.4097066689240445, "learning_rate": 1.9056603773584906e-06, "loss": 0.4171, "step": 101 }, { "epoch": 0.03, "grad_norm": 2.7500799535422535, "learning_rate": 1.9245283018867923e-06, "loss": 0.4316, "step": 102 }, { "epoch": 0.03, "grad_norm": 2.772565736820654, "learning_rate": 1.9433962264150944e-06, "loss": 0.4308, "step": 103 }, { "epoch": 0.03, "grad_norm": 2.9388045054687497, "learning_rate": 1.962264150943396e-06, "loss": 0.3946, "step": 104 }, { "epoch": 0.03, "grad_norm": 2.6971415729142567, "learning_rate": 1.981132075471698e-06, "loss": 0.3971, "step": 105 }, { "epoch": 0.03, "grad_norm": 2.565759147084175, "learning_rate": 2e-06, "loss": 0.4335, "step": 106 }, { "epoch": 0.03, "grad_norm": 2.6533331797170074, "learning_rate": 1.9999995788314622e-06, "loss": 0.4027, "step": 107 }, { "epoch": 0.03, "grad_norm": 2.6400141953211516, "learning_rate": 1.9999983153262037e-06, "loss": 0.3566, "step": 108 }, { "epoch": 0.03, "grad_norm": 2.869629191400056, "learning_rate": 1.9999962094852885e-06, "loss": 0.4321, "step": 109 }, { "epoch": 0.03, "grad_norm": 3.1890954493242303, "learning_rate": 1.999993261310491e-06, "loss": 0.4243, "step": 110 }, { "epoch": 0.03, "grad_norm": 2.771919519749318, "learning_rate": 1.999989470804294e-06, "loss": 0.4053, "step": 111 }, { "epoch": 0.03, "grad_norm": 3.3182106784918077, "learning_rate": 1.9999848379698903e-06, "loss": 0.4299, "step": 112 }, { "epoch": 0.03, "grad_norm": 2.809056771173883, "learning_rate": 1.999979362811183e-06, "loss": 0.4323, "step": 113 }, { "epoch": 0.03, "grad_norm": 2.8114945027835807, "learning_rate": 1.9999730453327834e-06, "loss": 0.4382, "step": 114 }, { "epoch": 0.03, "grad_norm": 2.7106485289513396, "learning_rate": 1.9999658855400133e-06, "loss": 0.4195, "step": 115 }, { "epoch": 0.03, "grad_norm": 3.0700900731543523, "learning_rate": 1.9999578834389034e-06, "loss": 0.4007, "step": 116 }, { "epoch": 0.03, "grad_norm": 2.581255352359972, "learning_rate": 1.9999490390361944e-06, "loss": 0.4003, "step": 117 }, { "epoch": 0.03, "grad_norm": 2.730583061252001, "learning_rate": 1.9999393523393364e-06, "loss": 0.3988, "step": 118 }, { "epoch": 0.03, "grad_norm": 2.953836584220443, "learning_rate": 1.999928823356488e-06, "loss": 0.4003, "step": 119 }, { "epoch": 0.03, "grad_norm": 2.540162372030467, "learning_rate": 1.9999174520965193e-06, "loss": 0.4193, "step": 120 }, { "epoch": 0.03, "grad_norm": 2.8247216227110754, "learning_rate": 1.9999052385690078e-06, "loss": 0.4106, "step": 121 }, { "epoch": 0.03, "grad_norm": 2.9800895294923313, "learning_rate": 1.999892182784242e-06, "loss": 0.425, "step": 122 }, { "epoch": 0.03, "grad_norm": 2.9580274438434895, "learning_rate": 1.9998782847532195e-06, "loss": 0.4222, "step": 123 }, { "epoch": 0.04, "grad_norm": 2.978550415936864, "learning_rate": 1.9998635444876458e-06, "loss": 0.4031, "step": 124 }, { "epoch": 0.04, "grad_norm": 2.8082748015723213, "learning_rate": 1.999847961999939e-06, "loss": 0.4056, "step": 125 }, { "epoch": 0.04, "grad_norm": 2.8595783375198818, "learning_rate": 1.9998315373032237e-06, "loss": 0.4224, "step": 126 }, { "epoch": 0.04, "grad_norm": 2.6890260114894686, "learning_rate": 1.9998142704113346e-06, "loss": 0.3496, "step": 127 }, { "epoch": 0.04, "grad_norm": 2.7450486041972693, "learning_rate": 1.9997961613388173e-06, "loss": 0.3901, "step": 128 }, { "epoch": 0.04, "grad_norm": 3.109712683225939, "learning_rate": 1.9997772101009253e-06, "loss": 0.4559, "step": 129 }, { "epoch": 0.04, "grad_norm": 2.8203576670652284, "learning_rate": 1.9997574167136223e-06, "loss": 0.3763, "step": 130 }, { "epoch": 0.04, "grad_norm": 2.5062427163533143, "learning_rate": 1.9997367811935805e-06, "loss": 0.3815, "step": 131 }, { "epoch": 0.04, "grad_norm": 2.7117271095711204, "learning_rate": 1.999715303558182e-06, "loss": 0.3941, "step": 132 }, { "epoch": 0.04, "grad_norm": 2.55987610733269, "learning_rate": 1.999692983825518e-06, "loss": 0.3927, "step": 133 }, { "epoch": 0.04, "grad_norm": 3.3555921974167564, "learning_rate": 1.99966982201439e-06, "loss": 0.3758, "step": 134 }, { "epoch": 0.04, "grad_norm": 2.808757885402719, "learning_rate": 1.999645818144307e-06, "loss": 0.4095, "step": 135 }, { "epoch": 0.04, "grad_norm": 2.68635260987047, "learning_rate": 1.9996209722354897e-06, "loss": 0.3608, "step": 136 }, { "epoch": 0.04, "grad_norm": 2.9268085169131046, "learning_rate": 1.9995952843088656e-06, "loss": 0.4159, "step": 137 }, { "epoch": 0.04, "grad_norm": 2.9484082206625333, "learning_rate": 1.9995687543860728e-06, "loss": 0.4132, "step": 138 }, { "epoch": 0.04, "grad_norm": 2.646373861612375, "learning_rate": 1.999541382489459e-06, "loss": 0.3747, "step": 139 }, { "epoch": 0.04, "grad_norm": 2.646592594686166, "learning_rate": 1.9995131686420798e-06, "loss": 0.3996, "step": 140 }, { "epoch": 0.04, "grad_norm": 2.7303459670357704, "learning_rate": 1.9994841128677014e-06, "loss": 0.3988, "step": 141 }, { "epoch": 0.04, "grad_norm": 2.799829009143507, "learning_rate": 1.9994542151907985e-06, "loss": 0.4117, "step": 142 }, { "epoch": 0.04, "grad_norm": 2.6018316616627173, "learning_rate": 1.9994234756365546e-06, "loss": 0.3918, "step": 143 }, { "epoch": 0.04, "grad_norm": 2.8528313633077835, "learning_rate": 1.9993918942308625e-06, "loss": 0.3901, "step": 144 }, { "epoch": 0.04, "grad_norm": 3.206126131387853, "learning_rate": 1.999359471000326e-06, "loss": 0.3858, "step": 145 }, { "epoch": 0.04, "grad_norm": 2.6182854163185243, "learning_rate": 1.9993262059722546e-06, "loss": 0.4106, "step": 146 }, { "epoch": 0.04, "grad_norm": 3.002116351588365, "learning_rate": 1.9992920991746695e-06, "loss": 0.4153, "step": 147 }, { "epoch": 0.04, "grad_norm": 2.744545644279597, "learning_rate": 1.9992571506362995e-06, "loss": 0.3962, "step": 148 }, { "epoch": 0.04, "grad_norm": 2.9977459472060466, "learning_rate": 1.999221360386584e-06, "loss": 0.3931, "step": 149 }, { "epoch": 0.04, "grad_norm": 2.6424581529573303, "learning_rate": 1.99918472845567e-06, "loss": 0.3706, "step": 150 }, { "epoch": 0.04, "grad_norm": 2.7069491277467383, "learning_rate": 1.999147254874414e-06, "loss": 0.4147, "step": 151 }, { "epoch": 0.04, "grad_norm": 2.6035821918848363, "learning_rate": 1.9991089396743805e-06, "loss": 0.4249, "step": 152 }, { "epoch": 0.04, "grad_norm": 2.8388566955132943, "learning_rate": 1.999069782887845e-06, "loss": 0.3695, "step": 153 }, { "epoch": 0.04, "grad_norm": 2.7846849561486064, "learning_rate": 1.999029784547791e-06, "loss": 0.4046, "step": 154 }, { "epoch": 0.04, "grad_norm": 2.5672347312163266, "learning_rate": 1.998988944687909e-06, "loss": 0.3784, "step": 155 }, { "epoch": 0.04, "grad_norm": 2.5262276088682456, "learning_rate": 1.998947263342601e-06, "loss": 0.3624, "step": 156 }, { "epoch": 0.04, "grad_norm": 2.760289710063222, "learning_rate": 1.998904740546977e-06, "loss": 0.3976, "step": 157 }, { "epoch": 0.04, "grad_norm": 2.6197959186468402, "learning_rate": 1.9988613763368545e-06, "loss": 0.3852, "step": 158 }, { "epoch": 0.05, "grad_norm": 2.9089432590991646, "learning_rate": 1.998817170748762e-06, "loss": 0.3799, "step": 159 }, { "epoch": 0.05, "grad_norm": 2.506679497101866, "learning_rate": 1.9987721238199343e-06, "loss": 0.3535, "step": 160 }, { "epoch": 0.05, "grad_norm": 2.825720790926751, "learning_rate": 1.9987262355883173e-06, "loss": 0.3744, "step": 161 }, { "epoch": 0.05, "grad_norm": 2.906788715253664, "learning_rate": 1.9986795060925633e-06, "loss": 0.4155, "step": 162 }, { "epoch": 0.05, "grad_norm": 3.099980135318557, "learning_rate": 1.998631935372035e-06, "loss": 0.4039, "step": 163 }, { "epoch": 0.05, "grad_norm": 2.473077408954635, "learning_rate": 1.9985835234668023e-06, "loss": 0.3994, "step": 164 }, { "epoch": 0.05, "grad_norm": 2.7379960293103736, "learning_rate": 1.998534270417645e-06, "loss": 0.4067, "step": 165 }, { "epoch": 0.05, "grad_norm": 2.934946021108072, "learning_rate": 1.9984841762660503e-06, "loss": 0.3915, "step": 166 }, { "epoch": 0.05, "grad_norm": 2.595844563943777, "learning_rate": 1.998433241054215e-06, "loss": 0.3917, "step": 167 }, { "epoch": 0.05, "grad_norm": 2.7121653357980597, "learning_rate": 1.998381464825043e-06, "loss": 0.3971, "step": 168 }, { "epoch": 0.05, "grad_norm": 2.584381509862202, "learning_rate": 1.998328847622148e-06, "loss": 0.3716, "step": 169 }, { "epoch": 0.05, "grad_norm": 2.6766288563674414, "learning_rate": 1.9982753894898506e-06, "loss": 0.3798, "step": 170 }, { "epoch": 0.05, "grad_norm": 2.5571445545817957, "learning_rate": 1.9982210904731812e-06, "loss": 0.3643, "step": 171 }, { "epoch": 0.05, "grad_norm": 2.5407475772937818, "learning_rate": 1.9981659506178776e-06, "loss": 0.3839, "step": 172 }, { "epoch": 0.05, "grad_norm": 2.5544944850006597, "learning_rate": 1.9981099699703864e-06, "loss": 0.368, "step": 173 }, { "epoch": 0.05, "grad_norm": 2.531812353121465, "learning_rate": 1.998053148577862e-06, "loss": 0.3964, "step": 174 }, { "epoch": 0.05, "grad_norm": 3.046992089059834, "learning_rate": 1.997995486488167e-06, "loss": 0.3844, "step": 175 }, { "epoch": 0.05, "grad_norm": 2.822735955792875, "learning_rate": 1.9979369837498727e-06, "loss": 0.3991, "step": 176 }, { "epoch": 0.05, "grad_norm": 2.9314069703471657, "learning_rate": 1.997877640412258e-06, "loss": 0.401, "step": 177 }, { "epoch": 0.05, "grad_norm": 2.504699351071927, "learning_rate": 1.9978174565253095e-06, "loss": 0.3805, "step": 178 }, { "epoch": 0.05, "grad_norm": 2.566177801931028, "learning_rate": 1.9977564321397233e-06, "loss": 0.437, "step": 179 }, { "epoch": 0.05, "grad_norm": 2.6868768656926396, "learning_rate": 1.9976945673069015e-06, "loss": 0.3873, "step": 180 }, { "epoch": 0.05, "grad_norm": 2.456556033093542, "learning_rate": 1.9976318620789557e-06, "loss": 0.3185, "step": 181 }, { "epoch": 0.05, "grad_norm": 2.590060188752798, "learning_rate": 1.9975683165087047e-06, "loss": 0.3522, "step": 182 }, { "epoch": 0.05, "grad_norm": 2.840213817668187, "learning_rate": 1.9975039306496755e-06, "loss": 0.3667, "step": 183 }, { "epoch": 0.05, "grad_norm": 2.8933404886544967, "learning_rate": 1.997438704556102e-06, "loss": 0.37, "step": 184 }, { "epoch": 0.05, "grad_norm": 2.7526331247943334, "learning_rate": 1.997372638282928e-06, "loss": 0.3968, "step": 185 }, { "epoch": 0.05, "grad_norm": 2.6258818690706627, "learning_rate": 1.9973057318858017e-06, "loss": 0.3279, "step": 186 }, { "epoch": 0.05, "grad_norm": 2.7649018240097503, "learning_rate": 1.9972379854210823e-06, "loss": 0.3689, "step": 187 }, { "epoch": 0.05, "grad_norm": 2.7999331736072066, "learning_rate": 1.9971693989458345e-06, "loss": 0.3441, "step": 188 }, { "epoch": 0.05, "grad_norm": 2.7258205465442513, "learning_rate": 1.997099972517831e-06, "loss": 0.3396, "step": 189 }, { "epoch": 0.05, "grad_norm": 2.536508380657664, "learning_rate": 1.997029706195553e-06, "loss": 0.3346, "step": 190 }, { "epoch": 0.05, "grad_norm": 2.958855867705031, "learning_rate": 1.9969586000381883e-06, "loss": 0.3903, "step": 191 }, { "epoch": 0.05, "grad_norm": 2.7156228414424066, "learning_rate": 1.9968866541056313e-06, "loss": 0.3706, "step": 192 }, { "epoch": 0.05, "grad_norm": 2.665030631253663, "learning_rate": 1.996813868458486e-06, "loss": 0.3883, "step": 193 }, { "epoch": 0.05, "grad_norm": 2.6313799332939225, "learning_rate": 1.9967402431580617e-06, "loss": 0.3527, "step": 194 }, { "epoch": 0.06, "grad_norm": 2.7364262838936404, "learning_rate": 1.996665778266376e-06, "loss": 0.3676, "step": 195 }, { "epoch": 0.06, "grad_norm": 2.810935136720703, "learning_rate": 1.996590473846153e-06, "loss": 0.3341, "step": 196 }, { "epoch": 0.06, "grad_norm": 2.9192563229960538, "learning_rate": 1.996514329960825e-06, "loss": 0.3536, "step": 197 }, { "epoch": 0.06, "grad_norm": 2.8207777067651287, "learning_rate": 1.9964373466745307e-06, "loss": 0.3828, "step": 198 }, { "epoch": 0.06, "grad_norm": 3.1120567282258955, "learning_rate": 1.9963595240521156e-06, "loss": 0.386, "step": 199 }, { "epoch": 0.06, "grad_norm": 2.6529947961693527, "learning_rate": 1.996280862159133e-06, "loss": 0.3888, "step": 200 }, { "epoch": 0.06, "grad_norm": 3.0993864773394, "learning_rate": 1.996201361061842e-06, "loss": 0.4037, "step": 201 }, { "epoch": 0.06, "grad_norm": 2.5729470692760734, "learning_rate": 1.9961210208272105e-06, "loss": 0.3721, "step": 202 }, { "epoch": 0.06, "grad_norm": 2.8037555614398397, "learning_rate": 1.996039841522911e-06, "loss": 0.3861, "step": 203 }, { "epoch": 0.06, "grad_norm": 2.502805445419668, "learning_rate": 1.9959578232173245e-06, "loss": 0.3421, "step": 204 }, { "epoch": 0.06, "grad_norm": 2.6892317392586462, "learning_rate": 1.995874965979538e-06, "loss": 0.3703, "step": 205 }, { "epoch": 0.06, "grad_norm": 2.649856195234877, "learning_rate": 1.9957912698793447e-06, "loss": 0.3965, "step": 206 }, { "epoch": 0.06, "grad_norm": 2.6385637045784005, "learning_rate": 1.9957067349872456e-06, "loss": 0.355, "step": 207 }, { "epoch": 0.06, "grad_norm": 2.6041098458460032, "learning_rate": 1.995621361374447e-06, "loss": 0.4005, "step": 208 }, { "epoch": 0.06, "grad_norm": 2.791585851603161, "learning_rate": 1.995535149112862e-06, "loss": 0.3828, "step": 209 }, { "epoch": 0.06, "grad_norm": 2.56933881840692, "learning_rate": 1.995448098275112e-06, "loss": 0.3715, "step": 210 }, { "epoch": 0.06, "grad_norm": 2.7409053496946076, "learning_rate": 1.9953602089345213e-06, "loss": 0.385, "step": 211 }, { "epoch": 0.06, "grad_norm": 2.916462909652704, "learning_rate": 1.995271481165123e-06, "loss": 0.3491, "step": 212 }, { "epoch": 0.06, "grad_norm": 2.509746621164135, "learning_rate": 1.9951819150416564e-06, "loss": 0.3333, "step": 213 }, { "epoch": 0.06, "grad_norm": 2.491097202916147, "learning_rate": 1.9950915106395654e-06, "loss": 0.3349, "step": 214 }, { "epoch": 0.06, "grad_norm": 2.842582458464046, "learning_rate": 1.9950002680350016e-06, "loss": 0.3447, "step": 215 }, { "epoch": 0.06, "grad_norm": 2.6721882016425553, "learning_rate": 1.994908187304822e-06, "loss": 0.3676, "step": 216 }, { "epoch": 0.06, "grad_norm": 2.5656168078227406, "learning_rate": 1.9948152685265892e-06, "loss": 0.3781, "step": 217 }, { "epoch": 0.06, "grad_norm": 2.625977722419885, "learning_rate": 1.9947215117785727e-06, "loss": 0.3553, "step": 218 }, { "epoch": 0.06, "grad_norm": 2.845684023469767, "learning_rate": 1.9946269171397465e-06, "loss": 0.3676, "step": 219 }, { "epoch": 0.06, "grad_norm": 3.006378204445308, "learning_rate": 1.994531484689792e-06, "loss": 0.3609, "step": 220 }, { "epoch": 0.06, "grad_norm": 2.734959302329953, "learning_rate": 1.994435214509095e-06, "loss": 0.3979, "step": 221 }, { "epoch": 0.06, "grad_norm": 2.8769028978485194, "learning_rate": 1.994338106678748e-06, "loss": 0.3968, "step": 222 }, { "epoch": 0.06, "grad_norm": 2.7379521677523693, "learning_rate": 1.9942401612805477e-06, "loss": 0.433, "step": 223 }, { "epoch": 0.06, "grad_norm": 2.6234046844572374, "learning_rate": 1.9941413783969976e-06, "loss": 0.3595, "step": 224 }, { "epoch": 0.06, "grad_norm": 2.554494207301178, "learning_rate": 1.994041758111306e-06, "loss": 0.382, "step": 225 }, { "epoch": 0.06, "grad_norm": 2.579323406962154, "learning_rate": 1.993941300507387e-06, "loss": 0.3521, "step": 226 }, { "epoch": 0.06, "grad_norm": 2.7795940548748352, "learning_rate": 1.9938400056698595e-06, "loss": 0.3617, "step": 227 }, { "epoch": 0.06, "grad_norm": 3.0176541561507246, "learning_rate": 1.9937378736840483e-06, "loss": 0.4024, "step": 228 }, { "epoch": 0.06, "grad_norm": 2.7077194486209333, "learning_rate": 1.993634904635983e-06, "loss": 0.3594, "step": 229 }, { "epoch": 0.07, "grad_norm": 2.4884581742716905, "learning_rate": 1.9935310986123976e-06, "loss": 0.3308, "step": 230 }, { "epoch": 0.07, "grad_norm": 2.4268425735478183, "learning_rate": 1.993426455700732e-06, "loss": 0.3419, "step": 231 }, { "epoch": 0.07, "grad_norm": 2.665186874653042, "learning_rate": 1.993320975989131e-06, "loss": 0.3426, "step": 232 }, { "epoch": 0.07, "grad_norm": 2.552819929036354, "learning_rate": 1.9932146595664446e-06, "loss": 0.387, "step": 233 }, { "epoch": 0.07, "grad_norm": 3.4824302393777335, "learning_rate": 1.993107506522226e-06, "loss": 0.3213, "step": 234 }, { "epoch": 0.07, "grad_norm": 3.1076257763007455, "learning_rate": 1.9929995169467344e-06, "loss": 0.3862, "step": 235 }, { "epoch": 0.07, "grad_norm": 2.7868572688950537, "learning_rate": 1.992890690930934e-06, "loss": 0.3902, "step": 236 }, { "epoch": 0.07, "grad_norm": 2.527196091690139, "learning_rate": 1.9927810285664927e-06, "loss": 0.3258, "step": 237 }, { "epoch": 0.07, "grad_norm": 2.5385263313762674, "learning_rate": 1.9926705299457827e-06, "loss": 0.3925, "step": 238 }, { "epoch": 0.07, "grad_norm": 2.6446574691522353, "learning_rate": 1.992559195161882e-06, "loss": 0.37, "step": 239 }, { "epoch": 0.07, "grad_norm": 2.4843804067774076, "learning_rate": 1.9924470243085712e-06, "loss": 0.3525, "step": 240 }, { "epoch": 0.07, "grad_norm": 2.463079722299446, "learning_rate": 1.9923340174803367e-06, "loss": 0.328, "step": 241 }, { "epoch": 0.07, "grad_norm": 2.5139165394391894, "learning_rate": 1.9922201747723677e-06, "loss": 0.3499, "step": 242 }, { "epoch": 0.07, "grad_norm": 2.6807859647502177, "learning_rate": 1.9921054962805585e-06, "loss": 0.3419, "step": 243 }, { "epoch": 0.07, "grad_norm": 2.9749725307944406, "learning_rate": 1.9919899821015063e-06, "loss": 0.3248, "step": 244 }, { "epoch": 0.07, "grad_norm": 2.521006728376108, "learning_rate": 1.9918736323325142e-06, "loss": 0.3565, "step": 245 }, { "epoch": 0.07, "grad_norm": 2.690164522320936, "learning_rate": 1.9917564470715872e-06, "loss": 0.3598, "step": 246 }, { "epoch": 0.07, "grad_norm": 2.6434030217227464, "learning_rate": 1.991638426417435e-06, "loss": 0.3628, "step": 247 }, { "epoch": 0.07, "grad_norm": 2.406254480523089, "learning_rate": 1.991519570469471e-06, "loss": 0.3412, "step": 248 }, { "epoch": 0.07, "grad_norm": 2.8193314112019814, "learning_rate": 1.9913998793278113e-06, "loss": 0.367, "step": 249 }, { "epoch": 0.07, "grad_norm": 2.4988815520050247, "learning_rate": 1.9912793530932764e-06, "loss": 0.3622, "step": 250 }, { "epoch": 0.07, "grad_norm": 2.461527020103545, "learning_rate": 1.9911579918673904e-06, "loss": 0.3299, "step": 251 }, { "epoch": 0.07, "grad_norm": 3.773311619061041, "learning_rate": 1.99103579575238e-06, "loss": 0.3449, "step": 252 }, { "epoch": 0.07, "grad_norm": 2.5782349089610825, "learning_rate": 1.9909127648511754e-06, "loss": 0.3594, "step": 253 }, { "epoch": 0.07, "grad_norm": 2.4446994500795194, "learning_rate": 1.990788899267411e-06, "loss": 0.344, "step": 254 }, { "epoch": 0.07, "grad_norm": 2.6855473726715022, "learning_rate": 1.9906641991054222e-06, "loss": 0.4129, "step": 255 }, { "epoch": 0.07, "grad_norm": 2.5485938248498092, "learning_rate": 1.9905386644702493e-06, "loss": 0.3543, "step": 256 }, { "epoch": 0.07, "grad_norm": 2.5080000647003278, "learning_rate": 1.9904122954676345e-06, "loss": 0.3202, "step": 257 }, { "epoch": 0.07, "grad_norm": 2.457262360426059, "learning_rate": 1.9902850922040227e-06, "loss": 0.3579, "step": 258 }, { "epoch": 0.07, "grad_norm": 2.541677938349451, "learning_rate": 1.9901570547865627e-06, "loss": 0.3584, "step": 259 }, { "epoch": 0.07, "grad_norm": 2.6969360626762677, "learning_rate": 1.990028183323105e-06, "loss": 0.3887, "step": 260 }, { "epoch": 0.07, "grad_norm": 2.713143817573316, "learning_rate": 1.9898984779222025e-06, "loss": 0.3841, "step": 261 }, { "epoch": 0.07, "grad_norm": 2.5477445113524815, "learning_rate": 1.9897679386931113e-06, "loss": 0.3435, "step": 262 }, { "epoch": 0.07, "grad_norm": 3.2229779357171524, "learning_rate": 1.9896365657457887e-06, "loss": 0.3744, "step": 263 }, { "epoch": 0.07, "grad_norm": 2.613400938132588, "learning_rate": 1.989504359190896e-06, "loss": 0.3374, "step": 264 }, { "epoch": 0.08, "grad_norm": 2.80802734327446, "learning_rate": 1.989371319139794e-06, "loss": 0.3772, "step": 265 }, { "epoch": 0.08, "grad_norm": 2.4470246183799156, "learning_rate": 1.9892374457045494e-06, "loss": 0.3553, "step": 266 }, { "epoch": 0.08, "grad_norm": 2.6800638145741353, "learning_rate": 1.9891027389979278e-06, "loss": 0.3706, "step": 267 }, { "epoch": 0.08, "grad_norm": 2.913022801860264, "learning_rate": 1.9889671991333976e-06, "loss": 0.3766, "step": 268 }, { "epoch": 0.08, "grad_norm": 3.0319933246387856, "learning_rate": 1.9888308262251284e-06, "loss": 0.3663, "step": 269 }, { "epoch": 0.08, "grad_norm": 2.4223388136877886, "learning_rate": 1.9886936203879935e-06, "loss": 0.3418, "step": 270 }, { "epoch": 0.08, "grad_norm": 2.7509386532674815, "learning_rate": 1.9885555817375654e-06, "loss": 0.4096, "step": 271 }, { "epoch": 0.08, "grad_norm": 2.639110162050167, "learning_rate": 1.9884167103901194e-06, "loss": 0.39, "step": 272 }, { "epoch": 0.08, "grad_norm": 2.7803037778659814, "learning_rate": 1.9882770064626328e-06, "loss": 0.3795, "step": 273 }, { "epoch": 0.08, "grad_norm": 2.5513683067438753, "learning_rate": 1.988136470072782e-06, "loss": 0.3469, "step": 274 }, { "epoch": 0.08, "grad_norm": 2.535630029496729, "learning_rate": 1.987995101338947e-06, "loss": 0.3517, "step": 275 }, { "epoch": 0.08, "grad_norm": 2.6139021814657037, "learning_rate": 1.9878529003802084e-06, "loss": 0.3976, "step": 276 }, { "epoch": 0.08, "grad_norm": 2.483456591372453, "learning_rate": 1.987709867316346e-06, "loss": 0.3631, "step": 277 }, { "epoch": 0.08, "grad_norm": 2.5809926151280314, "learning_rate": 1.9875660022678424e-06, "loss": 0.3685, "step": 278 }, { "epoch": 0.08, "grad_norm": 2.545111593100663, "learning_rate": 1.9874213053558804e-06, "loss": 0.3427, "step": 279 }, { "epoch": 0.08, "grad_norm": 2.617356456075554, "learning_rate": 1.987275776702344e-06, "loss": 0.3395, "step": 280 }, { "epoch": 0.08, "grad_norm": 2.8211042738249743, "learning_rate": 1.987129416429817e-06, "loss": 0.3571, "step": 281 }, { "epoch": 0.08, "grad_norm": 7.370289089811191, "learning_rate": 1.986982224661584e-06, "loss": 0.3474, "step": 282 }, { "epoch": 0.08, "grad_norm": 2.5171030275465305, "learning_rate": 1.986834201521631e-06, "loss": 0.3306, "step": 283 }, { "epoch": 0.08, "grad_norm": 2.5654163958906837, "learning_rate": 1.9866853471346417e-06, "loss": 0.3568, "step": 284 }, { "epoch": 0.08, "grad_norm": 3.4580167416924765, "learning_rate": 1.986535661626003e-06, "loss": 0.368, "step": 285 }, { "epoch": 0.08, "grad_norm": 2.6005433341670887, "learning_rate": 1.9863851451218003e-06, "loss": 0.3518, "step": 286 }, { "epoch": 0.08, "grad_norm": 2.699225097426378, "learning_rate": 1.986233797748819e-06, "loss": 0.3784, "step": 287 }, { "epoch": 0.08, "grad_norm": 2.732736459682785, "learning_rate": 1.986081619634545e-06, "loss": 0.3752, "step": 288 }, { "epoch": 0.08, "grad_norm": 2.7938860793855747, "learning_rate": 1.9859286109071623e-06, "loss": 0.3502, "step": 289 }, { "epoch": 0.08, "grad_norm": 2.5092963016547123, "learning_rate": 1.985774771695558e-06, "loss": 0.3709, "step": 290 }, { "epoch": 0.08, "grad_norm": 2.8384058342986496, "learning_rate": 1.9856201021293148e-06, "loss": 0.3702, "step": 291 }, { "epoch": 0.08, "grad_norm": 2.7336616375679426, "learning_rate": 1.985464602338717e-06, "loss": 0.363, "step": 292 }, { "epoch": 0.08, "grad_norm": 2.5955531896252, "learning_rate": 1.9853082724547476e-06, "loss": 0.3036, "step": 293 }, { "epoch": 0.08, "grad_norm": 2.571917020052252, "learning_rate": 1.9851511126090906e-06, "loss": 0.361, "step": 294 }, { "epoch": 0.08, "grad_norm": 2.660892685172359, "learning_rate": 1.9849931229341256e-06, "loss": 0.325, "step": 295 }, { "epoch": 0.08, "grad_norm": 2.3648358197445933, "learning_rate": 1.9848343035629343e-06, "loss": 0.3532, "step": 296 }, { "epoch": 0.08, "grad_norm": 2.594871579879767, "learning_rate": 1.9846746546292955e-06, "loss": 0.3413, "step": 297 }, { "epoch": 0.08, "grad_norm": 2.5793998255691766, "learning_rate": 1.9845141762676882e-06, "loss": 0.3335, "step": 298 }, { "epoch": 0.08, "grad_norm": 2.591077271608269, "learning_rate": 1.984352868613289e-06, "loss": 0.3675, "step": 299 }, { "epoch": 0.09, "grad_norm": 3.285884192859506, "learning_rate": 1.9841907318019724e-06, "loss": 0.3539, "step": 300 }, { "epoch": 0.09, "grad_norm": 2.5448166312144047, "learning_rate": 1.9840277659703137e-06, "loss": 0.3542, "step": 301 }, { "epoch": 0.09, "grad_norm": 2.470915634055657, "learning_rate": 1.9838639712555838e-06, "loss": 0.3646, "step": 302 }, { "epoch": 0.09, "grad_norm": 2.5716228188887387, "learning_rate": 1.9836993477957536e-06, "loss": 0.3493, "step": 303 }, { "epoch": 0.09, "grad_norm": 2.643931078864786, "learning_rate": 1.983533895729492e-06, "loss": 0.3195, "step": 304 }, { "epoch": 0.09, "grad_norm": 2.911939697435264, "learning_rate": 1.9833676151961647e-06, "loss": 0.3654, "step": 305 }, { "epoch": 0.09, "grad_norm": 2.3724724165880784, "learning_rate": 1.9832005063358366e-06, "loss": 0.3603, "step": 306 }, { "epoch": 0.09, "grad_norm": 2.5039300276236034, "learning_rate": 1.9830325692892687e-06, "loss": 0.3552, "step": 307 }, { "epoch": 0.09, "grad_norm": 2.5223232113937843, "learning_rate": 1.9828638041979216e-06, "loss": 0.3327, "step": 308 }, { "epoch": 0.09, "grad_norm": 2.9855800223800464, "learning_rate": 1.982694211203952e-06, "loss": 0.3829, "step": 309 }, { "epoch": 0.09, "grad_norm": 2.521775265390254, "learning_rate": 1.9825237904502143e-06, "loss": 0.3408, "step": 310 }, { "epoch": 0.09, "grad_norm": 2.7867318408681685, "learning_rate": 1.98235254208026e-06, "loss": 0.3465, "step": 311 }, { "epoch": 0.09, "grad_norm": 2.617705586883914, "learning_rate": 1.9821804662383385e-06, "loss": 0.3625, "step": 312 }, { "epoch": 0.09, "grad_norm": 3.1720347849017676, "learning_rate": 1.982007563069395e-06, "loss": 0.3589, "step": 313 }, { "epoch": 0.09, "grad_norm": 2.575250459612025, "learning_rate": 1.9818338327190735e-06, "loss": 0.341, "step": 314 }, { "epoch": 0.09, "grad_norm": 2.878553661168574, "learning_rate": 1.981659275333712e-06, "loss": 0.3398, "step": 315 }, { "epoch": 0.09, "grad_norm": 2.552552883120015, "learning_rate": 1.981483891060348e-06, "loss": 0.3343, "step": 316 }, { "epoch": 0.09, "grad_norm": 2.5137157940173256, "learning_rate": 1.981307680046713e-06, "loss": 0.3576, "step": 317 }, { "epoch": 0.09, "grad_norm": 2.403478162113704, "learning_rate": 1.9811306424412368e-06, "loss": 0.3313, "step": 318 }, { "epoch": 0.09, "grad_norm": 2.5219076719388847, "learning_rate": 1.9809527783930442e-06, "loss": 0.3498, "step": 319 }, { "epoch": 0.09, "grad_norm": 2.46937865375049, "learning_rate": 1.980774088051957e-06, "loss": 0.3305, "step": 320 }, { "epoch": 0.09, "grad_norm": 2.4507535819330233, "learning_rate": 1.980594571568493e-06, "loss": 0.3154, "step": 321 }, { "epoch": 0.09, "grad_norm": 2.5843540111215284, "learning_rate": 1.980414229093865e-06, "loss": 0.3583, "step": 322 }, { "epoch": 0.09, "grad_norm": 2.442105603481713, "learning_rate": 1.980233060779983e-06, "loss": 0.3347, "step": 323 }, { "epoch": 0.09, "grad_norm": 2.5856362077055297, "learning_rate": 1.9800510667794508e-06, "loss": 0.3265, "step": 324 }, { "epoch": 0.09, "grad_norm": 2.780782385254357, "learning_rate": 1.979868247245569e-06, "loss": 0.3502, "step": 325 }, { "epoch": 0.09, "grad_norm": 2.5778120808206366, "learning_rate": 1.9796846023323335e-06, "loss": 0.3259, "step": 326 }, { "epoch": 0.09, "grad_norm": 3.454138511692687, "learning_rate": 1.979500132194435e-06, "loss": 0.3468, "step": 327 }, { "epoch": 0.09, "grad_norm": 2.4070384710440167, "learning_rate": 1.97931483698726e-06, "loss": 0.3279, "step": 328 }, { "epoch": 0.09, "grad_norm": 2.834842983135936, "learning_rate": 1.979128716866889e-06, "loss": 0.3356, "step": 329 }, { "epoch": 0.09, "grad_norm": 2.578244568214759, "learning_rate": 1.978941771990098e-06, "loss": 0.3452, "step": 330 }, { "epoch": 0.09, "grad_norm": 2.628476335289947, "learning_rate": 1.9787540025143576e-06, "loss": 0.3495, "step": 331 }, { "epoch": 0.09, "grad_norm": 2.5526394737282962, "learning_rate": 1.9785654085978328e-06, "loss": 0.3528, "step": 332 }, { "epoch": 0.09, "grad_norm": 2.671559049988823, "learning_rate": 1.978375990399384e-06, "loss": 0.3705, "step": 333 }, { "epoch": 0.09, "grad_norm": 2.5473894352720237, "learning_rate": 1.9781857480785644e-06, "loss": 0.3192, "step": 334 }, { "epoch": 0.09, "grad_norm": 2.5989571364649473, "learning_rate": 1.9779946817956223e-06, "loss": 0.3616, "step": 335 }, { "epoch": 0.1, "grad_norm": 2.795031591430215, "learning_rate": 1.9778027917115005e-06, "loss": 0.3821, "step": 336 }, { "epoch": 0.1, "grad_norm": 2.4775590847102986, "learning_rate": 1.9776100779878343e-06, "loss": 0.3527, "step": 337 }, { "epoch": 0.1, "grad_norm": 2.7144133030062814, "learning_rate": 1.9774165407869535e-06, "loss": 0.3803, "step": 338 }, { "epoch": 0.1, "grad_norm": 2.7376378600974713, "learning_rate": 1.977222180271883e-06, "loss": 0.3447, "step": 339 }, { "epoch": 0.1, "grad_norm": 2.5409133266582233, "learning_rate": 1.9770269966063388e-06, "loss": 0.3393, "step": 340 }, { "epoch": 0.1, "grad_norm": 2.6836638084477937, "learning_rate": 1.976830989954731e-06, "loss": 0.3491, "step": 341 }, { "epoch": 0.1, "grad_norm": 2.642805261995146, "learning_rate": 1.9766341604821643e-06, "loss": 0.3471, "step": 342 }, { "epoch": 0.1, "grad_norm": 2.598394203262401, "learning_rate": 1.976436508354435e-06, "loss": 0.3603, "step": 343 }, { "epoch": 0.1, "grad_norm": 2.382257852100494, "learning_rate": 1.9762380337380325e-06, "loss": 0.3502, "step": 344 }, { "epoch": 0.1, "grad_norm": 2.632050658022369, "learning_rate": 1.97603873680014e-06, "loss": 0.3648, "step": 345 }, { "epoch": 0.1, "grad_norm": 2.750819827854785, "learning_rate": 1.975838617708632e-06, "loss": 0.3334, "step": 346 }, { "epoch": 0.1, "grad_norm": 3.410781673584302, "learning_rate": 1.975637676632077e-06, "loss": 0.3417, "step": 347 }, { "epoch": 0.1, "grad_norm": 2.629343266531438, "learning_rate": 1.975435913739734e-06, "loss": 0.3896, "step": 348 }, { "epoch": 0.1, "grad_norm": 2.472893659048711, "learning_rate": 1.9752333292015564e-06, "loss": 0.3457, "step": 349 }, { "epoch": 0.1, "grad_norm": 2.6266932605282713, "learning_rate": 1.9750299231881882e-06, "loss": 0.369, "step": 350 }, { "epoch": 0.1, "grad_norm": 2.655745774986425, "learning_rate": 1.974825695870966e-06, "loss": 0.3468, "step": 351 }, { "epoch": 0.1, "grad_norm": 2.4865033854087613, "learning_rate": 1.974620647421918e-06, "loss": 0.3326, "step": 352 }, { "epoch": 0.1, "grad_norm": 2.536995229766425, "learning_rate": 1.9744147780137644e-06, "loss": 0.3384, "step": 353 }, { "epoch": 0.1, "grad_norm": 2.5297575083663606, "learning_rate": 1.9742080878199155e-06, "loss": 0.3284, "step": 354 }, { "epoch": 0.1, "grad_norm": 2.3922716483502056, "learning_rate": 1.9740005770144757e-06, "loss": 0.3624, "step": 355 }, { "epoch": 0.1, "grad_norm": 2.4959650916452087, "learning_rate": 1.973792245772238e-06, "loss": 0.3541, "step": 356 }, { "epoch": 0.1, "grad_norm": 2.7223874523007403, "learning_rate": 1.9735830942686877e-06, "loss": 0.3142, "step": 357 }, { "epoch": 0.1, "grad_norm": 2.636175207703715, "learning_rate": 1.973373122680001e-06, "loss": 0.3735, "step": 358 }, { "epoch": 0.1, "grad_norm": 2.5918184199941066, "learning_rate": 1.973162331183045e-06, "loss": 0.3524, "step": 359 }, { "epoch": 0.1, "grad_norm": 2.4828658779223347, "learning_rate": 1.9729507199553766e-06, "loss": 0.3092, "step": 360 }, { "epoch": 0.1, "grad_norm": 2.4424967985660193, "learning_rate": 1.9727382891752444e-06, "loss": 0.3256, "step": 361 }, { "epoch": 0.1, "grad_norm": 2.74882206494836, "learning_rate": 1.972525039021586e-06, "loss": 0.3465, "step": 362 }, { "epoch": 0.1, "grad_norm": 2.5803729023474986, "learning_rate": 1.9723109696740307e-06, "loss": 0.3697, "step": 363 }, { "epoch": 0.1, "grad_norm": 2.5314535016182607, "learning_rate": 1.9720960813128963e-06, "loss": 0.385, "step": 364 }, { "epoch": 0.1, "grad_norm": 2.5334837256809553, "learning_rate": 1.971880374119192e-06, "loss": 0.3436, "step": 365 }, { "epoch": 0.1, "grad_norm": 2.838981459900644, "learning_rate": 1.971663848274615e-06, "loss": 0.3547, "step": 366 }, { "epoch": 0.1, "grad_norm": 2.785342579119847, "learning_rate": 1.971446503961554e-06, "loss": 0.3829, "step": 367 }, { "epoch": 0.1, "grad_norm": 2.4462261571298303, "learning_rate": 1.9712283413630863e-06, "loss": 0.3461, "step": 368 }, { "epoch": 0.1, "grad_norm": 2.5548288894996096, "learning_rate": 1.9710093606629773e-06, "loss": 0.3542, "step": 369 }, { "epoch": 0.1, "grad_norm": 3.2724105962356234, "learning_rate": 1.970789562045683e-06, "loss": 0.3621, "step": 370 }, { "epoch": 0.11, "grad_norm": 2.593756143692595, "learning_rate": 1.970568945696348e-06, "loss": 0.347, "step": 371 }, { "epoch": 0.11, "grad_norm": 2.4998898782445944, "learning_rate": 1.970347511800806e-06, "loss": 0.3494, "step": 372 }, { "epoch": 0.11, "grad_norm": 2.429214718940558, "learning_rate": 1.9701252605455783e-06, "loss": 0.3171, "step": 373 }, { "epoch": 0.11, "grad_norm": 2.7538511396476717, "learning_rate": 1.969902192117876e-06, "loss": 0.3328, "step": 374 }, { "epoch": 0.11, "grad_norm": 2.6048648567873838, "learning_rate": 1.9696783067055978e-06, "loss": 0.3158, "step": 375 }, { "epoch": 0.11, "grad_norm": 2.5474822093572587, "learning_rate": 1.96945360449733e-06, "loss": 0.332, "step": 376 }, { "epoch": 0.11, "grad_norm": 2.5836149604108907, "learning_rate": 1.9692280856823484e-06, "loss": 0.3482, "step": 377 }, { "epoch": 0.11, "grad_norm": 3.0864417183648216, "learning_rate": 1.9690017504506155e-06, "loss": 0.3451, "step": 378 }, { "epoch": 0.11, "grad_norm": 2.6700788713740815, "learning_rate": 1.9687745989927824e-06, "loss": 0.3512, "step": 379 }, { "epoch": 0.11, "grad_norm": 2.763428118675348, "learning_rate": 1.968546631500186e-06, "loss": 0.3783, "step": 380 }, { "epoch": 0.11, "grad_norm": 2.6666814788251756, "learning_rate": 1.9683178481648527e-06, "loss": 0.3135, "step": 381 }, { "epoch": 0.11, "grad_norm": 2.410152816700471, "learning_rate": 1.968088249179495e-06, "loss": 0.3304, "step": 382 }, { "epoch": 0.11, "grad_norm": 2.514956754492286, "learning_rate": 1.967857834737513e-06, "loss": 0.3086, "step": 383 }, { "epoch": 0.11, "grad_norm": 2.403693065158659, "learning_rate": 1.9676266050329922e-06, "loss": 0.3082, "step": 384 }, { "epoch": 0.11, "grad_norm": 2.5751977044730494, "learning_rate": 1.967394560260707e-06, "loss": 0.3369, "step": 385 }, { "epoch": 0.11, "grad_norm": 2.5583174802608997, "learning_rate": 1.967161700616117e-06, "loss": 0.3181, "step": 386 }, { "epoch": 0.11, "grad_norm": 2.6557837593729072, "learning_rate": 1.966928026295369e-06, "loss": 0.3771, "step": 387 }, { "epoch": 0.11, "grad_norm": 3.5991375063276094, "learning_rate": 1.9666935374952944e-06, "loss": 0.3262, "step": 388 }, { "epoch": 0.11, "grad_norm": 2.6405353323745278, "learning_rate": 1.9664582344134127e-06, "loss": 0.3636, "step": 389 }, { "epoch": 0.11, "grad_norm": 2.4486516403012857, "learning_rate": 1.9662221172479282e-06, "loss": 0.3404, "step": 390 }, { "epoch": 0.11, "grad_norm": 2.708565335228964, "learning_rate": 1.965985186197731e-06, "loss": 0.3737, "step": 391 }, { "epoch": 0.11, "grad_norm": 2.6111093488966133, "learning_rate": 1.965747441462397e-06, "loss": 0.3577, "step": 392 }, { "epoch": 0.11, "grad_norm": 2.4352883871602913, "learning_rate": 1.965508883242188e-06, "loss": 0.3583, "step": 393 }, { "epoch": 0.11, "grad_norm": 2.6020531043744404, "learning_rate": 1.965269511738049e-06, "loss": 0.3169, "step": 394 }, { "epoch": 0.11, "grad_norm": 2.841460231837411, "learning_rate": 1.965029327151613e-06, "loss": 0.3649, "step": 395 }, { "epoch": 0.11, "grad_norm": 2.25286900315995, "learning_rate": 1.9647883296851956e-06, "loss": 0.3119, "step": 396 }, { "epoch": 0.11, "grad_norm": 2.522094707467322, "learning_rate": 1.964546519541798e-06, "loss": 0.3345, "step": 397 }, { "epoch": 0.11, "grad_norm": 2.443044826914561, "learning_rate": 1.9643038969251062e-06, "loss": 0.3387, "step": 398 }, { "epoch": 0.11, "grad_norm": 2.4333924844267014, "learning_rate": 1.9640604620394894e-06, "loss": 0.3527, "step": 399 }, { "epoch": 0.11, "grad_norm": 2.6188283476819763, "learning_rate": 1.9638162150900025e-06, "loss": 0.3755, "step": 400 }, { "epoch": 0.11, "grad_norm": 3.1294437997674893, "learning_rate": 1.963571156282384e-06, "loss": 0.3146, "step": 401 }, { "epoch": 0.11, "grad_norm": 2.462574569192702, "learning_rate": 1.963325285823055e-06, "loss": 0.3475, "step": 402 }, { "epoch": 0.11, "grad_norm": 2.424115414008482, "learning_rate": 1.9630786039191225e-06, "loss": 0.3551, "step": 403 }, { "epoch": 0.11, "grad_norm": 2.7183638186252166, "learning_rate": 1.962831110778375e-06, "loss": 0.3235, "step": 404 }, { "epoch": 0.11, "grad_norm": 2.707298913677284, "learning_rate": 1.9625828066092854e-06, "loss": 0.3626, "step": 405 }, { "epoch": 0.12, "grad_norm": 3.051683309395247, "learning_rate": 1.9623336916210096e-06, "loss": 0.3174, "step": 406 }, { "epoch": 0.12, "grad_norm": 2.6084332006277027, "learning_rate": 1.962083766023386e-06, "loss": 0.3158, "step": 407 }, { "epoch": 0.12, "grad_norm": 2.351302315368594, "learning_rate": 1.961833030026937e-06, "loss": 0.284, "step": 408 }, { "epoch": 0.12, "grad_norm": 2.3788504441528806, "learning_rate": 1.961581483842866e-06, "loss": 0.3278, "step": 409 }, { "epoch": 0.12, "grad_norm": 2.8946442484890107, "learning_rate": 1.96132912768306e-06, "loss": 0.3163, "step": 410 }, { "epoch": 0.12, "grad_norm": 2.728285381971823, "learning_rate": 1.961075961760088e-06, "loss": 0.3418, "step": 411 }, { "epoch": 0.12, "grad_norm": 2.640098159654451, "learning_rate": 1.9608219862872008e-06, "loss": 0.3264, "step": 412 }, { "epoch": 0.12, "grad_norm": 2.5466803175986557, "learning_rate": 1.960567201478332e-06, "loss": 0.3122, "step": 413 }, { "epoch": 0.12, "grad_norm": 2.5571107605288512, "learning_rate": 1.9603116075480955e-06, "loss": 0.3582, "step": 414 }, { "epoch": 0.12, "grad_norm": 2.6335058487995013, "learning_rate": 1.960055204711788e-06, "loss": 0.3319, "step": 415 }, { "epoch": 0.12, "grad_norm": 2.499711088348462, "learning_rate": 1.959797993185387e-06, "loss": 0.3553, "step": 416 }, { "epoch": 0.12, "grad_norm": 2.3778807191941955, "learning_rate": 1.959539973185551e-06, "loss": 0.332, "step": 417 }, { "epoch": 0.12, "grad_norm": 2.6344006456744054, "learning_rate": 1.9592811449296206e-06, "loss": 0.3572, "step": 418 }, { "epoch": 0.12, "grad_norm": 2.4980884179301603, "learning_rate": 1.9590215086356155e-06, "loss": 0.2792, "step": 419 }, { "epoch": 0.12, "grad_norm": 2.7675481866770877, "learning_rate": 1.9587610645222377e-06, "loss": 0.3385, "step": 420 }, { "epoch": 0.12, "grad_norm": 2.4770423980229457, "learning_rate": 1.9584998128088683e-06, "loss": 0.341, "step": 421 }, { "epoch": 0.12, "grad_norm": 2.6038105597999173, "learning_rate": 1.9582377537155703e-06, "loss": 0.3269, "step": 422 }, { "epoch": 0.12, "grad_norm": 2.618927817603957, "learning_rate": 1.9579748874630846e-06, "loss": 0.332, "step": 423 }, { "epoch": 0.12, "grad_norm": 2.5224312088303065, "learning_rate": 1.9577112142728337e-06, "loss": 0.3191, "step": 424 }, { "epoch": 0.12, "grad_norm": 2.410236418801495, "learning_rate": 1.95744673436692e-06, "loss": 0.3188, "step": 425 }, { "epoch": 0.12, "grad_norm": 2.5211092188943076, "learning_rate": 1.9571814479681233e-06, "loss": 0.3545, "step": 426 }, { "epoch": 0.12, "grad_norm": 2.6366198905561125, "learning_rate": 1.9569153552999053e-06, "loss": 0.3413, "step": 427 }, { "epoch": 0.12, "grad_norm": 2.3273141341492978, "learning_rate": 1.9566484565864056e-06, "loss": 0.3178, "step": 428 }, { "epoch": 0.12, "grad_norm": 2.464296948249515, "learning_rate": 1.9563807520524424e-06, "loss": 0.3227, "step": 429 }, { "epoch": 0.12, "grad_norm": 2.5770345250979836, "learning_rate": 1.9561122419235133e-06, "loss": 0.3275, "step": 430 }, { "epoch": 0.12, "grad_norm": 2.806790500176456, "learning_rate": 1.9558429264257946e-06, "loss": 0.3654, "step": 431 }, { "epoch": 0.12, "grad_norm": 2.6954362891186325, "learning_rate": 1.955572805786141e-06, "loss": 0.2983, "step": 432 }, { "epoch": 0.12, "grad_norm": 2.5608755561769727, "learning_rate": 1.955301880232084e-06, "loss": 0.3729, "step": 433 }, { "epoch": 0.12, "grad_norm": 2.578420783299757, "learning_rate": 1.9550301499918353e-06, "loss": 0.328, "step": 434 }, { "epoch": 0.12, "grad_norm": 2.6160850303665457, "learning_rate": 1.9547576152942825e-06, "loss": 0.3917, "step": 435 }, { "epoch": 0.12, "grad_norm": 2.582252428734102, "learning_rate": 1.9544842763689928e-06, "loss": 0.3299, "step": 436 }, { "epoch": 0.12, "grad_norm": 2.634871783199364, "learning_rate": 1.9542101334462086e-06, "loss": 0.3125, "step": 437 }, { "epoch": 0.12, "grad_norm": 2.6720251585718655, "learning_rate": 1.9539351867568515e-06, "loss": 0.3618, "step": 438 }, { "epoch": 0.12, "grad_norm": 2.701084613749105, "learning_rate": 1.953659436532519e-06, "loss": 0.3332, "step": 439 }, { "epoch": 0.12, "grad_norm": 2.586301761737129, "learning_rate": 1.953382883005485e-06, "loss": 0.3434, "step": 440 }, { "epoch": 0.12, "grad_norm": 2.5520563354383428, "learning_rate": 1.953105526408702e-06, "loss": 0.3136, "step": 441 }, { "epoch": 0.13, "grad_norm": 2.7054210509581953, "learning_rate": 1.952827366975797e-06, "loss": 0.3614, "step": 442 }, { "epoch": 0.13, "grad_norm": 2.377738480595085, "learning_rate": 1.9525484049410745e-06, "loss": 0.3406, "step": 443 }, { "epoch": 0.13, "grad_norm": 2.393036741968658, "learning_rate": 1.952268640539514e-06, "loss": 0.3094, "step": 444 }, { "epoch": 0.13, "grad_norm": 2.8081828257349892, "learning_rate": 1.951988074006772e-06, "loss": 0.313, "step": 445 }, { "epoch": 0.13, "grad_norm": 2.4443193318497127, "learning_rate": 1.951706705579179e-06, "loss": 0.3388, "step": 446 }, { "epoch": 0.13, "grad_norm": 2.6083418173169153, "learning_rate": 1.9514245354937434e-06, "loss": 0.3416, "step": 447 }, { "epoch": 0.13, "grad_norm": 2.482240802319246, "learning_rate": 1.951141563988147e-06, "loss": 0.3184, "step": 448 }, { "epoch": 0.13, "grad_norm": 2.573942965254348, "learning_rate": 1.9508577913007472e-06, "loss": 0.3288, "step": 449 }, { "epoch": 0.13, "grad_norm": 2.520122116076561, "learning_rate": 1.9505732176705762e-06, "loss": 0.3395, "step": 450 }, { "epoch": 0.13, "grad_norm": 2.5535464035474194, "learning_rate": 1.9502878433373404e-06, "loss": 0.3257, "step": 451 }, { "epoch": 0.13, "grad_norm": 2.5785359969521116, "learning_rate": 1.9500016685414223e-06, "loss": 0.3444, "step": 452 }, { "epoch": 0.13, "grad_norm": 2.6932421940193616, "learning_rate": 1.9497146935238767e-06, "loss": 0.3103, "step": 453 }, { "epoch": 0.13, "grad_norm": 2.5976063721804574, "learning_rate": 1.949426918526434e-06, "loss": 0.3376, "step": 454 }, { "epoch": 0.13, "grad_norm": 2.768636398677477, "learning_rate": 1.9491383437914964e-06, "loss": 0.3263, "step": 455 }, { "epoch": 0.13, "grad_norm": 2.4290109643427487, "learning_rate": 1.9488489695621427e-06, "loss": 0.3428, "step": 456 }, { "epoch": 0.13, "grad_norm": 2.356160466466906, "learning_rate": 1.9485587960821227e-06, "loss": 0.2786, "step": 457 }, { "epoch": 0.13, "grad_norm": 2.410697728515682, "learning_rate": 1.9482678235958605e-06, "loss": 0.3311, "step": 458 }, { "epoch": 0.13, "grad_norm": 2.7601789133630934, "learning_rate": 1.9479760523484526e-06, "loss": 0.3668, "step": 459 }, { "epoch": 0.13, "grad_norm": 3.2475066691008267, "learning_rate": 1.9476834825856695e-06, "loss": 0.3236, "step": 460 }, { "epoch": 0.13, "grad_norm": 2.7004065801678983, "learning_rate": 1.947390114553953e-06, "loss": 0.3747, "step": 461 }, { "epoch": 0.13, "grad_norm": 2.4762707217582633, "learning_rate": 1.947095948500418e-06, "loss": 0.3298, "step": 462 }, { "epoch": 0.13, "grad_norm": 2.359836990811463, "learning_rate": 1.946800984672851e-06, "loss": 0.3274, "step": 463 }, { "epoch": 0.13, "grad_norm": 2.475164850692892, "learning_rate": 1.946505223319712e-06, "loss": 0.3076, "step": 464 }, { "epoch": 0.13, "grad_norm": 2.497063237032677, "learning_rate": 1.946208664690131e-06, "loss": 0.3366, "step": 465 }, { "epoch": 0.13, "grad_norm": 2.5993339212468367, "learning_rate": 1.9459113090339107e-06, "loss": 0.3546, "step": 466 }, { "epoch": 0.13, "grad_norm": 2.522352594749957, "learning_rate": 1.945613156601524e-06, "loss": 0.3454, "step": 467 }, { "epoch": 0.13, "grad_norm": 2.4833483392268674, "learning_rate": 1.945314207644117e-06, "loss": 0.3262, "step": 468 }, { "epoch": 0.13, "grad_norm": 2.715300511368663, "learning_rate": 1.9450144624135047e-06, "loss": 0.3532, "step": 469 }, { "epoch": 0.13, "grad_norm": 2.668453574347607, "learning_rate": 1.944713921162174e-06, "loss": 0.3488, "step": 470 }, { "epoch": 0.13, "grad_norm": 2.401790874153332, "learning_rate": 1.9444125841432814e-06, "loss": 0.3355, "step": 471 }, { "epoch": 0.13, "grad_norm": 2.5611493361085746, "learning_rate": 1.944110451610655e-06, "loss": 0.3469, "step": 472 }, { "epoch": 0.13, "grad_norm": 2.649852662518354, "learning_rate": 1.9438075238187914e-06, "loss": 0.3461, "step": 473 }, { "epoch": 0.13, "grad_norm": 2.5142573134371973, "learning_rate": 1.9435038010228583e-06, "loss": 0.3577, "step": 474 }, { "epoch": 0.13, "grad_norm": 2.6489862422940784, "learning_rate": 1.9431992834786925e-06, "loss": 0.3052, "step": 475 }, { "epoch": 0.13, "grad_norm": 16.540757693889148, "learning_rate": 1.9428939714428008e-06, "loss": 0.3543, "step": 476 }, { "epoch": 0.14, "grad_norm": 2.630121182626526, "learning_rate": 1.9425878651723587e-06, "loss": 0.3327, "step": 477 }, { "epoch": 0.14, "grad_norm": 2.6496095013669674, "learning_rate": 1.9422809649252107e-06, "loss": 0.3809, "step": 478 }, { "epoch": 0.14, "grad_norm": 2.5789313072631086, "learning_rate": 1.9419732709598705e-06, "loss": 0.3517, "step": 479 }, { "epoch": 0.14, "grad_norm": 2.5779662665291494, "learning_rate": 1.94166478353552e-06, "loss": 0.3393, "step": 480 }, { "epoch": 0.14, "grad_norm": 2.463068866924456, "learning_rate": 1.9413555029120096e-06, "loss": 0.3126, "step": 481 }, { "epoch": 0.14, "grad_norm": 2.4759062085258847, "learning_rate": 1.9410454293498573e-06, "loss": 0.3286, "step": 482 }, { "epoch": 0.14, "grad_norm": 2.376938714024481, "learning_rate": 1.9407345631102507e-06, "loss": 0.3082, "step": 483 }, { "epoch": 0.14, "grad_norm": 2.480444918068485, "learning_rate": 1.940422904455043e-06, "loss": 0.3557, "step": 484 }, { "epoch": 0.14, "grad_norm": 2.653839263450258, "learning_rate": 1.9401104536467562e-06, "loss": 0.3158, "step": 485 }, { "epoch": 0.14, "grad_norm": 2.6730631024757936, "learning_rate": 1.93979721094858e-06, "loss": 0.3582, "step": 486 }, { "epoch": 0.14, "grad_norm": 2.516049593333345, "learning_rate": 1.9394831766243685e-06, "loss": 0.3389, "step": 487 }, { "epoch": 0.14, "grad_norm": 2.8083636392126374, "learning_rate": 1.9391683509386457e-06, "loss": 0.3247, "step": 488 }, { "epoch": 0.14, "grad_norm": 2.2678950733089147, "learning_rate": 1.9388527341566008e-06, "loss": 0.3186, "step": 489 }, { "epoch": 0.14, "grad_norm": 2.3716071271350714, "learning_rate": 1.9385363265440895e-06, "loss": 0.2982, "step": 490 }, { "epoch": 0.14, "grad_norm": 2.4167013747271455, "learning_rate": 1.9382191283676333e-06, "loss": 0.3442, "step": 491 }, { "epoch": 0.14, "grad_norm": 2.4025041454549023, "learning_rate": 1.9379011398944207e-06, "loss": 0.3188, "step": 492 }, { "epoch": 0.14, "grad_norm": 2.515862732197645, "learning_rate": 1.9375823613923047e-06, "loss": 0.3273, "step": 493 }, { "epoch": 0.14, "grad_norm": 2.613914145600926, "learning_rate": 1.937262793129804e-06, "loss": 0.3309, "step": 494 }, { "epoch": 0.14, "grad_norm": 2.578527306014049, "learning_rate": 1.9369424353761033e-06, "loss": 0.3301, "step": 495 }, { "epoch": 0.14, "grad_norm": 2.536979840648795, "learning_rate": 1.936621288401052e-06, "loss": 0.3524, "step": 496 }, { "epoch": 0.14, "grad_norm": 2.537778567527244, "learning_rate": 1.9362993524751632e-06, "loss": 0.3478, "step": 497 }, { "epoch": 0.14, "grad_norm": 2.767407350887436, "learning_rate": 1.9359766278696163e-06, "loss": 0.3498, "step": 498 }, { "epoch": 0.14, "grad_norm": 2.3407625140305086, "learning_rate": 1.9356531148562537e-06, "loss": 0.2906, "step": 499 }, { "epoch": 0.14, "grad_norm": 2.21835283736995, "learning_rate": 1.9353288137075827e-06, "loss": 0.3053, "step": 500 }, { "epoch": 0.14, "grad_norm": 2.6764139240001645, "learning_rate": 1.935003724696774e-06, "loss": 0.3378, "step": 501 }, { "epoch": 0.14, "grad_norm": 2.848173934867263, "learning_rate": 1.9346778480976625e-06, "loss": 0.3188, "step": 502 }, { "epoch": 0.14, "grad_norm": 2.721824420607853, "learning_rate": 1.9343511841847455e-06, "loss": 0.3694, "step": 503 }, { "epoch": 0.14, "grad_norm": 2.4137958474476844, "learning_rate": 1.9340237332331844e-06, "loss": 0.3081, "step": 504 }, { "epoch": 0.14, "grad_norm": 2.304436601106214, "learning_rate": 1.933695495518804e-06, "loss": 0.3085, "step": 505 }, { "epoch": 0.14, "grad_norm": 2.660920458664747, "learning_rate": 1.9333664713180897e-06, "loss": 0.3328, "step": 506 }, { "epoch": 0.14, "grad_norm": 2.675836492265816, "learning_rate": 1.933036660908192e-06, "loss": 0.3514, "step": 507 }, { "epoch": 0.14, "grad_norm": 2.592652582435975, "learning_rate": 1.932706064566922e-06, "loss": 0.3551, "step": 508 }, { "epoch": 0.14, "grad_norm": 2.45395931857169, "learning_rate": 1.932374682572753e-06, "loss": 0.3036, "step": 509 }, { "epoch": 0.14, "grad_norm": 2.495991500842149, "learning_rate": 1.9320425152048202e-06, "loss": 0.3299, "step": 510 }, { "epoch": 0.14, "grad_norm": 2.5722593523020048, "learning_rate": 1.9317095627429214e-06, "loss": 0.3039, "step": 511 }, { "epoch": 0.15, "grad_norm": 2.699696114708531, "learning_rate": 1.931375825467514e-06, "loss": 0.3371, "step": 512 }, { "epoch": 0.15, "grad_norm": 2.550768314398241, "learning_rate": 1.9310413036597178e-06, "loss": 0.336, "step": 513 }, { "epoch": 0.15, "grad_norm": 2.415423337002907, "learning_rate": 1.9307059976013125e-06, "loss": 0.3084, "step": 514 }, { "epoch": 0.15, "grad_norm": 3.3333411433452578, "learning_rate": 1.930369907574739e-06, "loss": 0.3311, "step": 515 }, { "epoch": 0.15, "grad_norm": 2.6221986764230607, "learning_rate": 1.9300330338630982e-06, "loss": 0.3579, "step": 516 }, { "epoch": 0.15, "grad_norm": 2.6611984074627193, "learning_rate": 1.929695376750152e-06, "loss": 0.3324, "step": 517 }, { "epoch": 0.15, "grad_norm": 2.3464416133083468, "learning_rate": 1.9293569365203202e-06, "loss": 0.3218, "step": 518 }, { "epoch": 0.15, "grad_norm": 2.397852657010271, "learning_rate": 1.9290177134586847e-06, "loss": 0.3483, "step": 519 }, { "epoch": 0.15, "grad_norm": 2.507353885884589, "learning_rate": 1.9286777078509856e-06, "loss": 0.3258, "step": 520 }, { "epoch": 0.15, "grad_norm": 2.5193974148260367, "learning_rate": 1.928336919983622e-06, "loss": 0.3549, "step": 521 }, { "epoch": 0.15, "grad_norm": 2.5488782265641943, "learning_rate": 1.9279953501436516e-06, "loss": 0.3011, "step": 522 }, { "epoch": 0.15, "grad_norm": 2.5085603852646186, "learning_rate": 1.927652998618792e-06, "loss": 0.322, "step": 523 }, { "epoch": 0.15, "grad_norm": 2.4263847131574203, "learning_rate": 1.927309865697419e-06, "loss": 0.3101, "step": 524 }, { "epoch": 0.15, "grad_norm": 2.3616115639087862, "learning_rate": 1.926965951668565e-06, "loss": 0.3095, "step": 525 }, { "epoch": 0.15, "grad_norm": 2.594453166745246, "learning_rate": 1.926621256821922e-06, "loss": 0.3514, "step": 526 }, { "epoch": 0.15, "grad_norm": 2.567525367017643, "learning_rate": 1.9262757814478397e-06, "loss": 0.3423, "step": 527 }, { "epoch": 0.15, "grad_norm": 2.4618364897492793, "learning_rate": 1.925929525837324e-06, "loss": 0.3388, "step": 528 }, { "epoch": 0.15, "grad_norm": 2.622140958588784, "learning_rate": 1.92558249028204e-06, "loss": 0.3613, "step": 529 }, { "epoch": 0.15, "grad_norm": 2.45449141356188, "learning_rate": 1.925234675074308e-06, "loss": 0.3137, "step": 530 }, { "epoch": 0.15, "grad_norm": 2.3719475881683727, "learning_rate": 1.9248860805071054e-06, "loss": 0.3193, "step": 531 }, { "epoch": 0.15, "grad_norm": 2.489830142014237, "learning_rate": 1.924536706874066e-06, "loss": 0.3379, "step": 532 }, { "epoch": 0.15, "grad_norm": 2.4421392911819737, "learning_rate": 1.9241865544694814e-06, "loss": 0.3191, "step": 533 }, { "epoch": 0.15, "grad_norm": 3.157619123600503, "learning_rate": 1.923835623588297e-06, "loss": 0.3257, "step": 534 }, { "epoch": 0.15, "grad_norm": 2.385666421992085, "learning_rate": 1.9234839145261152e-06, "loss": 0.2854, "step": 535 }, { "epoch": 0.15, "grad_norm": 2.9066780150364364, "learning_rate": 1.923131427579193e-06, "loss": 0.3441, "step": 536 }, { "epoch": 0.15, "grad_norm": 2.573671704791437, "learning_rate": 1.9227781630444444e-06, "loss": 0.3418, "step": 537 }, { "epoch": 0.15, "grad_norm": 2.395871838099826, "learning_rate": 1.9224241212194363e-06, "loss": 0.2987, "step": 538 }, { "epoch": 0.15, "grad_norm": 2.3230103292952684, "learning_rate": 1.9220693024023915e-06, "loss": 0.3357, "step": 539 }, { "epoch": 0.15, "grad_norm": 2.486607024771495, "learning_rate": 1.921713706892187e-06, "loss": 0.352, "step": 540 }, { "epoch": 0.15, "grad_norm": 2.395039371562957, "learning_rate": 1.9213573349883544e-06, "loss": 0.3222, "step": 541 }, { "epoch": 0.15, "grad_norm": 2.3744101515616243, "learning_rate": 1.9210001869910785e-06, "loss": 0.297, "step": 542 }, { "epoch": 0.15, "grad_norm": 2.8587800610376006, "learning_rate": 1.9206422632011987e-06, "loss": 0.3287, "step": 543 }, { "epoch": 0.15, "grad_norm": 3.3743691210117297, "learning_rate": 1.920283563920207e-06, "loss": 0.3331, "step": 544 }, { "epoch": 0.15, "grad_norm": 2.5527534687842115, "learning_rate": 1.9199240894502497e-06, "loss": 0.3361, "step": 545 }, { "epoch": 0.15, "grad_norm": 2.6134344899024025, "learning_rate": 1.919563840094125e-06, "loss": 0.3363, "step": 546 }, { "epoch": 0.15, "grad_norm": 2.321694668053294, "learning_rate": 1.9192028161552843e-06, "loss": 0.3191, "step": 547 }, { "epoch": 0.16, "grad_norm": 2.5342449599959207, "learning_rate": 1.918841017937832e-06, "loss": 0.3174, "step": 548 }, { "epoch": 0.16, "grad_norm": 2.413833093027609, "learning_rate": 1.9184784457465236e-06, "loss": 0.3133, "step": 549 }, { "epoch": 0.16, "grad_norm": 2.6117314393101876, "learning_rate": 1.918115099886767e-06, "loss": 0.2876, "step": 550 }, { "epoch": 0.16, "grad_norm": 2.5391976887583554, "learning_rate": 1.9177509806646224e-06, "loss": 0.334, "step": 551 }, { "epoch": 0.16, "grad_norm": 2.4016170632334592, "learning_rate": 1.9173860883868005e-06, "loss": 0.334, "step": 552 }, { "epoch": 0.16, "grad_norm": 3.6440124440068837, "learning_rate": 1.9170204233606638e-06, "loss": 0.3077, "step": 553 }, { "epoch": 0.16, "grad_norm": 2.482547368266431, "learning_rate": 1.9166539858942254e-06, "loss": 0.3164, "step": 554 }, { "epoch": 0.16, "grad_norm": 2.4041117231577642, "learning_rate": 1.9162867762961495e-06, "loss": 0.328, "step": 555 }, { "epoch": 0.16, "grad_norm": 2.564868045958623, "learning_rate": 1.91591879487575e-06, "loss": 0.3859, "step": 556 }, { "epoch": 0.16, "grad_norm": 2.4954454983835226, "learning_rate": 1.9155500419429915e-06, "loss": 0.3196, "step": 557 }, { "epoch": 0.16, "grad_norm": 2.2622655946584604, "learning_rate": 1.9151805178084877e-06, "loss": 0.2938, "step": 558 }, { "epoch": 0.16, "grad_norm": 2.5314647358932616, "learning_rate": 1.9148102227835032e-06, "loss": 0.3244, "step": 559 }, { "epoch": 0.16, "grad_norm": 2.378307639689325, "learning_rate": 1.9144391571799508e-06, "loss": 0.3212, "step": 560 }, { "epoch": 0.16, "grad_norm": 2.486662946691639, "learning_rate": 1.914067321310393e-06, "loss": 0.3252, "step": 561 }, { "epoch": 0.16, "grad_norm": 2.5296589870880304, "learning_rate": 1.9136947154880413e-06, "loss": 0.356, "step": 562 }, { "epoch": 0.16, "grad_norm": 2.308279577860115, "learning_rate": 1.9133213400267547e-06, "loss": 0.2777, "step": 563 }, { "epoch": 0.16, "grad_norm": 2.5577799075870113, "learning_rate": 1.9129471952410416e-06, "loss": 0.316, "step": 564 }, { "epoch": 0.16, "grad_norm": 2.4022008859887922, "learning_rate": 1.912572281446058e-06, "loss": 0.293, "step": 565 }, { "epoch": 0.16, "grad_norm": 2.7900327421429663, "learning_rate": 1.9121965989576074e-06, "loss": 0.3221, "step": 566 }, { "epoch": 0.16, "grad_norm": 2.302655008983288, "learning_rate": 1.9118201480921414e-06, "loss": 0.2883, "step": 567 }, { "epoch": 0.16, "grad_norm": 2.7137787292734403, "learning_rate": 1.911442929166758e-06, "loss": 0.3416, "step": 568 }, { "epoch": 0.16, "grad_norm": 2.4111683606362626, "learning_rate": 1.911064942499204e-06, "loss": 0.2932, "step": 569 }, { "epoch": 0.16, "grad_norm": 2.5828661156738106, "learning_rate": 1.91068618840787e-06, "loss": 0.341, "step": 570 }, { "epoch": 0.16, "grad_norm": 2.589921775852251, "learning_rate": 1.9103066672117954e-06, "loss": 0.3259, "step": 571 }, { "epoch": 0.16, "grad_norm": 2.6417147099420015, "learning_rate": 1.909926379230665e-06, "loss": 0.376, "step": 572 }, { "epoch": 0.16, "grad_norm": 2.388934604814071, "learning_rate": 1.9095453247848097e-06, "loss": 0.3073, "step": 573 }, { "epoch": 0.16, "grad_norm": 2.6524201982252116, "learning_rate": 1.909163504195205e-06, "loss": 0.3319, "step": 574 }, { "epoch": 0.16, "grad_norm": 2.4339126604553596, "learning_rate": 1.9087809177834733e-06, "loss": 0.3259, "step": 575 }, { "epoch": 0.16, "grad_norm": 2.5826745669652147, "learning_rate": 1.9083975658718804e-06, "loss": 0.362, "step": 576 }, { "epoch": 0.16, "grad_norm": 2.5029175314188823, "learning_rate": 1.908013448783339e-06, "loss": 0.3201, "step": 577 }, { "epoch": 0.16, "grad_norm": 2.3953466865243507, "learning_rate": 1.9076285668414042e-06, "loss": 0.3018, "step": 578 }, { "epoch": 0.16, "grad_norm": 2.432260404365039, "learning_rate": 1.907242920370277e-06, "loss": 0.3351, "step": 579 }, { "epoch": 0.16, "grad_norm": 2.421859217898423, "learning_rate": 1.9068565096948014e-06, "loss": 0.3264, "step": 580 }, { "epoch": 0.16, "grad_norm": 2.6404787924858732, "learning_rate": 1.9064693351404655e-06, "loss": 0.3268, "step": 581 }, { "epoch": 0.16, "grad_norm": 2.8951196157160384, "learning_rate": 1.9060813970334006e-06, "loss": 0.3217, "step": 582 }, { "epoch": 0.17, "grad_norm": 2.5423693712583666, "learning_rate": 1.9056926957003818e-06, "loss": 0.325, "step": 583 }, { "epoch": 0.17, "grad_norm": 2.6348173425693346, "learning_rate": 1.9053032314688261e-06, "loss": 0.3266, "step": 584 }, { "epoch": 0.17, "grad_norm": 2.335526260808841, "learning_rate": 1.904913004666794e-06, "loss": 0.3154, "step": 585 }, { "epoch": 0.17, "grad_norm": 2.379364811568399, "learning_rate": 1.904522015622988e-06, "loss": 0.3142, "step": 586 }, { "epoch": 0.17, "grad_norm": 3.378276249403308, "learning_rate": 1.9041302646667526e-06, "loss": 0.3054, "step": 587 }, { "epoch": 0.17, "grad_norm": 2.6371331096990946, "learning_rate": 1.903737752128074e-06, "loss": 0.3344, "step": 588 }, { "epoch": 0.17, "grad_norm": 2.567548383563427, "learning_rate": 1.9033444783375804e-06, "loss": 0.313, "step": 589 }, { "epoch": 0.17, "grad_norm": 2.666816454240892, "learning_rate": 1.9029504436265405e-06, "loss": 0.3282, "step": 590 }, { "epoch": 0.17, "grad_norm": 2.6100809481220226, "learning_rate": 1.9025556483268646e-06, "loss": 0.3456, "step": 591 }, { "epoch": 0.17, "grad_norm": 2.465309639776073, "learning_rate": 1.9021600927711035e-06, "loss": 0.3375, "step": 592 }, { "epoch": 0.17, "grad_norm": 2.3866245619525572, "learning_rate": 1.901763777292448e-06, "loss": 0.3103, "step": 593 }, { "epoch": 0.17, "grad_norm": 2.506109281722593, "learning_rate": 1.9013667022247295e-06, "loss": 0.3546, "step": 594 }, { "epoch": 0.17, "grad_norm": 2.4005233786171747, "learning_rate": 1.9009688679024189e-06, "loss": 0.3456, "step": 595 }, { "epoch": 0.17, "grad_norm": 2.252931620591618, "learning_rate": 1.900570274660627e-06, "loss": 0.3204, "step": 596 }, { "epoch": 0.17, "grad_norm": 2.5630832843404354, "learning_rate": 1.900170922835104e-06, "loss": 0.3014, "step": 597 }, { "epoch": 0.17, "grad_norm": 2.776046404382503, "learning_rate": 1.899770812762238e-06, "loss": 0.3166, "step": 598 }, { "epoch": 0.17, "grad_norm": 2.4769903180988444, "learning_rate": 1.8993699447790573e-06, "loss": 0.3285, "step": 599 }, { "epoch": 0.17, "grad_norm": 2.6638868801905664, "learning_rate": 1.8989683192232274e-06, "loss": 0.3336, "step": 600 }, { "epoch": 0.17, "grad_norm": 2.553206602922454, "learning_rate": 1.898565936433052e-06, "loss": 0.3442, "step": 601 }, { "epoch": 0.17, "grad_norm": 2.5268981785339535, "learning_rate": 1.8981627967474738e-06, "loss": 0.3122, "step": 602 }, { "epoch": 0.17, "grad_norm": 2.5220239590274005, "learning_rate": 1.8977589005060722e-06, "loss": 0.3039, "step": 603 }, { "epoch": 0.17, "grad_norm": 2.31394471539144, "learning_rate": 1.8973542480490634e-06, "loss": 0.3123, "step": 604 }, { "epoch": 0.17, "grad_norm": 2.391150021660033, "learning_rate": 1.8969488397173018e-06, "loss": 0.3363, "step": 605 }, { "epoch": 0.17, "grad_norm": 2.472462582484426, "learning_rate": 1.8965426758522779e-06, "loss": 0.306, "step": 606 }, { "epoch": 0.17, "grad_norm": 2.587624273773328, "learning_rate": 1.8961357567961178e-06, "loss": 0.3533, "step": 607 }, { "epoch": 0.17, "grad_norm": 2.347674821776924, "learning_rate": 1.8957280828915853e-06, "loss": 0.3416, "step": 608 }, { "epoch": 0.17, "grad_norm": 2.36594618155158, "learning_rate": 1.8953196544820789e-06, "loss": 0.3117, "step": 609 }, { "epoch": 0.17, "grad_norm": 2.3850526329845225, "learning_rate": 1.894910471911633e-06, "loss": 0.3458, "step": 610 }, { "epoch": 0.17, "grad_norm": 2.435275666201602, "learning_rate": 1.8945005355249175e-06, "loss": 0.3243, "step": 611 }, { "epoch": 0.17, "grad_norm": 2.4942240390673143, "learning_rate": 1.8940898456672368e-06, "loss": 0.3712, "step": 612 }, { "epoch": 0.17, "grad_norm": 2.6755213116047805, "learning_rate": 1.89367840268453e-06, "loss": 0.3011, "step": 613 }, { "epoch": 0.17, "grad_norm": 2.5560943374584237, "learning_rate": 1.8932662069233713e-06, "loss": 0.3243, "step": 614 }, { "epoch": 0.17, "grad_norm": 2.3265956737861715, "learning_rate": 1.892853258730968e-06, "loss": 0.3042, "step": 615 }, { "epoch": 0.17, "grad_norm": 2.387187177414009, "learning_rate": 1.892439558455162e-06, "loss": 0.3214, "step": 616 }, { "epoch": 0.17, "grad_norm": 2.592245862879042, "learning_rate": 1.892025106444428e-06, "loss": 0.3066, "step": 617 }, { "epoch": 0.18, "grad_norm": 2.334983691084106, "learning_rate": 1.8916099030478746e-06, "loss": 0.322, "step": 618 }, { "epoch": 0.18, "grad_norm": 2.4200119260792414, "learning_rate": 1.891193948615243e-06, "loss": 0.3414, "step": 619 }, { "epoch": 0.18, "grad_norm": 2.341872649046259, "learning_rate": 1.890777243496907e-06, "loss": 0.3028, "step": 620 }, { "epoch": 0.18, "grad_norm": 2.699959166733786, "learning_rate": 1.8903597880438727e-06, "loss": 0.3372, "step": 621 }, { "epoch": 0.18, "grad_norm": 2.6037992641562195, "learning_rate": 1.8899415826077783e-06, "loss": 0.3287, "step": 622 }, { "epoch": 0.18, "grad_norm": 2.5600828901372816, "learning_rate": 1.8895226275408937e-06, "loss": 0.3081, "step": 623 }, { "epoch": 0.18, "grad_norm": 2.47503393467837, "learning_rate": 1.8891029231961207e-06, "loss": 0.3134, "step": 624 }, { "epoch": 0.18, "grad_norm": 2.475124180370412, "learning_rate": 1.8886824699269912e-06, "loss": 0.3327, "step": 625 }, { "epoch": 0.18, "grad_norm": 2.4558851933431, "learning_rate": 1.8882612680876689e-06, "loss": 0.3099, "step": 626 }, { "epoch": 0.18, "grad_norm": 2.4377750490408023, "learning_rate": 1.887839318032948e-06, "loss": 0.3065, "step": 627 }, { "epoch": 0.18, "grad_norm": 2.4888520087900017, "learning_rate": 1.8874166201182522e-06, "loss": 0.2961, "step": 628 }, { "epoch": 0.18, "grad_norm": 2.429511339842882, "learning_rate": 1.8869931746996358e-06, "loss": 0.2766, "step": 629 }, { "epoch": 0.18, "grad_norm": 2.517207986809363, "learning_rate": 1.8865689821337825e-06, "loss": 0.3325, "step": 630 }, { "epoch": 0.18, "grad_norm": 2.5414615608781967, "learning_rate": 1.8861440427780058e-06, "loss": 0.3545, "step": 631 }, { "epoch": 0.18, "grad_norm": 6.389922704628554, "learning_rate": 1.8857183569902473e-06, "loss": 0.3134, "step": 632 }, { "epoch": 0.18, "grad_norm": 2.469158788196905, "learning_rate": 1.8852919251290783e-06, "loss": 0.3327, "step": 633 }, { "epoch": 0.18, "grad_norm": 2.4225156287436476, "learning_rate": 1.884864747553698e-06, "loss": 0.3232, "step": 634 }, { "epoch": 0.18, "grad_norm": 2.541582435994063, "learning_rate": 1.884436824623934e-06, "loss": 0.3067, "step": 635 }, { "epoch": 0.18, "grad_norm": 2.324430056321435, "learning_rate": 1.8840081567002417e-06, "loss": 0.3344, "step": 636 }, { "epoch": 0.18, "grad_norm": 2.4445507644563325, "learning_rate": 1.883578744143704e-06, "loss": 0.322, "step": 637 }, { "epoch": 0.18, "grad_norm": 2.398784855142958, "learning_rate": 1.8831485873160312e-06, "loss": 0.2996, "step": 638 }, { "epoch": 0.18, "grad_norm": 2.490567840851795, "learning_rate": 1.8827176865795596e-06, "loss": 0.3261, "step": 639 }, { "epoch": 0.18, "grad_norm": 2.736732509195614, "learning_rate": 1.8822860422972534e-06, "loss": 0.3633, "step": 640 }, { "epoch": 0.18, "grad_norm": 2.298233280198889, "learning_rate": 1.8818536548327026e-06, "loss": 0.3252, "step": 641 }, { "epoch": 0.18, "grad_norm": 2.381279510885023, "learning_rate": 1.8814205245501234e-06, "loss": 0.3223, "step": 642 }, { "epoch": 0.18, "grad_norm": 2.6425786131254654, "learning_rate": 1.880986651814357e-06, "loss": 0.2965, "step": 643 }, { "epoch": 0.18, "grad_norm": 2.608150395709567, "learning_rate": 1.8805520369908705e-06, "loss": 0.3117, "step": 644 }, { "epoch": 0.18, "grad_norm": 2.528769452573672, "learning_rate": 1.8801166804457568e-06, "loss": 0.3365, "step": 645 }, { "epoch": 0.18, "grad_norm": 2.593082547173156, "learning_rate": 1.879680582545732e-06, "loss": 0.3417, "step": 646 }, { "epoch": 0.18, "grad_norm": 2.235518751649165, "learning_rate": 1.879243743658138e-06, "loss": 0.2968, "step": 647 }, { "epoch": 0.18, "grad_norm": 2.512840027266576, "learning_rate": 1.8788061641509398e-06, "loss": 0.331, "step": 648 }, { "epoch": 0.18, "grad_norm": 2.623875669173926, "learning_rate": 1.878367844392728e-06, "loss": 0.3278, "step": 649 }, { "epoch": 0.18, "grad_norm": 2.613013341049209, "learning_rate": 1.8779287847527146e-06, "loss": 0.3081, "step": 650 }, { "epoch": 0.18, "grad_norm": 2.3967165560552526, "learning_rate": 1.877488985600736e-06, "loss": 0.2915, "step": 651 }, { "epoch": 0.18, "grad_norm": 2.461584837861662, "learning_rate": 1.8770484473072517e-06, "loss": 0.3174, "step": 652 }, { "epoch": 0.19, "grad_norm": 2.5909734569213563, "learning_rate": 1.8766071702433427e-06, "loss": 0.3462, "step": 653 }, { "epoch": 0.19, "grad_norm": 2.3003987637657093, "learning_rate": 1.8761651547807142e-06, "loss": 0.2864, "step": 654 }, { "epoch": 0.19, "grad_norm": 2.3914632891284517, "learning_rate": 1.875722401291691e-06, "loss": 0.3229, "step": 655 }, { "epoch": 0.19, "grad_norm": 2.745394637477216, "learning_rate": 1.8752789101492214e-06, "loss": 0.3379, "step": 656 }, { "epoch": 0.19, "grad_norm": 2.637442893764372, "learning_rate": 1.8748346817268745e-06, "loss": 0.2811, "step": 657 }, { "epoch": 0.19, "grad_norm": 2.7011526479877643, "learning_rate": 1.87438971639884e-06, "loss": 0.3486, "step": 658 }, { "epoch": 0.19, "grad_norm": 2.37855781182186, "learning_rate": 1.8739440145399293e-06, "loss": 0.3502, "step": 659 }, { "epoch": 0.19, "grad_norm": 2.4520047039317117, "learning_rate": 1.873497576525573e-06, "loss": 0.3201, "step": 660 }, { "epoch": 0.19, "grad_norm": 2.634743516274746, "learning_rate": 1.873050402731822e-06, "loss": 0.32, "step": 661 }, { "epoch": 0.19, "grad_norm": 2.5434914673155924, "learning_rate": 1.8726024935353487e-06, "loss": 0.3073, "step": 662 }, { "epoch": 0.19, "grad_norm": 2.405470557161396, "learning_rate": 1.8721538493134425e-06, "loss": 0.3465, "step": 663 }, { "epoch": 0.19, "grad_norm": 2.5890725900604377, "learning_rate": 1.8717044704440137e-06, "loss": 0.3216, "step": 664 }, { "epoch": 0.19, "grad_norm": 2.6171547730436733, "learning_rate": 1.8712543573055903e-06, "loss": 0.3311, "step": 665 }, { "epoch": 0.19, "grad_norm": 2.6274679971113475, "learning_rate": 1.8708035102773196e-06, "loss": 0.3092, "step": 666 }, { "epoch": 0.19, "grad_norm": 3.0832936101998927, "learning_rate": 1.8703519297389667e-06, "loss": 0.3331, "step": 667 }, { "epoch": 0.19, "grad_norm": 2.370456160821706, "learning_rate": 1.8698996160709146e-06, "loss": 0.2874, "step": 668 }, { "epoch": 0.19, "grad_norm": 2.4773772224277013, "learning_rate": 1.8694465696541639e-06, "loss": 0.32, "step": 669 }, { "epoch": 0.19, "grad_norm": 2.4624072608294227, "learning_rate": 1.8689927908703322e-06, "loss": 0.3001, "step": 670 }, { "epoch": 0.19, "grad_norm": 2.4253560050409084, "learning_rate": 1.8685382801016547e-06, "loss": 0.3164, "step": 671 }, { "epoch": 0.19, "grad_norm": 2.5349779292713572, "learning_rate": 1.868083037730982e-06, "loss": 0.3408, "step": 672 }, { "epoch": 0.19, "grad_norm": 2.98022002253498, "learning_rate": 1.8676270641417821e-06, "loss": 0.3569, "step": 673 }, { "epoch": 0.19, "grad_norm": 2.4999640201078237, "learning_rate": 1.8671703597181383e-06, "loss": 0.326, "step": 674 }, { "epoch": 0.19, "grad_norm": 2.602274939593711, "learning_rate": 1.8667129248447497e-06, "loss": 0.3487, "step": 675 }, { "epoch": 0.19, "grad_norm": 2.650254061550966, "learning_rate": 1.8662547599069308e-06, "loss": 0.3179, "step": 676 }, { "epoch": 0.19, "grad_norm": 2.6539404492903316, "learning_rate": 1.8657958652906106e-06, "loss": 0.3066, "step": 677 }, { "epoch": 0.19, "grad_norm": 2.4099554187480887, "learning_rate": 1.8653362413823331e-06, "loss": 0.3084, "step": 678 }, { "epoch": 0.19, "grad_norm": 2.611577650482648, "learning_rate": 1.8648758885692569e-06, "loss": 0.3539, "step": 679 }, { "epoch": 0.19, "grad_norm": 2.2722703116941463, "learning_rate": 1.8644148072391537e-06, "loss": 0.3013, "step": 680 }, { "epoch": 0.19, "grad_norm": 2.641577905268663, "learning_rate": 1.86395299778041e-06, "loss": 0.3231, "step": 681 }, { "epoch": 0.19, "grad_norm": 3.2759503960375165, "learning_rate": 1.8634904605820244e-06, "loss": 0.3255, "step": 682 }, { "epoch": 0.19, "grad_norm": 2.4768927319494694, "learning_rate": 1.8630271960336096e-06, "loss": 0.3297, "step": 683 }, { "epoch": 0.19, "grad_norm": 2.5489379391148748, "learning_rate": 1.8625632045253905e-06, "loss": 0.3336, "step": 684 }, { "epoch": 0.19, "grad_norm": 2.67838618655016, "learning_rate": 1.8620984864482042e-06, "loss": 0.3079, "step": 685 }, { "epoch": 0.19, "grad_norm": 2.586620003188253, "learning_rate": 1.8616330421935001e-06, "loss": 0.3386, "step": 686 }, { "epoch": 0.19, "grad_norm": 2.339499160262079, "learning_rate": 1.861166872153339e-06, "loss": 0.3196, "step": 687 }, { "epoch": 0.19, "grad_norm": 2.3721624029710053, "learning_rate": 1.860699976720393e-06, "loss": 0.3084, "step": 688 }, { "epoch": 0.2, "grad_norm": 2.4086625407569775, "learning_rate": 1.8602323562879461e-06, "loss": 0.3253, "step": 689 }, { "epoch": 0.2, "grad_norm": 2.599196154030309, "learning_rate": 1.8597640112498914e-06, "loss": 0.3298, "step": 690 }, { "epoch": 0.2, "grad_norm": 2.439632928413326, "learning_rate": 1.859294942000734e-06, "loss": 0.3522, "step": 691 }, { "epoch": 0.2, "grad_norm": 2.2742388509467504, "learning_rate": 1.8588251489355882e-06, "loss": 0.3085, "step": 692 }, { "epoch": 0.2, "grad_norm": 2.6673271769516163, "learning_rate": 1.8583546324501781e-06, "loss": 0.2883, "step": 693 }, { "epoch": 0.2, "grad_norm": 2.517797219208855, "learning_rate": 1.857883392940837e-06, "loss": 0.3105, "step": 694 }, { "epoch": 0.2, "grad_norm": 2.386077222115691, "learning_rate": 1.8574114308045074e-06, "loss": 0.3316, "step": 695 }, { "epoch": 0.2, "grad_norm": 2.628302002056542, "learning_rate": 1.856938746438741e-06, "loss": 0.328, "step": 696 }, { "epoch": 0.2, "grad_norm": 2.368441850241122, "learning_rate": 1.8564653402416968e-06, "loss": 0.2803, "step": 697 }, { "epoch": 0.2, "grad_norm": 2.4692437770109708, "learning_rate": 1.8559912126121424e-06, "loss": 0.3359, "step": 698 }, { "epoch": 0.2, "grad_norm": 2.5386494618133373, "learning_rate": 1.8555163639494534e-06, "loss": 0.3291, "step": 699 }, { "epoch": 0.2, "grad_norm": 9.033454716840346, "learning_rate": 1.8550407946536123e-06, "loss": 0.3158, "step": 700 }, { "epoch": 0.2, "grad_norm": 2.5159179694463285, "learning_rate": 1.854564505125209e-06, "loss": 0.3187, "step": 701 }, { "epoch": 0.2, "grad_norm": 2.5230000058408466, "learning_rate": 1.8540874957654396e-06, "loss": 0.3, "step": 702 }, { "epoch": 0.2, "grad_norm": 2.7927177428869077, "learning_rate": 1.8536097669761064e-06, "loss": 0.3026, "step": 703 }, { "epoch": 0.2, "grad_norm": 2.4839005856698795, "learning_rate": 1.8531313191596186e-06, "loss": 0.3017, "step": 704 }, { "epoch": 0.2, "grad_norm": 2.800891099137517, "learning_rate": 1.8526521527189903e-06, "loss": 0.3359, "step": 705 }, { "epoch": 0.2, "grad_norm": 2.2846064092871186, "learning_rate": 1.8521722680578411e-06, "loss": 0.3122, "step": 706 }, { "epoch": 0.2, "grad_norm": 2.233088715040393, "learning_rate": 1.851691665580396e-06, "loss": 0.2861, "step": 707 }, { "epoch": 0.2, "grad_norm": 2.538036700824122, "learning_rate": 1.851210345691484e-06, "loss": 0.3084, "step": 708 }, { "epoch": 0.2, "grad_norm": 2.45913218863696, "learning_rate": 1.8507283087965387e-06, "loss": 0.3205, "step": 709 }, { "epoch": 0.2, "grad_norm": 2.5787057825261046, "learning_rate": 1.8502455553015976e-06, "loss": 0.3406, "step": 710 }, { "epoch": 0.2, "grad_norm": 2.8389237764621162, "learning_rate": 1.8497620856133019e-06, "loss": 0.352, "step": 711 }, { "epoch": 0.2, "grad_norm": 2.4819341553311967, "learning_rate": 1.8492779001388964e-06, "loss": 0.3211, "step": 712 }, { "epoch": 0.2, "grad_norm": 2.489319817396486, "learning_rate": 1.848792999286228e-06, "loss": 0.3089, "step": 713 }, { "epoch": 0.2, "grad_norm": 2.5388004718059367, "learning_rate": 1.8483073834637467e-06, "loss": 0.3115, "step": 714 }, { "epoch": 0.2, "grad_norm": 2.7326112989715554, "learning_rate": 1.847821053080505e-06, "loss": 0.3376, "step": 715 }, { "epoch": 0.2, "grad_norm": 2.5047349342201994, "learning_rate": 1.8473340085461567e-06, "loss": 0.3138, "step": 716 }, { "epoch": 0.2, "grad_norm": 2.804498503318447, "learning_rate": 1.8468462502709577e-06, "loss": 0.35, "step": 717 }, { "epoch": 0.2, "grad_norm": 2.439841721380218, "learning_rate": 1.8463577786657649e-06, "loss": 0.3395, "step": 718 }, { "epoch": 0.2, "grad_norm": 2.3401956581086893, "learning_rate": 1.8458685941420358e-06, "loss": 0.2785, "step": 719 }, { "epoch": 0.2, "grad_norm": 2.394895809355412, "learning_rate": 1.8453786971118287e-06, "loss": 0.3223, "step": 720 }, { "epoch": 0.2, "grad_norm": 2.4660553191758, "learning_rate": 1.8448880879878024e-06, "loss": 0.3619, "step": 721 }, { "epoch": 0.2, "grad_norm": 2.301475320635097, "learning_rate": 1.8443967671832148e-06, "loss": 0.2969, "step": 722 }, { "epoch": 0.2, "grad_norm": 3.0721556769159895, "learning_rate": 1.843904735111924e-06, "loss": 0.3455, "step": 723 }, { "epoch": 0.21, "grad_norm": 2.1798953175310274, "learning_rate": 1.8434119921883861e-06, "loss": 0.2835, "step": 724 }, { "epoch": 0.21, "grad_norm": 2.3614911044919316, "learning_rate": 1.8429185388276576e-06, "loss": 0.3089, "step": 725 }, { "epoch": 0.21, "grad_norm": 2.6367517629764587, "learning_rate": 1.8424243754453919e-06, "loss": 0.3234, "step": 726 }, { "epoch": 0.21, "grad_norm": 2.4323439863930796, "learning_rate": 1.8419295024578416e-06, "loss": 0.3071, "step": 727 }, { "epoch": 0.21, "grad_norm": 2.636030318990423, "learning_rate": 1.8414339202818562e-06, "loss": 0.3645, "step": 728 }, { "epoch": 0.21, "grad_norm": 2.5239856793906656, "learning_rate": 1.8409376293348834e-06, "loss": 0.299, "step": 729 }, { "epoch": 0.21, "grad_norm": 2.3291366949117034, "learning_rate": 1.840440630034967e-06, "loss": 0.3238, "step": 730 }, { "epoch": 0.21, "grad_norm": 2.3344355021750274, "learning_rate": 1.8399429228007483e-06, "loss": 0.2983, "step": 731 }, { "epoch": 0.21, "grad_norm": 2.5558353060456525, "learning_rate": 1.8394445080514642e-06, "loss": 0.2869, "step": 732 }, { "epoch": 0.21, "grad_norm": 2.4247971738192935, "learning_rate": 1.838945386206948e-06, "loss": 0.292, "step": 733 }, { "epoch": 0.21, "grad_norm": 2.5825090055477484, "learning_rate": 1.8384455576876288e-06, "loss": 0.3063, "step": 734 }, { "epoch": 0.21, "grad_norm": 2.296180432216192, "learning_rate": 1.8379450229145305e-06, "loss": 0.3052, "step": 735 }, { "epoch": 0.21, "grad_norm": 2.358017059445579, "learning_rate": 1.8374437823092722e-06, "loss": 0.259, "step": 736 }, { "epoch": 0.21, "grad_norm": 2.3676231371995446, "learning_rate": 1.8369418362940673e-06, "loss": 0.3186, "step": 737 }, { "epoch": 0.21, "grad_norm": 2.220845042724821, "learning_rate": 1.8364391852917235e-06, "loss": 0.3032, "step": 738 }, { "epoch": 0.21, "grad_norm": 2.726016115467376, "learning_rate": 1.8359358297256427e-06, "loss": 0.3386, "step": 739 }, { "epoch": 0.21, "grad_norm": 2.5838509942685897, "learning_rate": 1.8354317700198196e-06, "loss": 0.327, "step": 740 }, { "epoch": 0.21, "grad_norm": 2.6091104337763684, "learning_rate": 1.8349270065988427e-06, "loss": 0.3253, "step": 741 }, { "epoch": 0.21, "grad_norm": 2.714238298218788, "learning_rate": 1.8344215398878924e-06, "loss": 0.2946, "step": 742 }, { "epoch": 0.21, "grad_norm": 2.380436878366774, "learning_rate": 1.8339153703127428e-06, "loss": 0.3225, "step": 743 }, { "epoch": 0.21, "grad_norm": 2.2474161169587887, "learning_rate": 1.8334084982997586e-06, "loss": 0.2879, "step": 744 }, { "epoch": 0.21, "grad_norm": 2.4824008936563606, "learning_rate": 1.8329009242758975e-06, "loss": 0.3321, "step": 745 }, { "epoch": 0.21, "grad_norm": 2.387839320021625, "learning_rate": 1.8323926486687073e-06, "loss": 0.3115, "step": 746 }, { "epoch": 0.21, "grad_norm": 2.4975073563611065, "learning_rate": 1.8318836719063277e-06, "loss": 0.2931, "step": 747 }, { "epoch": 0.21, "grad_norm": 2.6069296054765014, "learning_rate": 1.831373994417489e-06, "loss": 0.3373, "step": 748 }, { "epoch": 0.21, "grad_norm": 2.6304583634758507, "learning_rate": 1.830863616631511e-06, "loss": 0.3244, "step": 749 }, { "epoch": 0.21, "grad_norm": 2.4945219891074273, "learning_rate": 1.830352538978304e-06, "loss": 0.3045, "step": 750 }, { "epoch": 0.21, "grad_norm": 2.5943190229679503, "learning_rate": 1.8298407618883677e-06, "loss": 0.3132, "step": 751 }, { "epoch": 0.21, "grad_norm": 2.451462512381113, "learning_rate": 1.8293282857927909e-06, "loss": 0.3306, "step": 752 }, { "epoch": 0.21, "grad_norm": 2.802381941279302, "learning_rate": 1.828815111123251e-06, "loss": 0.3325, "step": 753 }, { "epoch": 0.21, "grad_norm": 2.2958989566865715, "learning_rate": 1.8283012383120145e-06, "loss": 0.2997, "step": 754 }, { "epoch": 0.21, "grad_norm": 2.4742741886127257, "learning_rate": 1.827786667791935e-06, "loss": 0.3466, "step": 755 }, { "epoch": 0.21, "grad_norm": 2.3330203880683333, "learning_rate": 1.8272713999964546e-06, "loss": 0.2964, "step": 756 }, { "epoch": 0.21, "grad_norm": 2.372071762014812, "learning_rate": 1.8267554353596024e-06, "loss": 0.3035, "step": 757 }, { "epoch": 0.21, "grad_norm": 2.54065702879972, "learning_rate": 1.8262387743159948e-06, "loss": 0.3268, "step": 758 }, { "epoch": 0.22, "grad_norm": 2.3382390623791394, "learning_rate": 1.8257214173008344e-06, "loss": 0.3051, "step": 759 }, { "epoch": 0.22, "grad_norm": 2.4031055066754603, "learning_rate": 1.8252033647499099e-06, "loss": 0.3317, "step": 760 }, { "epoch": 0.22, "grad_norm": 2.4032306983074845, "learning_rate": 1.8246846170995961e-06, "loss": 0.3015, "step": 761 }, { "epoch": 0.22, "grad_norm": 3.1664023828151944, "learning_rate": 1.8241651747868541e-06, "loss": 0.3408, "step": 762 }, { "epoch": 0.22, "grad_norm": 2.4245747041798946, "learning_rate": 1.823645038249229e-06, "loss": 0.3301, "step": 763 }, { "epoch": 0.22, "grad_norm": 2.307622301322068, "learning_rate": 1.823124207924851e-06, "loss": 0.3273, "step": 764 }, { "epoch": 0.22, "grad_norm": 2.300710489645234, "learning_rate": 1.822602684252435e-06, "loss": 0.2982, "step": 765 }, { "epoch": 0.22, "grad_norm": 2.548236181983721, "learning_rate": 1.8220804676712794e-06, "loss": 0.3127, "step": 766 }, { "epoch": 0.22, "grad_norm": 2.405523364038282, "learning_rate": 1.8215575586212667e-06, "loss": 0.3216, "step": 767 }, { "epoch": 0.22, "grad_norm": 2.290131171904868, "learning_rate": 1.821033957542863e-06, "loss": 0.2858, "step": 768 }, { "epoch": 0.22, "grad_norm": 2.408680649010481, "learning_rate": 1.8205096648771163e-06, "loss": 0.3249, "step": 769 }, { "epoch": 0.22, "grad_norm": 2.194918371382472, "learning_rate": 1.8199846810656583e-06, "loss": 0.2824, "step": 770 }, { "epoch": 0.22, "grad_norm": 3.2665129154156993, "learning_rate": 1.819459006550702e-06, "loss": 0.3252, "step": 771 }, { "epoch": 0.22, "grad_norm": 2.4877798522357626, "learning_rate": 1.8189326417750426e-06, "loss": 0.2929, "step": 772 }, { "epoch": 0.22, "grad_norm": 2.4438405001800634, "learning_rate": 1.8184055871820565e-06, "loss": 0.3092, "step": 773 }, { "epoch": 0.22, "grad_norm": 2.471072912113562, "learning_rate": 1.8178778432157014e-06, "loss": 0.3262, "step": 774 }, { "epoch": 0.22, "grad_norm": 2.520311374184934, "learning_rate": 1.8173494103205158e-06, "loss": 0.2878, "step": 775 }, { "epoch": 0.22, "grad_norm": 2.4873941242668485, "learning_rate": 1.8168202889416182e-06, "loss": 0.2937, "step": 776 }, { "epoch": 0.22, "grad_norm": 2.4169244067397506, "learning_rate": 1.8162904795247074e-06, "loss": 0.3231, "step": 777 }, { "epoch": 0.22, "grad_norm": 2.4212965608339387, "learning_rate": 1.8157599825160607e-06, "loss": 0.3153, "step": 778 }, { "epoch": 0.22, "grad_norm": 2.58524729253949, "learning_rate": 1.8152287983625365e-06, "loss": 0.3141, "step": 779 }, { "epoch": 0.22, "grad_norm": 2.648579898819257, "learning_rate": 1.8146969275115701e-06, "loss": 0.3006, "step": 780 }, { "epoch": 0.22, "grad_norm": 2.4588512176786788, "learning_rate": 1.8141643704111767e-06, "loss": 0.3369, "step": 781 }, { "epoch": 0.22, "grad_norm": 3.388867011332001, "learning_rate": 1.8136311275099484e-06, "loss": 0.3207, "step": 782 }, { "epoch": 0.22, "grad_norm": 2.3763493849662827, "learning_rate": 1.8130971992570552e-06, "loss": 0.3433, "step": 783 }, { "epoch": 0.22, "grad_norm": 2.3814908591670236, "learning_rate": 1.8125625861022454e-06, "loss": 0.3218, "step": 784 }, { "epoch": 0.22, "grad_norm": 2.2169228847674773, "learning_rate": 1.812027288495843e-06, "loss": 0.2853, "step": 785 }, { "epoch": 0.22, "grad_norm": 2.5600068690152775, "learning_rate": 1.8114913068887493e-06, "loss": 0.3017, "step": 786 }, { "epoch": 0.22, "grad_norm": 2.3564897528452327, "learning_rate": 1.810954641732441e-06, "loss": 0.3025, "step": 787 }, { "epoch": 0.22, "grad_norm": 2.7185301584909722, "learning_rate": 1.8104172934789715e-06, "loss": 0.3002, "step": 788 }, { "epoch": 0.22, "grad_norm": 2.394601217070245, "learning_rate": 1.8098792625809689e-06, "loss": 0.3127, "step": 789 }, { "epoch": 0.22, "grad_norm": 2.919708059528672, "learning_rate": 1.8093405494916372e-06, "loss": 0.3342, "step": 790 }, { "epoch": 0.22, "grad_norm": 2.5018439752505226, "learning_rate": 1.8088011546647533e-06, "loss": 0.3036, "step": 791 }, { "epoch": 0.22, "grad_norm": 3.0270280866919808, "learning_rate": 1.8082610785546706e-06, "loss": 0.3237, "step": 792 }, { "epoch": 0.22, "grad_norm": 2.5871694707198345, "learning_rate": 1.8077203216163143e-06, "loss": 0.3229, "step": 793 }, { "epoch": 0.22, "grad_norm": 2.229119769181344, "learning_rate": 1.8071788843051848e-06, "loss": 0.2927, "step": 794 }, { "epoch": 0.23, "grad_norm": 2.2796518406154402, "learning_rate": 1.806636767077354e-06, "loss": 0.3144, "step": 795 }, { "epoch": 0.23, "grad_norm": 2.4914832529542728, "learning_rate": 1.8060939703894682e-06, "loss": 0.3326, "step": 796 }, { "epoch": 0.23, "grad_norm": 2.3781369431977692, "learning_rate": 1.8055504946987447e-06, "loss": 0.3312, "step": 797 }, { "epoch": 0.23, "grad_norm": 2.6910642279410624, "learning_rate": 1.8050063404629732e-06, "loss": 0.3322, "step": 798 }, { "epoch": 0.23, "grad_norm": 2.5232510110504314, "learning_rate": 1.8044615081405151e-06, "loss": 0.2967, "step": 799 }, { "epoch": 0.23, "grad_norm": 2.377745599667623, "learning_rate": 1.8039159981903027e-06, "loss": 0.289, "step": 800 }, { "epoch": 0.23, "grad_norm": 2.354831110869895, "learning_rate": 1.8033698110718394e-06, "loss": 0.3094, "step": 801 }, { "epoch": 0.23, "grad_norm": 2.5266561213394017, "learning_rate": 1.802822947245199e-06, "loss": 0.3095, "step": 802 }, { "epoch": 0.23, "grad_norm": 2.5283457627776005, "learning_rate": 1.8022754071710252e-06, "loss": 0.3077, "step": 803 }, { "epoch": 0.23, "grad_norm": 2.507549698347303, "learning_rate": 1.8017271913105306e-06, "loss": 0.3138, "step": 804 }, { "epoch": 0.23, "grad_norm": 2.625393318911374, "learning_rate": 1.8011783001254988e-06, "loss": 0.3556, "step": 805 }, { "epoch": 0.23, "grad_norm": 2.5556149198490132, "learning_rate": 1.8006287340782805e-06, "loss": 0.2912, "step": 806 }, { "epoch": 0.23, "grad_norm": 2.698644031831514, "learning_rate": 1.8000784936317957e-06, "loss": 0.2907, "step": 807 }, { "epoch": 0.23, "grad_norm": 2.4656465189485064, "learning_rate": 1.7995275792495324e-06, "loss": 0.2793, "step": 808 }, { "epoch": 0.23, "grad_norm": 2.411006777212944, "learning_rate": 1.7989759913955463e-06, "loss": 0.3178, "step": 809 }, { "epoch": 0.23, "grad_norm": 2.4414409226078426, "learning_rate": 1.7984237305344601e-06, "loss": 0.3266, "step": 810 }, { "epoch": 0.23, "grad_norm": 2.4708127483607822, "learning_rate": 1.7978707971314636e-06, "loss": 0.3158, "step": 811 }, { "epoch": 0.23, "grad_norm": 2.843055534073164, "learning_rate": 1.7973171916523131e-06, "loss": 0.2984, "step": 812 }, { "epoch": 0.23, "grad_norm": 2.4782911871163895, "learning_rate": 1.796762914563331e-06, "loss": 0.3261, "step": 813 }, { "epoch": 0.23, "grad_norm": 2.559804468307838, "learning_rate": 1.7962079663314058e-06, "loss": 0.3027, "step": 814 }, { "epoch": 0.23, "grad_norm": 2.3474605814667004, "learning_rate": 1.7956523474239907e-06, "loss": 0.2762, "step": 815 }, { "epoch": 0.23, "grad_norm": 2.3212862035195525, "learning_rate": 1.7950960583091042e-06, "loss": 0.3334, "step": 816 }, { "epoch": 0.23, "grad_norm": 2.5219516348949926, "learning_rate": 1.794539099455329e-06, "loss": 0.3364, "step": 817 }, { "epoch": 0.23, "grad_norm": 2.2878948873749803, "learning_rate": 1.7939814713318122e-06, "loss": 0.3063, "step": 818 }, { "epoch": 0.23, "grad_norm": 2.7420790975532734, "learning_rate": 1.7934231744082649e-06, "loss": 0.3206, "step": 819 }, { "epoch": 0.23, "grad_norm": 2.3798967487705602, "learning_rate": 1.7928642091549612e-06, "loss": 0.2839, "step": 820 }, { "epoch": 0.23, "grad_norm": 2.593899829352958, "learning_rate": 1.7923045760427384e-06, "loss": 0.3206, "step": 821 }, { "epoch": 0.23, "grad_norm": 2.4143042994636197, "learning_rate": 1.791744275542996e-06, "loss": 0.3312, "step": 822 }, { "epoch": 0.23, "grad_norm": 2.7810821685273153, "learning_rate": 1.7911833081276958e-06, "loss": 0.335, "step": 823 }, { "epoch": 0.23, "grad_norm": 2.4949557139361316, "learning_rate": 1.7906216742693619e-06, "loss": 0.3108, "step": 824 }, { "epoch": 0.23, "grad_norm": 2.392727307987833, "learning_rate": 1.7900593744410789e-06, "loss": 0.2839, "step": 825 }, { "epoch": 0.23, "grad_norm": 2.454105906481232, "learning_rate": 1.7894964091164928e-06, "loss": 0.311, "step": 826 }, { "epoch": 0.23, "grad_norm": 2.3359712297438464, "learning_rate": 1.7889327787698103e-06, "loss": 0.3052, "step": 827 }, { "epoch": 0.23, "grad_norm": 2.3578175704575868, "learning_rate": 1.7883684838757982e-06, "loss": 0.2843, "step": 828 }, { "epoch": 0.23, "grad_norm": 2.3546703645487783, "learning_rate": 1.787803524909783e-06, "loss": 0.3116, "step": 829 }, { "epoch": 0.24, "grad_norm": 2.3540766735466523, "learning_rate": 1.7872379023476506e-06, "loss": 0.3029, "step": 830 }, { "epoch": 0.24, "grad_norm": 2.4561967275434085, "learning_rate": 1.7866716166658455e-06, "loss": 0.3164, "step": 831 }, { "epoch": 0.24, "grad_norm": 2.3804728957034103, "learning_rate": 1.7861046683413714e-06, "loss": 0.3422, "step": 832 }, { "epoch": 0.24, "grad_norm": 2.5253167982397517, "learning_rate": 1.78553705785179e-06, "loss": 0.3434, "step": 833 }, { "epoch": 0.24, "grad_norm": 2.4647339440108627, "learning_rate": 1.7849687856752206e-06, "loss": 0.3414, "step": 834 }, { "epoch": 0.24, "grad_norm": 2.3099561921610197, "learning_rate": 1.7843998522903401e-06, "loss": 0.287, "step": 835 }, { "epoch": 0.24, "grad_norm": 3.169187560233085, "learning_rate": 1.7838302581763815e-06, "loss": 0.3013, "step": 836 }, { "epoch": 0.24, "grad_norm": 2.4697580845690608, "learning_rate": 1.7832600038131358e-06, "loss": 0.3017, "step": 837 }, { "epoch": 0.24, "grad_norm": 2.270074862967853, "learning_rate": 1.782689089680949e-06, "loss": 0.312, "step": 838 }, { "epoch": 0.24, "grad_norm": 2.834517930537921, "learning_rate": 1.7821175162607234e-06, "loss": 0.3683, "step": 839 }, { "epoch": 0.24, "grad_norm": 2.3820423992048503, "learning_rate": 1.7815452840339162e-06, "loss": 0.3103, "step": 840 }, { "epoch": 0.24, "grad_norm": 4.134181679241243, "learning_rate": 1.7809723934825402e-06, "loss": 0.3091, "step": 841 }, { "epoch": 0.24, "grad_norm": 2.512480635786526, "learning_rate": 1.7803988450891626e-06, "loss": 0.3315, "step": 842 }, { "epoch": 0.24, "grad_norm": 2.3133473585428055, "learning_rate": 1.7798246393369033e-06, "loss": 0.3219, "step": 843 }, { "epoch": 0.24, "grad_norm": 2.337263848790207, "learning_rate": 1.7792497767094381e-06, "loss": 0.3013, "step": 844 }, { "epoch": 0.24, "grad_norm": 2.5037976885693185, "learning_rate": 1.7786742576909952e-06, "loss": 0.3077, "step": 845 }, { "epoch": 0.24, "grad_norm": 2.4742609810228995, "learning_rate": 1.778098082766355e-06, "loss": 0.3193, "step": 846 }, { "epoch": 0.24, "grad_norm": 2.4435964981750127, "learning_rate": 1.7775212524208512e-06, "loss": 0.3154, "step": 847 }, { "epoch": 0.24, "grad_norm": 2.350815158082994, "learning_rate": 1.7769437671403696e-06, "loss": 0.3046, "step": 848 }, { "epoch": 0.24, "grad_norm": 2.443996984457143, "learning_rate": 1.7763656274113473e-06, "loss": 0.3378, "step": 849 }, { "epoch": 0.24, "grad_norm": 2.4900482903481054, "learning_rate": 1.775786833720773e-06, "loss": 0.3177, "step": 850 }, { "epoch": 0.24, "grad_norm": 2.3204585475536352, "learning_rate": 1.7752073865561856e-06, "loss": 0.3452, "step": 851 }, { "epoch": 0.24, "grad_norm": 2.3326934912673125, "learning_rate": 1.7746272864056752e-06, "loss": 0.3015, "step": 852 }, { "epoch": 0.24, "grad_norm": 2.170865654693707, "learning_rate": 1.774046533757882e-06, "loss": 0.2929, "step": 853 }, { "epoch": 0.24, "grad_norm": 2.353397028015931, "learning_rate": 1.7734651291019953e-06, "loss": 0.2848, "step": 854 }, { "epoch": 0.24, "grad_norm": 2.3162206156297156, "learning_rate": 1.7728830729277537e-06, "loss": 0.2981, "step": 855 }, { "epoch": 0.24, "grad_norm": 2.440390255244275, "learning_rate": 1.7723003657254444e-06, "loss": 0.3082, "step": 856 }, { "epoch": 0.24, "grad_norm": 2.484031656364762, "learning_rate": 1.7717170079859039e-06, "loss": 0.2997, "step": 857 }, { "epoch": 0.24, "grad_norm": 2.4625882477489385, "learning_rate": 1.7711330002005155e-06, "loss": 0.3039, "step": 858 }, { "epoch": 0.24, "grad_norm": 2.2641087407974614, "learning_rate": 1.770548342861211e-06, "loss": 0.2964, "step": 859 }, { "epoch": 0.24, "grad_norm": 2.4624242786779154, "learning_rate": 1.7699630364604687e-06, "loss": 0.3124, "step": 860 }, { "epoch": 0.24, "grad_norm": 2.4068039255630387, "learning_rate": 1.769377081491314e-06, "loss": 0.3017, "step": 861 }, { "epoch": 0.24, "grad_norm": 2.5898207729162475, "learning_rate": 1.7687904784473186e-06, "loss": 0.3304, "step": 862 }, { "epoch": 0.24, "grad_norm": 2.4943816806666663, "learning_rate": 1.7682032278226e-06, "loss": 0.301, "step": 863 }, { "epoch": 0.24, "grad_norm": 2.2735550324194502, "learning_rate": 1.7676153301118206e-06, "loss": 0.2942, "step": 864 }, { "epoch": 0.25, "grad_norm": 2.542751669795743, "learning_rate": 1.7670267858101892e-06, "loss": 0.3213, "step": 865 }, { "epoch": 0.25, "grad_norm": 2.6233292164559674, "learning_rate": 1.7664375954134582e-06, "loss": 0.3246, "step": 866 }, { "epoch": 0.25, "grad_norm": 2.406407165535525, "learning_rate": 1.7658477594179244e-06, "loss": 0.2849, "step": 867 }, { "epoch": 0.25, "grad_norm": 2.2742014416952006, "learning_rate": 1.7652572783204284e-06, "loss": 0.3338, "step": 868 }, { "epoch": 0.25, "grad_norm": 2.3357411639986996, "learning_rate": 1.7646661526183549e-06, "loss": 0.304, "step": 869 }, { "epoch": 0.25, "grad_norm": 2.3725262940496337, "learning_rate": 1.7640743828096305e-06, "loss": 0.2939, "step": 870 }, { "epoch": 0.25, "grad_norm": 2.2632492249675016, "learning_rate": 1.7634819693927252e-06, "loss": 0.3038, "step": 871 }, { "epoch": 0.25, "grad_norm": 2.367959700282048, "learning_rate": 1.7628889128666501e-06, "loss": 0.313, "step": 872 }, { "epoch": 0.25, "grad_norm": 2.6123170953069517, "learning_rate": 1.7622952137309595e-06, "loss": 0.3111, "step": 873 }, { "epoch": 0.25, "grad_norm": 2.419626125941722, "learning_rate": 1.7617008724857478e-06, "loss": 0.2916, "step": 874 }, { "epoch": 0.25, "grad_norm": 2.38908803713079, "learning_rate": 1.7611058896316507e-06, "loss": 0.3049, "step": 875 }, { "epoch": 0.25, "grad_norm": 2.5552448508204053, "learning_rate": 1.7605102656698442e-06, "loss": 0.2781, "step": 876 }, { "epoch": 0.25, "grad_norm": 2.4428995873799524, "learning_rate": 1.7599140011020448e-06, "loss": 0.3321, "step": 877 }, { "epoch": 0.25, "grad_norm": 2.4239545273760847, "learning_rate": 1.7593170964305077e-06, "loss": 0.3077, "step": 878 }, { "epoch": 0.25, "grad_norm": 2.1915135477858714, "learning_rate": 1.7587195521580288e-06, "loss": 0.2939, "step": 879 }, { "epoch": 0.25, "grad_norm": 2.4431326002218743, "learning_rate": 1.7581213687879405e-06, "loss": 0.3382, "step": 880 }, { "epoch": 0.25, "grad_norm": 2.586725605255059, "learning_rate": 1.757522546824116e-06, "loss": 0.3251, "step": 881 }, { "epoch": 0.25, "grad_norm": 2.5557207299582285, "learning_rate": 1.7569230867709645e-06, "loss": 0.33, "step": 882 }, { "epoch": 0.25, "grad_norm": 2.5666833510025726, "learning_rate": 1.7563229891334336e-06, "loss": 0.3349, "step": 883 }, { "epoch": 0.25, "grad_norm": 2.2615897690980273, "learning_rate": 1.7557222544170081e-06, "loss": 0.3446, "step": 884 }, { "epoch": 0.25, "grad_norm": 2.192243862822083, "learning_rate": 1.7551208831277088e-06, "loss": 0.3127, "step": 885 }, { "epoch": 0.25, "grad_norm": 2.5533807430869837, "learning_rate": 1.754518875772093e-06, "loss": 0.2985, "step": 886 }, { "epoch": 0.25, "grad_norm": 2.6127071592959825, "learning_rate": 1.7539162328572542e-06, "loss": 0.3152, "step": 887 }, { "epoch": 0.25, "grad_norm": 2.24436359208922, "learning_rate": 1.7533129548908203e-06, "loss": 0.2826, "step": 888 }, { "epoch": 0.25, "grad_norm": 2.21650724397911, "learning_rate": 1.752709042380955e-06, "loss": 0.3139, "step": 889 }, { "epoch": 0.25, "grad_norm": 2.240179325579028, "learning_rate": 1.7521044958363565e-06, "loss": 0.3201, "step": 890 }, { "epoch": 0.25, "grad_norm": 2.6497870152313414, "learning_rate": 1.7514993157662561e-06, "loss": 0.3042, "step": 891 }, { "epoch": 0.25, "grad_norm": 2.4528906426154102, "learning_rate": 1.75089350268042e-06, "loss": 0.2924, "step": 892 }, { "epoch": 0.25, "grad_norm": 2.3758599539597336, "learning_rate": 1.7502870570891468e-06, "loss": 0.3152, "step": 893 }, { "epoch": 0.25, "grad_norm": 2.3761524990008973, "learning_rate": 1.749679979503268e-06, "loss": 0.3026, "step": 894 }, { "epoch": 0.25, "grad_norm": 2.438579519539714, "learning_rate": 1.749072270434148e-06, "loss": 0.3014, "step": 895 }, { "epoch": 0.25, "grad_norm": 2.2771364984262332, "learning_rate": 1.7484639303936822e-06, "loss": 0.2879, "step": 896 }, { "epoch": 0.25, "grad_norm": 2.699643333300088, "learning_rate": 1.747854959894298e-06, "loss": 0.3181, "step": 897 }, { "epoch": 0.25, "grad_norm": 2.3392193675944903, "learning_rate": 1.7472453594489538e-06, "loss": 0.307, "step": 898 }, { "epoch": 0.25, "grad_norm": 2.370792659232408, "learning_rate": 1.746635129571139e-06, "loss": 0.2849, "step": 899 }, { "epoch": 0.26, "grad_norm": 3.4045887373466757, "learning_rate": 1.7460242707748728e-06, "loss": 0.3112, "step": 900 }, { "epoch": 0.26, "grad_norm": 2.1885494563042704, "learning_rate": 1.7454127835747037e-06, "loss": 0.3049, "step": 901 }, { "epoch": 0.26, "grad_norm": 2.719256164872196, "learning_rate": 1.7448006684857106e-06, "loss": 0.2924, "step": 902 }, { "epoch": 0.26, "grad_norm": 2.3275588349254432, "learning_rate": 1.7441879260235007e-06, "loss": 0.2916, "step": 903 }, { "epoch": 0.26, "grad_norm": 2.6433671842254447, "learning_rate": 1.7435745567042094e-06, "loss": 0.3004, "step": 904 }, { "epoch": 0.26, "grad_norm": 2.3788139054652286, "learning_rate": 1.7429605610445004e-06, "loss": 0.2855, "step": 905 }, { "epoch": 0.26, "grad_norm": 2.3669128953610983, "learning_rate": 1.7423459395615652e-06, "loss": 0.3005, "step": 906 }, { "epoch": 0.26, "grad_norm": 2.3977847313289256, "learning_rate": 1.7417306927731223e-06, "loss": 0.3056, "step": 907 }, { "epoch": 0.26, "grad_norm": 2.634258954737695, "learning_rate": 1.7411148211974167e-06, "loss": 0.2984, "step": 908 }, { "epoch": 0.26, "grad_norm": 2.549418667244922, "learning_rate": 1.7404983253532202e-06, "loss": 0.3313, "step": 909 }, { "epoch": 0.26, "grad_norm": 2.229251686531007, "learning_rate": 1.7398812057598298e-06, "loss": 0.2907, "step": 910 }, { "epoch": 0.26, "grad_norm": 2.363913384681548, "learning_rate": 1.7392634629370681e-06, "loss": 0.3069, "step": 911 }, { "epoch": 0.26, "grad_norm": 2.635330690255088, "learning_rate": 1.7386450974052832e-06, "loss": 0.3306, "step": 912 }, { "epoch": 0.26, "grad_norm": 2.454322989683554, "learning_rate": 1.738026109685347e-06, "loss": 0.3071, "step": 913 }, { "epoch": 0.26, "grad_norm": 2.559993279354754, "learning_rate": 1.7374065002986557e-06, "loss": 0.3025, "step": 914 }, { "epoch": 0.26, "grad_norm": 2.3497500860175626, "learning_rate": 1.7367862697671299e-06, "loss": 0.3064, "step": 915 }, { "epoch": 0.26, "grad_norm": 2.5312837946915137, "learning_rate": 1.7361654186132117e-06, "loss": 0.3307, "step": 916 }, { "epoch": 0.26, "grad_norm": 2.3373983973812016, "learning_rate": 1.735543947359868e-06, "loss": 0.2739, "step": 917 }, { "epoch": 0.26, "grad_norm": 2.4244408158663995, "learning_rate": 1.7349218565305867e-06, "loss": 0.2939, "step": 918 }, { "epoch": 0.26, "grad_norm": 2.4878585942575686, "learning_rate": 1.7342991466493784e-06, "loss": 0.2913, "step": 919 }, { "epoch": 0.26, "grad_norm": 2.288423305700923, "learning_rate": 1.7336758182407737e-06, "loss": 0.3096, "step": 920 }, { "epoch": 0.26, "grad_norm": 2.502154794481118, "learning_rate": 1.733051871829826e-06, "loss": 0.3025, "step": 921 }, { "epoch": 0.26, "grad_norm": 2.478412947406026, "learning_rate": 1.7324273079421088e-06, "loss": 0.3002, "step": 922 }, { "epoch": 0.26, "grad_norm": 2.6007090817241703, "learning_rate": 1.7318021271037146e-06, "loss": 0.3246, "step": 923 }, { "epoch": 0.26, "grad_norm": 2.1730136205293205, "learning_rate": 1.7311763298412569e-06, "loss": 0.309, "step": 924 }, { "epoch": 0.26, "grad_norm": 2.328540172131703, "learning_rate": 1.7305499166818679e-06, "loss": 0.3198, "step": 925 }, { "epoch": 0.26, "grad_norm": 2.3787220615358797, "learning_rate": 1.7299228881531982e-06, "loss": 0.2649, "step": 926 }, { "epoch": 0.26, "grad_norm": 2.522766022357985, "learning_rate": 1.729295244783418e-06, "loss": 0.3404, "step": 927 }, { "epoch": 0.26, "grad_norm": 2.235899600554452, "learning_rate": 1.7286669871012135e-06, "loss": 0.3235, "step": 928 }, { "epoch": 0.26, "grad_norm": 3.0892728394383817, "learning_rate": 1.7280381156357904e-06, "loss": 0.2768, "step": 929 }, { "epoch": 0.26, "grad_norm": 2.5467713290166145, "learning_rate": 1.7274086309168701e-06, "loss": 0.2989, "step": 930 }, { "epoch": 0.26, "grad_norm": 2.2375743713758394, "learning_rate": 1.7267785334746907e-06, "loss": 0.2926, "step": 931 }, { "epoch": 0.26, "grad_norm": 2.392561762822056, "learning_rate": 1.7261478238400068e-06, "loss": 0.2985, "step": 932 }, { "epoch": 0.26, "grad_norm": 2.4450577681450616, "learning_rate": 1.725516502544089e-06, "loss": 0.3386, "step": 933 }, { "epoch": 0.26, "grad_norm": 2.2772591477186124, "learning_rate": 1.7248845701187218e-06, "loss": 0.2864, "step": 934 }, { "epoch": 0.26, "grad_norm": 2.616420026072073, "learning_rate": 1.7242520270962057e-06, "loss": 0.333, "step": 935 }, { "epoch": 0.27, "grad_norm": 2.5407530283716717, "learning_rate": 1.7236188740093554e-06, "loss": 0.3247, "step": 936 }, { "epoch": 0.27, "grad_norm": 2.837835871101234, "learning_rate": 1.7229851113914986e-06, "loss": 0.3547, "step": 937 }, { "epoch": 0.27, "grad_norm": 2.438027917952889, "learning_rate": 1.7223507397764778e-06, "loss": 0.3063, "step": 938 }, { "epoch": 0.27, "grad_norm": 2.2202556858343114, "learning_rate": 1.721715759698647e-06, "loss": 0.2684, "step": 939 }, { "epoch": 0.27, "grad_norm": 2.4880759861493327, "learning_rate": 1.721080171692874e-06, "loss": 0.3131, "step": 940 }, { "epoch": 0.27, "grad_norm": 2.4325603890985956, "learning_rate": 1.720443976294538e-06, "loss": 0.3066, "step": 941 }, { "epoch": 0.27, "grad_norm": 2.598743028566477, "learning_rate": 1.7198071740395298e-06, "loss": 0.3196, "step": 942 }, { "epoch": 0.27, "grad_norm": 2.447618969814215, "learning_rate": 1.7191697654642515e-06, "loss": 0.3208, "step": 943 }, { "epoch": 0.27, "grad_norm": 2.398233275246693, "learning_rate": 1.7185317511056163e-06, "loss": 0.2895, "step": 944 }, { "epoch": 0.27, "grad_norm": 2.455998669650308, "learning_rate": 1.717893131501047e-06, "loss": 0.3088, "step": 945 }, { "epoch": 0.27, "grad_norm": 2.55553833781246, "learning_rate": 1.717253907188477e-06, "loss": 0.311, "step": 946 }, { "epoch": 0.27, "grad_norm": 2.331976956659491, "learning_rate": 1.7166140787063484e-06, "loss": 0.2975, "step": 947 }, { "epoch": 0.27, "grad_norm": 2.5443708751804843, "learning_rate": 1.7159736465936122e-06, "loss": 0.3312, "step": 948 }, { "epoch": 0.27, "grad_norm": 2.4646620518131765, "learning_rate": 1.7153326113897285e-06, "loss": 0.2983, "step": 949 }, { "epoch": 0.27, "grad_norm": 2.700520256461987, "learning_rate": 1.7146909736346649e-06, "loss": 0.3422, "step": 950 }, { "epoch": 0.27, "grad_norm": 2.365208695404597, "learning_rate": 1.7140487338688964e-06, "loss": 0.3125, "step": 951 }, { "epoch": 0.27, "grad_norm": 2.342799021500254, "learning_rate": 1.7134058926334061e-06, "loss": 0.2851, "step": 952 }, { "epoch": 0.27, "grad_norm": 2.472473882462489, "learning_rate": 1.712762450469682e-06, "loss": 0.2833, "step": 953 }, { "epoch": 0.27, "grad_norm": 2.924949206264284, "learning_rate": 1.7121184079197199e-06, "loss": 0.2867, "step": 954 }, { "epoch": 0.27, "grad_norm": 2.446605549606646, "learning_rate": 1.7114737655260209e-06, "loss": 0.3125, "step": 955 }, { "epoch": 0.27, "grad_norm": 2.4578380201775114, "learning_rate": 1.710828523831591e-06, "loss": 0.304, "step": 956 }, { "epoch": 0.27, "grad_norm": 2.278217100685124, "learning_rate": 1.7101826833799408e-06, "loss": 0.3097, "step": 957 }, { "epoch": 0.27, "grad_norm": 2.634826805932555, "learning_rate": 1.7095362447150863e-06, "loss": 0.2935, "step": 958 }, { "epoch": 0.27, "grad_norm": 3.378656134708543, "learning_rate": 1.708889208381546e-06, "loss": 0.2915, "step": 959 }, { "epoch": 0.27, "grad_norm": 2.4000239757418482, "learning_rate": 1.7082415749243434e-06, "loss": 0.2999, "step": 960 }, { "epoch": 0.27, "grad_norm": 2.2573289490776585, "learning_rate": 1.7075933448890036e-06, "loss": 0.277, "step": 961 }, { "epoch": 0.27, "grad_norm": 2.346166395763324, "learning_rate": 1.706944518821555e-06, "loss": 0.3021, "step": 962 }, { "epoch": 0.27, "grad_norm": 2.4124728615991455, "learning_rate": 1.7062950972685276e-06, "loss": 0.3128, "step": 963 }, { "epoch": 0.27, "grad_norm": 2.826711761487159, "learning_rate": 1.705645080776954e-06, "loss": 0.313, "step": 964 }, { "epoch": 0.27, "grad_norm": 2.4047963495047906, "learning_rate": 1.7049944698943666e-06, "loss": 0.3371, "step": 965 }, { "epoch": 0.27, "grad_norm": 2.389614674773777, "learning_rate": 1.7043432651687985e-06, "loss": 0.3249, "step": 966 }, { "epoch": 0.27, "grad_norm": 2.436635418792066, "learning_rate": 1.7036914671487849e-06, "loss": 0.2986, "step": 967 }, { "epoch": 0.27, "grad_norm": 2.3717874245845163, "learning_rate": 1.7030390763833586e-06, "loss": 0.3382, "step": 968 }, { "epoch": 0.27, "grad_norm": 2.497520195958812, "learning_rate": 1.7023860934220529e-06, "loss": 0.3302, "step": 969 }, { "epoch": 0.27, "grad_norm": 2.4602529075200508, "learning_rate": 1.701732518814899e-06, "loss": 0.2964, "step": 970 }, { "epoch": 0.28, "grad_norm": 2.496811408865411, "learning_rate": 1.7010783531124276e-06, "loss": 0.2837, "step": 971 }, { "epoch": 0.28, "grad_norm": 2.2217169162660713, "learning_rate": 1.7004235968656663e-06, "loss": 0.3015, "step": 972 }, { "epoch": 0.28, "grad_norm": 2.499163380974902, "learning_rate": 1.6997682506261408e-06, "loss": 0.3124, "step": 973 }, { "epoch": 0.28, "grad_norm": 2.4447537512361546, "learning_rate": 1.6991123149458738e-06, "loss": 0.2945, "step": 974 }, { "epoch": 0.28, "grad_norm": 2.362267004446206, "learning_rate": 1.698455790377384e-06, "loss": 0.2863, "step": 975 }, { "epoch": 0.28, "grad_norm": 2.2546056397623135, "learning_rate": 1.6977986774736856e-06, "loss": 0.2822, "step": 976 }, { "epoch": 0.28, "grad_norm": 2.2691350851857135, "learning_rate": 1.6971409767882908e-06, "loss": 0.2974, "step": 977 }, { "epoch": 0.28, "grad_norm": 2.445416620705216, "learning_rate": 1.6964826888752036e-06, "loss": 0.2852, "step": 978 }, { "epoch": 0.28, "grad_norm": 2.384942731710928, "learning_rate": 1.6958238142889256e-06, "loss": 0.2925, "step": 979 }, { "epoch": 0.28, "grad_norm": 2.2714288291286295, "learning_rate": 1.6951643535844507e-06, "loss": 0.2746, "step": 980 }, { "epoch": 0.28, "grad_norm": 2.2375520872504486, "learning_rate": 1.6945043073172669e-06, "loss": 0.3002, "step": 981 }, { "epoch": 0.28, "grad_norm": 2.4784951066376597, "learning_rate": 1.6938436760433563e-06, "loss": 0.3209, "step": 982 }, { "epoch": 0.28, "grad_norm": 2.340283512510198, "learning_rate": 1.6931824603191924e-06, "loss": 0.3039, "step": 983 }, { "epoch": 0.28, "grad_norm": 2.30737661460916, "learning_rate": 1.692520660701742e-06, "loss": 0.2923, "step": 984 }, { "epoch": 0.28, "grad_norm": 2.437715351903717, "learning_rate": 1.691858277748464e-06, "loss": 0.3014, "step": 985 }, { "epoch": 0.28, "grad_norm": 2.5154337603104495, "learning_rate": 1.6911953120173072e-06, "loss": 0.2792, "step": 986 }, { "epoch": 0.28, "grad_norm": 2.4797671932366554, "learning_rate": 1.690531764066713e-06, "loss": 0.2949, "step": 987 }, { "epoch": 0.28, "grad_norm": 2.3547640609593965, "learning_rate": 1.6898676344556116e-06, "loss": 0.2787, "step": 988 }, { "epoch": 0.28, "grad_norm": 2.3656668511347525, "learning_rate": 1.6892029237434247e-06, "loss": 0.3285, "step": 989 }, { "epoch": 0.28, "grad_norm": 2.4444266223198388, "learning_rate": 1.6885376324900627e-06, "loss": 0.314, "step": 990 }, { "epoch": 0.28, "grad_norm": 2.3855107129962514, "learning_rate": 1.6878717612559248e-06, "loss": 0.3095, "step": 991 }, { "epoch": 0.28, "grad_norm": 2.5693108001100673, "learning_rate": 1.6872053106018994e-06, "loss": 0.2997, "step": 992 }, { "epoch": 0.28, "grad_norm": 2.1970928719836182, "learning_rate": 1.686538281089362e-06, "loss": 0.3192, "step": 993 }, { "epoch": 0.28, "grad_norm": 2.3599486714531084, "learning_rate": 1.6858706732801765e-06, "loss": 0.2893, "step": 994 }, { "epoch": 0.28, "grad_norm": 2.297010635268803, "learning_rate": 1.6852024877366944e-06, "loss": 0.2971, "step": 995 }, { "epoch": 0.28, "grad_norm": 2.3034348135405156, "learning_rate": 1.6845337250217524e-06, "loss": 0.3109, "step": 996 }, { "epoch": 0.28, "grad_norm": 2.967206227562683, "learning_rate": 1.6838643856986742e-06, "loss": 0.3199, "step": 997 }, { "epoch": 0.28, "grad_norm": 2.623339369359266, "learning_rate": 1.6831944703312692e-06, "loss": 0.2834, "step": 998 }, { "epoch": 0.28, "grad_norm": 2.53117931729609, "learning_rate": 1.6825239794838325e-06, "loss": 0.3006, "step": 999 }, { "epoch": 0.28, "grad_norm": 2.3882720935542516, "learning_rate": 1.6818529137211426e-06, "loss": 0.3082, "step": 1000 }, { "epoch": 0.28, "grad_norm": 2.55440787423396, "learning_rate": 1.6811812736084634e-06, "loss": 0.3204, "step": 1001 }, { "epoch": 0.28, "grad_norm": 2.321158053976048, "learning_rate": 1.6805090597115424e-06, "loss": 0.319, "step": 1002 }, { "epoch": 0.28, "grad_norm": 2.4872668248685, "learning_rate": 1.67983627259661e-06, "loss": 0.2799, "step": 1003 }, { "epoch": 0.28, "grad_norm": 2.4245140920050603, "learning_rate": 1.67916291283038e-06, "loss": 0.3063, "step": 1004 }, { "epoch": 0.28, "grad_norm": 2.6435579373630245, "learning_rate": 1.678488980980048e-06, "loss": 0.3093, "step": 1005 }, { "epoch": 0.29, "grad_norm": 2.258406979497761, "learning_rate": 1.6778144776132922e-06, "loss": 0.268, "step": 1006 }, { "epoch": 0.29, "grad_norm": 2.9135073664027664, "learning_rate": 1.6771394032982715e-06, "loss": 0.3402, "step": 1007 }, { "epoch": 0.29, "grad_norm": 2.3278350848972984, "learning_rate": 1.6764637586036262e-06, "loss": 0.2839, "step": 1008 }, { "epoch": 0.29, "grad_norm": 2.3248056744204986, "learning_rate": 1.6757875440984765e-06, "loss": 0.2983, "step": 1009 }, { "epoch": 0.29, "grad_norm": 2.439826013803833, "learning_rate": 1.6751107603524234e-06, "loss": 0.3344, "step": 1010 }, { "epoch": 0.29, "grad_norm": 2.2802148297417255, "learning_rate": 1.6744334079355468e-06, "loss": 0.2897, "step": 1011 }, { "epoch": 0.29, "grad_norm": 2.6263572428855366, "learning_rate": 1.6737554874184054e-06, "loss": 0.2902, "step": 1012 }, { "epoch": 0.29, "grad_norm": 2.2664433077616817, "learning_rate": 1.6730769993720374e-06, "loss": 0.2768, "step": 1013 }, { "epoch": 0.29, "grad_norm": 2.429423006143885, "learning_rate": 1.6723979443679581e-06, "loss": 0.3299, "step": 1014 }, { "epoch": 0.29, "grad_norm": 2.416722381378333, "learning_rate": 1.6717183229781608e-06, "loss": 0.295, "step": 1015 }, { "epoch": 0.29, "grad_norm": 2.8299828280374224, "learning_rate": 1.6710381357751153e-06, "loss": 0.3108, "step": 1016 }, { "epoch": 0.29, "grad_norm": 3.0380148513953085, "learning_rate": 1.6703573833317695e-06, "loss": 0.3334, "step": 1017 }, { "epoch": 0.29, "grad_norm": 2.315449694497384, "learning_rate": 1.6696760662215454e-06, "loss": 0.3075, "step": 1018 }, { "epoch": 0.29, "grad_norm": 2.493817939261923, "learning_rate": 1.6689941850183423e-06, "loss": 0.3229, "step": 1019 }, { "epoch": 0.29, "grad_norm": 2.501749294264062, "learning_rate": 1.668311740296534e-06, "loss": 0.3106, "step": 1020 }, { "epoch": 0.29, "grad_norm": 2.289914719495963, "learning_rate": 1.6676287326309684e-06, "loss": 0.309, "step": 1021 }, { "epoch": 0.29, "grad_norm": 2.718396677832358, "learning_rate": 1.666945162596969e-06, "loss": 0.3284, "step": 1022 }, { "epoch": 0.29, "grad_norm": 2.3457700683897236, "learning_rate": 1.6662610307703315e-06, "loss": 0.3157, "step": 1023 }, { "epoch": 0.29, "grad_norm": 2.5448441504576618, "learning_rate": 1.6655763377273258e-06, "loss": 0.3019, "step": 1024 }, { "epoch": 0.29, "grad_norm": 2.4890666738130007, "learning_rate": 1.6648910840446945e-06, "loss": 0.3275, "step": 1025 }, { "epoch": 0.29, "grad_norm": 2.4894253493778624, "learning_rate": 1.6642052702996518e-06, "loss": 0.3149, "step": 1026 }, { "epoch": 0.29, "grad_norm": 2.353041250698328, "learning_rate": 1.663518897069884e-06, "loss": 0.2886, "step": 1027 }, { "epoch": 0.29, "grad_norm": 2.4392572033533066, "learning_rate": 1.662831964933549e-06, "loss": 0.3412, "step": 1028 }, { "epoch": 0.29, "grad_norm": 2.3188088866561403, "learning_rate": 1.662144474469275e-06, "loss": 0.2947, "step": 1029 }, { "epoch": 0.29, "grad_norm": 2.341100743164467, "learning_rate": 1.6614564262561608e-06, "loss": 0.2728, "step": 1030 }, { "epoch": 0.29, "grad_norm": 2.2101531499352434, "learning_rate": 1.660767820873775e-06, "loss": 0.3123, "step": 1031 }, { "epoch": 0.29, "grad_norm": 2.475747069138636, "learning_rate": 1.6600786589021552e-06, "loss": 0.3403, "step": 1032 }, { "epoch": 0.29, "grad_norm": 2.459042145691644, "learning_rate": 1.6593889409218082e-06, "loss": 0.312, "step": 1033 }, { "epoch": 0.29, "grad_norm": 2.5295519497711707, "learning_rate": 1.6586986675137092e-06, "loss": 0.2986, "step": 1034 }, { "epoch": 0.29, "grad_norm": 2.450928118563158, "learning_rate": 1.658007839259301e-06, "loss": 0.3285, "step": 1035 }, { "epoch": 0.29, "grad_norm": 2.1898774628428628, "learning_rate": 1.6573164567404935e-06, "loss": 0.2827, "step": 1036 }, { "epoch": 0.29, "grad_norm": 2.427219719465487, "learning_rate": 1.6566245205396645e-06, "loss": 0.299, "step": 1037 }, { "epoch": 0.29, "grad_norm": 2.3570225217372567, "learning_rate": 1.655932031239657e-06, "loss": 0.2951, "step": 1038 }, { "epoch": 0.29, "grad_norm": 2.3402537457095547, "learning_rate": 1.6552389894237805e-06, "loss": 0.3171, "step": 1039 }, { "epoch": 0.29, "grad_norm": 2.354873029829454, "learning_rate": 1.6545453956758095e-06, "loss": 0.2935, "step": 1040 }, { "epoch": 0.29, "grad_norm": 2.25090458483495, "learning_rate": 1.6538512505799846e-06, "loss": 0.3108, "step": 1041 }, { "epoch": 0.3, "grad_norm": 2.55047815912428, "learning_rate": 1.6531565547210091e-06, "loss": 0.3013, "step": 1042 }, { "epoch": 0.3, "grad_norm": 2.4440385164546696, "learning_rate": 1.6524613086840518e-06, "loss": 0.3105, "step": 1043 }, { "epoch": 0.3, "grad_norm": 2.391941906752125, "learning_rate": 1.6517655130547435e-06, "loss": 0.2957, "step": 1044 }, { "epoch": 0.3, "grad_norm": 2.435031979636581, "learning_rate": 1.6510691684191792e-06, "loss": 0.3012, "step": 1045 }, { "epoch": 0.3, "grad_norm": 2.289453395649587, "learning_rate": 1.6503722753639152e-06, "loss": 0.2879, "step": 1046 }, { "epoch": 0.3, "grad_norm": 2.7232938391584423, "learning_rate": 1.6496748344759711e-06, "loss": 0.2899, "step": 1047 }, { "epoch": 0.3, "grad_norm": 2.3480597213292884, "learning_rate": 1.6489768463428271e-06, "loss": 0.2844, "step": 1048 }, { "epoch": 0.3, "grad_norm": 2.2274801914857827, "learning_rate": 1.6482783115524236e-06, "loss": 0.3026, "step": 1049 }, { "epoch": 0.3, "grad_norm": 2.350786936769416, "learning_rate": 1.6475792306931634e-06, "loss": 0.3025, "step": 1050 }, { "epoch": 0.3, "grad_norm": 2.455094703349108, "learning_rate": 1.646879604353908e-06, "loss": 0.279, "step": 1051 }, { "epoch": 0.3, "grad_norm": 2.749072216613041, "learning_rate": 1.6461794331239784e-06, "loss": 0.2868, "step": 1052 }, { "epoch": 0.3, "grad_norm": 2.3961592307243933, "learning_rate": 1.6454787175931545e-06, "loss": 0.3241, "step": 1053 }, { "epoch": 0.3, "grad_norm": 2.3802912636459324, "learning_rate": 1.6447774583516757e-06, "loss": 0.3184, "step": 1054 }, { "epoch": 0.3, "grad_norm": 2.3578994334145595, "learning_rate": 1.6440756559902378e-06, "loss": 0.2992, "step": 1055 }, { "epoch": 0.3, "grad_norm": 2.669831279613677, "learning_rate": 1.6433733110999955e-06, "loss": 0.3343, "step": 1056 }, { "epoch": 0.3, "grad_norm": 2.3566063200933205, "learning_rate": 1.64267042427256e-06, "loss": 0.2962, "step": 1057 }, { "epoch": 0.3, "grad_norm": 2.9095379633908793, "learning_rate": 1.6419669960999988e-06, "loss": 0.3049, "step": 1058 }, { "epoch": 0.3, "grad_norm": 2.4085390520917045, "learning_rate": 1.6412630271748353e-06, "loss": 0.3121, "step": 1059 }, { "epoch": 0.3, "grad_norm": 2.3603280046033523, "learning_rate": 1.640558518090049e-06, "loss": 0.2943, "step": 1060 }, { "epoch": 0.3, "grad_norm": 2.486479238370381, "learning_rate": 1.6398534694390738e-06, "loss": 0.3191, "step": 1061 }, { "epoch": 0.3, "grad_norm": 2.3336082601675407, "learning_rate": 1.6391478818157984e-06, "loss": 0.3227, "step": 1062 }, { "epoch": 0.3, "grad_norm": 2.40928700934904, "learning_rate": 1.6384417558145653e-06, "loss": 0.2902, "step": 1063 }, { "epoch": 0.3, "grad_norm": 2.320113452541132, "learning_rate": 1.637735092030171e-06, "loss": 0.294, "step": 1064 }, { "epoch": 0.3, "grad_norm": 2.1722153868801457, "learning_rate": 1.637027891057864e-06, "loss": 0.2874, "step": 1065 }, { "epoch": 0.3, "grad_norm": 2.4440111925983836, "learning_rate": 1.6363201534933461e-06, "loss": 0.3141, "step": 1066 }, { "epoch": 0.3, "grad_norm": 2.3338478949662225, "learning_rate": 1.6356118799327714e-06, "loss": 0.3092, "step": 1067 }, { "epoch": 0.3, "grad_norm": 2.5121442311511917, "learning_rate": 1.634903070972744e-06, "loss": 0.3397, "step": 1068 }, { "epoch": 0.3, "grad_norm": 2.5781463798701902, "learning_rate": 1.634193727210321e-06, "loss": 0.3104, "step": 1069 }, { "epoch": 0.3, "grad_norm": 2.5961627840525114, "learning_rate": 1.6334838492430083e-06, "loss": 0.3101, "step": 1070 }, { "epoch": 0.3, "grad_norm": 2.267179058947479, "learning_rate": 1.6327734376687627e-06, "loss": 0.283, "step": 1071 }, { "epoch": 0.3, "grad_norm": 2.3444605330977066, "learning_rate": 1.6320624930859904e-06, "loss": 0.3122, "step": 1072 }, { "epoch": 0.3, "grad_norm": 2.993127129778634, "learning_rate": 1.6313510160935456e-06, "loss": 0.3471, "step": 1073 }, { "epoch": 0.3, "grad_norm": 2.2872650945211332, "learning_rate": 1.6306390072907325e-06, "loss": 0.3169, "step": 1074 }, { "epoch": 0.3, "grad_norm": 2.724604471623292, "learning_rate": 1.6299264672773023e-06, "loss": 0.3031, "step": 1075 }, { "epoch": 0.3, "grad_norm": 2.913751858855762, "learning_rate": 1.6292133966534538e-06, "loss": 0.284, "step": 1076 }, { "epoch": 0.31, "grad_norm": 2.286247266050023, "learning_rate": 1.6284997960198327e-06, "loss": 0.3257, "step": 1077 }, { "epoch": 0.31, "grad_norm": 2.3860053312723952, "learning_rate": 1.6277856659775318e-06, "loss": 0.315, "step": 1078 }, { "epoch": 0.31, "grad_norm": 2.453199624252134, "learning_rate": 1.6270710071280886e-06, "loss": 0.3262, "step": 1079 }, { "epoch": 0.31, "grad_norm": 2.622679958245868, "learning_rate": 1.6263558200734874e-06, "loss": 0.2872, "step": 1080 }, { "epoch": 0.31, "grad_norm": 2.3486987427194266, "learning_rate": 1.6256401054161564e-06, "loss": 0.3164, "step": 1081 }, { "epoch": 0.31, "grad_norm": 2.217339421310054, "learning_rate": 1.6249238637589686e-06, "loss": 0.2927, "step": 1082 }, { "epoch": 0.31, "grad_norm": 2.350362880482745, "learning_rate": 1.6242070957052408e-06, "loss": 0.327, "step": 1083 }, { "epoch": 0.31, "grad_norm": 2.291927833619107, "learning_rate": 1.6234898018587336e-06, "loss": 0.281, "step": 1084 }, { "epoch": 0.31, "grad_norm": 2.3219095991433525, "learning_rate": 1.62277198282365e-06, "loss": 0.292, "step": 1085 }, { "epoch": 0.31, "grad_norm": 2.3916455188615386, "learning_rate": 1.6220536392046355e-06, "loss": 0.2984, "step": 1086 }, { "epoch": 0.31, "grad_norm": 2.858586614338182, "learning_rate": 1.621334771606778e-06, "loss": 0.3256, "step": 1087 }, { "epoch": 0.31, "grad_norm": 2.563926714028576, "learning_rate": 1.620615380635606e-06, "loss": 0.298, "step": 1088 }, { "epoch": 0.31, "grad_norm": 2.3780229371631334, "learning_rate": 1.6198954668970892e-06, "loss": 0.3113, "step": 1089 }, { "epoch": 0.31, "grad_norm": 2.3717718934908736, "learning_rate": 1.6191750309976375e-06, "loss": 0.3097, "step": 1090 }, { "epoch": 0.31, "grad_norm": 2.26879452699283, "learning_rate": 1.6184540735441011e-06, "loss": 0.2758, "step": 1091 }, { "epoch": 0.31, "grad_norm": 2.3545499465443673, "learning_rate": 1.617732595143769e-06, "loss": 0.2951, "step": 1092 }, { "epoch": 0.31, "grad_norm": 2.244857733670842, "learning_rate": 1.6170105964043693e-06, "loss": 0.2951, "step": 1093 }, { "epoch": 0.31, "grad_norm": 2.3294945365100497, "learning_rate": 1.6162880779340685e-06, "loss": 0.2943, "step": 1094 }, { "epoch": 0.31, "grad_norm": 2.3426596108235653, "learning_rate": 1.61556504034147e-06, "loss": 0.3315, "step": 1095 }, { "epoch": 0.31, "grad_norm": 2.4575241805037504, "learning_rate": 1.6148414842356157e-06, "loss": 0.3015, "step": 1096 }, { "epoch": 0.31, "grad_norm": 2.4324516901036253, "learning_rate": 1.6141174102259835e-06, "loss": 0.2792, "step": 1097 }, { "epoch": 0.31, "grad_norm": 2.4290612852471347, "learning_rate": 1.6133928189224886e-06, "loss": 0.3176, "step": 1098 }, { "epoch": 0.31, "grad_norm": 2.370996130318332, "learning_rate": 1.61266771093548e-06, "loss": 0.2853, "step": 1099 }, { "epoch": 0.31, "grad_norm": 2.770906603532933, "learning_rate": 1.6119420868757429e-06, "loss": 0.3126, "step": 1100 }, { "epoch": 0.31, "grad_norm": 2.3587875090211288, "learning_rate": 1.6112159473544988e-06, "loss": 0.2996, "step": 1101 }, { "epoch": 0.31, "grad_norm": 2.213960910839405, "learning_rate": 1.6104892929834006e-06, "loss": 0.2793, "step": 1102 }, { "epoch": 0.31, "grad_norm": 2.453113592013504, "learning_rate": 1.6097621243745369e-06, "loss": 0.3057, "step": 1103 }, { "epoch": 0.31, "grad_norm": 2.458016611313274, "learning_rate": 1.6090344421404285e-06, "loss": 0.2673, "step": 1104 }, { "epoch": 0.31, "grad_norm": 2.473355007432182, "learning_rate": 1.6083062468940294e-06, "loss": 0.3012, "step": 1105 }, { "epoch": 0.31, "grad_norm": 2.336731158238361, "learning_rate": 1.607577539248725e-06, "loss": 0.2982, "step": 1106 }, { "epoch": 0.31, "grad_norm": 2.431007377897008, "learning_rate": 1.606848319818333e-06, "loss": 0.3029, "step": 1107 }, { "epoch": 0.31, "grad_norm": 2.484485654028409, "learning_rate": 1.6061185892171021e-06, "loss": 0.3235, "step": 1108 }, { "epoch": 0.31, "grad_norm": 2.1469506553648987, "learning_rate": 1.6053883480597112e-06, "loss": 0.2604, "step": 1109 }, { "epoch": 0.31, "grad_norm": 2.3229347997347713, "learning_rate": 1.60465759696127e-06, "loss": 0.3256, "step": 1110 }, { "epoch": 0.31, "grad_norm": 2.802832295421606, "learning_rate": 1.6039263365373167e-06, "loss": 0.2955, "step": 1111 }, { "epoch": 0.32, "grad_norm": 2.2623099141043825, "learning_rate": 1.6031945674038188e-06, "loss": 0.2703, "step": 1112 }, { "epoch": 0.32, "grad_norm": 2.3600259797934546, "learning_rate": 1.6024622901771734e-06, "loss": 0.2909, "step": 1113 }, { "epoch": 0.32, "grad_norm": 2.4897006659817715, "learning_rate": 1.6017295054742044e-06, "loss": 0.2968, "step": 1114 }, { "epoch": 0.32, "grad_norm": 2.684689300603362, "learning_rate": 1.6009962139121634e-06, "loss": 0.2989, "step": 1115 }, { "epoch": 0.32, "grad_norm": 2.4044760003269716, "learning_rate": 1.600262416108729e-06, "loss": 0.2997, "step": 1116 }, { "epoch": 0.32, "grad_norm": 2.3201320285410962, "learning_rate": 1.5995281126820066e-06, "loss": 0.3071, "step": 1117 }, { "epoch": 0.32, "grad_norm": 2.6468821662036515, "learning_rate": 1.598793304250527e-06, "loss": 0.286, "step": 1118 }, { "epoch": 0.32, "grad_norm": 2.3639503488566613, "learning_rate": 1.5980579914332465e-06, "loss": 0.2914, "step": 1119 }, { "epoch": 0.32, "grad_norm": 2.6193329887287207, "learning_rate": 1.5973221748495468e-06, "loss": 0.2952, "step": 1120 }, { "epoch": 0.32, "grad_norm": 2.679692415018323, "learning_rate": 1.5965858551192327e-06, "loss": 0.2953, "step": 1121 }, { "epoch": 0.32, "grad_norm": 2.3437150532341953, "learning_rate": 1.5958490328625347e-06, "loss": 0.3032, "step": 1122 }, { "epoch": 0.32, "grad_norm": 2.3239952811715567, "learning_rate": 1.5951117087001046e-06, "loss": 0.2854, "step": 1123 }, { "epoch": 0.32, "grad_norm": 2.385086510499303, "learning_rate": 1.5943738832530182e-06, "loss": 0.2979, "step": 1124 }, { "epoch": 0.32, "grad_norm": 2.4806154461787604, "learning_rate": 1.5936355571427733e-06, "loss": 0.2966, "step": 1125 }, { "epoch": 0.32, "grad_norm": 2.216112548739291, "learning_rate": 1.5928967309912888e-06, "loss": 0.3029, "step": 1126 }, { "epoch": 0.32, "grad_norm": 2.572934206919917, "learning_rate": 1.5921574054209063e-06, "loss": 0.3056, "step": 1127 }, { "epoch": 0.32, "grad_norm": 2.5508527353207016, "learning_rate": 1.5914175810543866e-06, "loss": 0.2833, "step": 1128 }, { "epoch": 0.32, "grad_norm": 2.6384165447425425, "learning_rate": 1.590677258514911e-06, "loss": 0.3049, "step": 1129 }, { "epoch": 0.32, "grad_norm": 2.509989832211171, "learning_rate": 1.5899364384260811e-06, "loss": 0.2929, "step": 1130 }, { "epoch": 0.32, "grad_norm": 2.5231713344758324, "learning_rate": 1.5891951214119165e-06, "loss": 0.294, "step": 1131 }, { "epoch": 0.32, "grad_norm": 2.281755230252254, "learning_rate": 1.5884533080968569e-06, "loss": 0.2919, "step": 1132 }, { "epoch": 0.32, "grad_norm": 2.3669782426311925, "learning_rate": 1.5877109991057587e-06, "loss": 0.3073, "step": 1133 }, { "epoch": 0.32, "grad_norm": 2.362565174713835, "learning_rate": 1.5869681950638959e-06, "loss": 0.2966, "step": 1134 }, { "epoch": 0.32, "grad_norm": 2.438987608826221, "learning_rate": 1.5862248965969603e-06, "loss": 0.2823, "step": 1135 }, { "epoch": 0.32, "grad_norm": 2.3746503313743537, "learning_rate": 1.5854811043310596e-06, "loss": 0.2849, "step": 1136 }, { "epoch": 0.32, "grad_norm": 2.5131572164621208, "learning_rate": 1.5847368188927179e-06, "loss": 0.2863, "step": 1137 }, { "epoch": 0.32, "grad_norm": 2.5408742994134594, "learning_rate": 1.5839920409088743e-06, "loss": 0.2736, "step": 1138 }, { "epoch": 0.32, "grad_norm": 2.3560444580090807, "learning_rate": 1.5832467710068824e-06, "loss": 0.2994, "step": 1139 }, { "epoch": 0.32, "grad_norm": 2.4639874108624364, "learning_rate": 1.5825010098145116e-06, "loss": 0.3127, "step": 1140 }, { "epoch": 0.32, "grad_norm": 2.1704217818052713, "learning_rate": 1.5817547579599432e-06, "loss": 0.2887, "step": 1141 }, { "epoch": 0.32, "grad_norm": 2.4916333471595937, "learning_rate": 1.5810080160717734e-06, "loss": 0.2873, "step": 1142 }, { "epoch": 0.32, "grad_norm": 2.240153940362432, "learning_rate": 1.5802607847790107e-06, "loss": 0.278, "step": 1143 }, { "epoch": 0.32, "grad_norm": 2.379603346081565, "learning_rate": 1.5795130647110753e-06, "loss": 0.3027, "step": 1144 }, { "epoch": 0.32, "grad_norm": 3.356341932955021, "learning_rate": 1.5787648564977998e-06, "loss": 0.3085, "step": 1145 }, { "epoch": 0.32, "grad_norm": 2.4227890292727765, "learning_rate": 1.5780161607694275e-06, "loss": 0.3099, "step": 1146 }, { "epoch": 0.32, "grad_norm": 2.6094467312618272, "learning_rate": 1.577266978156613e-06, "loss": 0.3307, "step": 1147 }, { "epoch": 0.33, "grad_norm": 2.4273403176635058, "learning_rate": 1.5765173092904201e-06, "loss": 0.2923, "step": 1148 }, { "epoch": 0.33, "grad_norm": 2.4037099389004934, "learning_rate": 1.5757671548023228e-06, "loss": 0.3346, "step": 1149 }, { "epoch": 0.33, "grad_norm": 2.1568447136717457, "learning_rate": 1.5750165153242044e-06, "loss": 0.2583, "step": 1150 }, { "epoch": 0.33, "grad_norm": 2.28801464145824, "learning_rate": 1.5742653914883558e-06, "loss": 0.2924, "step": 1151 }, { "epoch": 0.33, "grad_norm": 2.529364561744506, "learning_rate": 1.5735137839274773e-06, "loss": 0.2886, "step": 1152 }, { "epoch": 0.33, "grad_norm": 2.327560334688552, "learning_rate": 1.5727616932746746e-06, "loss": 0.285, "step": 1153 }, { "epoch": 0.33, "grad_norm": 2.4283275210499173, "learning_rate": 1.5720091201634627e-06, "loss": 0.299, "step": 1154 }, { "epoch": 0.33, "grad_norm": 2.5399568436656264, "learning_rate": 1.5712560652277609e-06, "loss": 0.2833, "step": 1155 }, { "epoch": 0.33, "grad_norm": 2.3646366116477813, "learning_rate": 1.570502529101896e-06, "loss": 0.3011, "step": 1156 }, { "epoch": 0.33, "grad_norm": 2.94102514169586, "learning_rate": 1.5697485124205989e-06, "loss": 0.3032, "step": 1157 }, { "epoch": 0.33, "grad_norm": 2.36875518092041, "learning_rate": 1.568994015819006e-06, "loss": 0.2749, "step": 1158 }, { "epoch": 0.33, "grad_norm": 2.534169488455715, "learning_rate": 1.5682390399326582e-06, "loss": 0.3512, "step": 1159 }, { "epoch": 0.33, "grad_norm": 2.3107725895023434, "learning_rate": 1.567483585397499e-06, "loss": 0.2657, "step": 1160 }, { "epoch": 0.33, "grad_norm": 2.6748111833306107, "learning_rate": 1.5667276528498763e-06, "loss": 0.3012, "step": 1161 }, { "epoch": 0.33, "grad_norm": 2.2766830635131297, "learning_rate": 1.56597124292654e-06, "loss": 0.284, "step": 1162 }, { "epoch": 0.33, "grad_norm": 2.542009323160178, "learning_rate": 1.5652143562646413e-06, "loss": 0.2992, "step": 1163 }, { "epoch": 0.33, "grad_norm": 2.316537583857316, "learning_rate": 1.5644569935017355e-06, "loss": 0.2771, "step": 1164 }, { "epoch": 0.33, "grad_norm": 2.5038538788727873, "learning_rate": 1.563699155275776e-06, "loss": 0.3007, "step": 1165 }, { "epoch": 0.33, "grad_norm": 2.4159983582420788, "learning_rate": 1.5629408422251192e-06, "loss": 0.3034, "step": 1166 }, { "epoch": 0.33, "grad_norm": 2.4341610746771356, "learning_rate": 1.562182054988519e-06, "loss": 0.3088, "step": 1167 }, { "epoch": 0.33, "grad_norm": 2.357812160161354, "learning_rate": 1.5614227942051307e-06, "loss": 0.2786, "step": 1168 }, { "epoch": 0.33, "grad_norm": 2.3163132363746737, "learning_rate": 1.5606630605145081e-06, "loss": 0.3147, "step": 1169 }, { "epoch": 0.33, "grad_norm": 2.2351859561010263, "learning_rate": 1.5599028545566026e-06, "loss": 0.3012, "step": 1170 }, { "epoch": 0.33, "grad_norm": 2.4339611497994285, "learning_rate": 1.5591421769717642e-06, "loss": 0.2826, "step": 1171 }, { "epoch": 0.33, "grad_norm": 2.624244763187848, "learning_rate": 1.5583810284007393e-06, "loss": 0.3324, "step": 1172 }, { "epoch": 0.33, "grad_norm": 2.547134698048815, "learning_rate": 1.5576194094846722e-06, "loss": 0.3079, "step": 1173 }, { "epoch": 0.33, "grad_norm": 2.527816531870315, "learning_rate": 1.5568573208651023e-06, "loss": 0.2957, "step": 1174 }, { "epoch": 0.33, "grad_norm": 2.2182054977613723, "learning_rate": 1.5560947631839652e-06, "loss": 0.2692, "step": 1175 }, { "epoch": 0.33, "grad_norm": 2.265694858532665, "learning_rate": 1.5553317370835913e-06, "loss": 0.2751, "step": 1176 }, { "epoch": 0.33, "grad_norm": 2.443214358088767, "learning_rate": 1.5545682432067063e-06, "loss": 0.3132, "step": 1177 }, { "epoch": 0.33, "grad_norm": 2.5864134329362356, "learning_rate": 1.5538042821964292e-06, "loss": 0.275, "step": 1178 }, { "epoch": 0.33, "grad_norm": 2.492520363058183, "learning_rate": 1.5530398546962729e-06, "loss": 0.3168, "step": 1179 }, { "epoch": 0.33, "grad_norm": 2.6570124876738745, "learning_rate": 1.5522749613501423e-06, "loss": 0.2994, "step": 1180 }, { "epoch": 0.33, "grad_norm": 2.4265104474117507, "learning_rate": 1.5515096028023359e-06, "loss": 0.3041, "step": 1181 }, { "epoch": 0.33, "grad_norm": 2.807562758250271, "learning_rate": 1.5507437796975434e-06, "loss": 0.2973, "step": 1182 }, { "epoch": 0.34, "grad_norm": 2.3246387128523778, "learning_rate": 1.5499774926808464e-06, "loss": 0.2884, "step": 1183 }, { "epoch": 0.34, "grad_norm": 2.2932871149447402, "learning_rate": 1.5492107423977166e-06, "loss": 0.2855, "step": 1184 }, { "epoch": 0.34, "grad_norm": 2.435228543816678, "learning_rate": 1.548443529494016e-06, "loss": 0.3097, "step": 1185 }, { "epoch": 0.34, "grad_norm": 2.3510824680139604, "learning_rate": 1.5476758546159966e-06, "loss": 0.2742, "step": 1186 }, { "epoch": 0.34, "grad_norm": 2.392760312472469, "learning_rate": 1.5469077184102996e-06, "loss": 0.3302, "step": 1187 }, { "epoch": 0.34, "grad_norm": 2.343553702909353, "learning_rate": 1.5461391215239545e-06, "loss": 0.2465, "step": 1188 }, { "epoch": 0.34, "grad_norm": 2.248803321374722, "learning_rate": 1.545370064604379e-06, "loss": 0.2789, "step": 1189 }, { "epoch": 0.34, "grad_norm": 2.2905096127758915, "learning_rate": 1.544600548299378e-06, "loss": 0.2953, "step": 1190 }, { "epoch": 0.34, "grad_norm": 2.158849950842058, "learning_rate": 1.5438305732571442e-06, "loss": 0.2784, "step": 1191 }, { "epoch": 0.34, "grad_norm": 2.7014072777834843, "learning_rate": 1.543060140126255e-06, "loss": 0.2989, "step": 1192 }, { "epoch": 0.34, "grad_norm": 2.4406118157452497, "learning_rate": 1.5422892495556764e-06, "loss": 0.2828, "step": 1193 }, { "epoch": 0.34, "grad_norm": 2.309427539691521, "learning_rate": 1.5415179021947565e-06, "loss": 0.2881, "step": 1194 }, { "epoch": 0.34, "grad_norm": 2.3087869140253527, "learning_rate": 1.5407460986932309e-06, "loss": 0.3104, "step": 1195 }, { "epoch": 0.34, "grad_norm": 2.4365948516037017, "learning_rate": 1.5399738397012176e-06, "loss": 0.3117, "step": 1196 }, { "epoch": 0.34, "grad_norm": 2.472925420715825, "learning_rate": 1.5392011258692197e-06, "loss": 0.297, "step": 1197 }, { "epoch": 0.34, "grad_norm": 2.4954071083244327, "learning_rate": 1.538427957848122e-06, "loss": 0.2885, "step": 1198 }, { "epoch": 0.34, "grad_norm": 2.370015598392829, "learning_rate": 1.5376543362891932e-06, "loss": 0.3013, "step": 1199 }, { "epoch": 0.34, "grad_norm": 2.352657234394544, "learning_rate": 1.5368802618440829e-06, "loss": 0.2781, "step": 1200 }, { "epoch": 0.34, "grad_norm": 2.413009305045956, "learning_rate": 1.5361057351648228e-06, "loss": 0.294, "step": 1201 }, { "epoch": 0.34, "grad_norm": 2.4136284994593225, "learning_rate": 1.5353307569038254e-06, "loss": 0.3016, "step": 1202 }, { "epoch": 0.34, "grad_norm": 2.5447870948805607, "learning_rate": 1.5345553277138846e-06, "loss": 0.3485, "step": 1203 }, { "epoch": 0.34, "grad_norm": 2.230921132142122, "learning_rate": 1.5337794482481714e-06, "loss": 0.2858, "step": 1204 }, { "epoch": 0.34, "grad_norm": 2.38145420246431, "learning_rate": 1.5330031191602393e-06, "loss": 0.2674, "step": 1205 }, { "epoch": 0.34, "grad_norm": 2.275552437256254, "learning_rate": 1.5322263411040185e-06, "loss": 0.2731, "step": 1206 }, { "epoch": 0.34, "grad_norm": 2.5017524293312565, "learning_rate": 1.5314491147338178e-06, "loss": 0.276, "step": 1207 }, { "epoch": 0.34, "grad_norm": 2.3438144876797264, "learning_rate": 1.530671440704324e-06, "loss": 0.2789, "step": 1208 }, { "epoch": 0.34, "grad_norm": 2.2085904618308208, "learning_rate": 1.5298933196706008e-06, "loss": 0.2342, "step": 1209 }, { "epoch": 0.34, "grad_norm": 2.4759768263587394, "learning_rate": 1.5291147522880884e-06, "loss": 0.2941, "step": 1210 }, { "epoch": 0.34, "grad_norm": 2.4202423096646886, "learning_rate": 1.528335739212603e-06, "loss": 0.3279, "step": 1211 }, { "epoch": 0.34, "grad_norm": 2.3966984192951535, "learning_rate": 1.5275562811003363e-06, "loss": 0.2772, "step": 1212 }, { "epoch": 0.34, "grad_norm": 2.199944532742987, "learning_rate": 1.5267763786078541e-06, "loss": 0.2842, "step": 1213 }, { "epoch": 0.34, "grad_norm": 2.508334908719426, "learning_rate": 1.525996032392098e-06, "loss": 0.2911, "step": 1214 }, { "epoch": 0.34, "grad_norm": 2.2262275947096417, "learning_rate": 1.525215243110382e-06, "loss": 0.292, "step": 1215 }, { "epoch": 0.34, "grad_norm": 2.2895553897355208, "learning_rate": 1.5244340114203943e-06, "loss": 0.2798, "step": 1216 }, { "epoch": 0.34, "grad_norm": 2.2505915029751153, "learning_rate": 1.5236523379801951e-06, "loss": 0.2813, "step": 1217 }, { "epoch": 0.35, "grad_norm": 2.5301472784695105, "learning_rate": 1.522870223448217e-06, "loss": 0.3055, "step": 1218 }, { "epoch": 0.35, "grad_norm": 2.467104811055958, "learning_rate": 1.5220876684832638e-06, "loss": 0.3138, "step": 1219 }, { "epoch": 0.35, "grad_norm": 2.4211238944654565, "learning_rate": 1.5213046737445105e-06, "loss": 0.2889, "step": 1220 }, { "epoch": 0.35, "grad_norm": 2.4038218910709306, "learning_rate": 1.5205212398915032e-06, "loss": 0.2955, "step": 1221 }, { "epoch": 0.35, "grad_norm": 2.233445129687268, "learning_rate": 1.5197373675841569e-06, "loss": 0.2641, "step": 1222 }, { "epoch": 0.35, "grad_norm": 2.6361531058162115, "learning_rate": 1.5189530574827565e-06, "loss": 0.2966, "step": 1223 }, { "epoch": 0.35, "grad_norm": 2.4076912906486867, "learning_rate": 1.518168310247955e-06, "loss": 0.3003, "step": 1224 }, { "epoch": 0.35, "grad_norm": 2.4118387221922744, "learning_rate": 1.5173831265407747e-06, "loss": 0.311, "step": 1225 }, { "epoch": 0.35, "grad_norm": 2.8245527076546093, "learning_rate": 1.5165975070226043e-06, "loss": 0.3364, "step": 1226 }, { "epoch": 0.35, "grad_norm": 2.602810109791664, "learning_rate": 1.515811452355201e-06, "loss": 0.3041, "step": 1227 }, { "epoch": 0.35, "grad_norm": 2.347826721773315, "learning_rate": 1.5150249632006868e-06, "loss": 0.275, "step": 1228 }, { "epoch": 0.35, "grad_norm": 2.4794242356386733, "learning_rate": 1.5142380402215518e-06, "loss": 0.3054, "step": 1229 }, { "epoch": 0.35, "grad_norm": 2.577687203706355, "learning_rate": 1.5134506840806496e-06, "loss": 0.3037, "step": 1230 }, { "epoch": 0.35, "grad_norm": 2.3642776332377244, "learning_rate": 1.5126628954411999e-06, "loss": 0.3193, "step": 1231 }, { "epoch": 0.35, "grad_norm": 2.8048821645424717, "learning_rate": 1.5118746749667862e-06, "loss": 0.3046, "step": 1232 }, { "epoch": 0.35, "grad_norm": 2.344113545456005, "learning_rate": 1.5110860233213554e-06, "loss": 0.2815, "step": 1233 }, { "epoch": 0.35, "grad_norm": 2.281481302849905, "learning_rate": 1.5102969411692183e-06, "loss": 0.3236, "step": 1234 }, { "epoch": 0.35, "grad_norm": 2.276485093800136, "learning_rate": 1.5095074291750485e-06, "loss": 0.2774, "step": 1235 }, { "epoch": 0.35, "grad_norm": 2.338175236841784, "learning_rate": 1.5087174880038806e-06, "loss": 0.2859, "step": 1236 }, { "epoch": 0.35, "grad_norm": 2.3029685841528056, "learning_rate": 1.5079271183211116e-06, "loss": 0.2911, "step": 1237 }, { "epoch": 0.35, "grad_norm": 2.407994836006956, "learning_rate": 1.5071363207924992e-06, "loss": 0.3106, "step": 1238 }, { "epoch": 0.35, "grad_norm": 2.5374727320190633, "learning_rate": 1.5063450960841614e-06, "loss": 0.2836, "step": 1239 }, { "epoch": 0.35, "grad_norm": 2.3499467670805023, "learning_rate": 1.5055534448625764e-06, "loss": 0.2795, "step": 1240 }, { "epoch": 0.35, "grad_norm": 2.4266429590953495, "learning_rate": 1.5047613677945808e-06, "loss": 0.2749, "step": 1241 }, { "epoch": 0.35, "grad_norm": 2.4657170841490976, "learning_rate": 1.503968865547371e-06, "loss": 0.2837, "step": 1242 }, { "epoch": 0.35, "grad_norm": 5.0976536509730455, "learning_rate": 1.5031759387885007e-06, "loss": 0.3269, "step": 1243 }, { "epoch": 0.35, "grad_norm": 2.628231359188434, "learning_rate": 1.5023825881858818e-06, "loss": 0.3152, "step": 1244 }, { "epoch": 0.35, "grad_norm": 2.4756832958452573, "learning_rate": 1.5015888144077824e-06, "loss": 0.3073, "step": 1245 }, { "epoch": 0.35, "grad_norm": 2.298512467341184, "learning_rate": 1.5007946181228283e-06, "loss": 0.3061, "step": 1246 }, { "epoch": 0.35, "grad_norm": 2.52945706170183, "learning_rate": 1.5e-06, "loss": 0.3178, "step": 1247 }, { "epoch": 0.35, "grad_norm": 2.5724615855504926, "learning_rate": 1.4992049607086339e-06, "loss": 0.3417, "step": 1248 }, { "epoch": 0.35, "grad_norm": 2.727288725787042, "learning_rate": 1.4984095009184212e-06, "loss": 0.3349, "step": 1249 }, { "epoch": 0.35, "grad_norm": 2.350197638440752, "learning_rate": 1.497613621299407e-06, "loss": 0.3041, "step": 1250 }, { "epoch": 0.35, "grad_norm": 2.7132496007296156, "learning_rate": 1.4968173225219901e-06, "loss": 0.313, "step": 1251 }, { "epoch": 0.35, "grad_norm": 2.5199034855021836, "learning_rate": 1.496020605256923e-06, "loss": 0.3085, "step": 1252 }, { "epoch": 0.36, "grad_norm": 2.2147023860745105, "learning_rate": 1.4952234701753095e-06, "loss": 0.2737, "step": 1253 }, { "epoch": 0.36, "grad_norm": 2.297883065371963, "learning_rate": 1.4944259179486065e-06, "loss": 0.2783, "step": 1254 }, { "epoch": 0.36, "grad_norm": 2.3449804782716774, "learning_rate": 1.493627949248622e-06, "loss": 0.314, "step": 1255 }, { "epoch": 0.36, "grad_norm": 2.4267078408027443, "learning_rate": 1.492829564747514e-06, "loss": 0.3169, "step": 1256 }, { "epoch": 0.36, "grad_norm": 2.6432791420563784, "learning_rate": 1.492030765117792e-06, "loss": 0.3107, "step": 1257 }, { "epoch": 0.36, "grad_norm": 2.7012532465585966, "learning_rate": 1.4912315510323137e-06, "loss": 0.329, "step": 1258 }, { "epoch": 0.36, "grad_norm": 2.486228397791336, "learning_rate": 1.4904319231642876e-06, "loss": 0.3027, "step": 1259 }, { "epoch": 0.36, "grad_norm": 2.41871838980173, "learning_rate": 1.4896318821872696e-06, "loss": 0.3126, "step": 1260 }, { "epoch": 0.36, "grad_norm": 2.5420371517098084, "learning_rate": 1.4888314287751638e-06, "loss": 0.3158, "step": 1261 }, { "epoch": 0.36, "grad_norm": 2.3839764981648215, "learning_rate": 1.488030563602222e-06, "loss": 0.3023, "step": 1262 }, { "epoch": 0.36, "grad_norm": 2.4532950147848407, "learning_rate": 1.4872292873430424e-06, "loss": 0.2901, "step": 1263 }, { "epoch": 0.36, "grad_norm": 2.5054399027116503, "learning_rate": 1.4864276006725698e-06, "loss": 0.3057, "step": 1264 }, { "epoch": 0.36, "grad_norm": 2.477920531834612, "learning_rate": 1.4856255042660943e-06, "loss": 0.2954, "step": 1265 }, { "epoch": 0.36, "grad_norm": 3.935540643162236, "learning_rate": 1.484822998799252e-06, "loss": 0.2844, "step": 1266 }, { "epoch": 0.36, "grad_norm": 2.314201674178338, "learning_rate": 1.4840200849480225e-06, "loss": 0.2863, "step": 1267 }, { "epoch": 0.36, "grad_norm": 2.3731413875042193, "learning_rate": 1.4832167633887305e-06, "loss": 0.3051, "step": 1268 }, { "epoch": 0.36, "grad_norm": 2.3892817055908777, "learning_rate": 1.482413034798043e-06, "loss": 0.3129, "step": 1269 }, { "epoch": 0.36, "grad_norm": 2.302263982367759, "learning_rate": 1.4816088998529706e-06, "loss": 0.2664, "step": 1270 }, { "epoch": 0.36, "grad_norm": 2.6512705403947354, "learning_rate": 1.480804359230866e-06, "loss": 0.2965, "step": 1271 }, { "epoch": 0.36, "grad_norm": 2.347288821858566, "learning_rate": 1.4799994136094232e-06, "loss": 0.301, "step": 1272 }, { "epoch": 0.36, "grad_norm": 2.5333151896115873, "learning_rate": 1.4791940636666782e-06, "loss": 0.3012, "step": 1273 }, { "epoch": 0.36, "grad_norm": 2.5264247786210148, "learning_rate": 1.4783883100810073e-06, "loss": 0.3196, "step": 1274 }, { "epoch": 0.36, "grad_norm": 2.462515547861249, "learning_rate": 1.4775821535311259e-06, "loss": 0.3057, "step": 1275 }, { "epoch": 0.36, "grad_norm": 2.447574287258765, "learning_rate": 1.47677559469609e-06, "loss": 0.3101, "step": 1276 }, { "epoch": 0.36, "grad_norm": 2.4266565567789664, "learning_rate": 1.4759686342552943e-06, "loss": 0.271, "step": 1277 }, { "epoch": 0.36, "grad_norm": 2.2897497408615908, "learning_rate": 1.475161272888471e-06, "loss": 0.2681, "step": 1278 }, { "epoch": 0.36, "grad_norm": 2.0860403283083073, "learning_rate": 1.4743535112756908e-06, "loss": 0.2544, "step": 1279 }, { "epoch": 0.36, "grad_norm": 2.392018732079389, "learning_rate": 1.4735453500973609e-06, "loss": 0.2836, "step": 1280 }, { "epoch": 0.36, "grad_norm": 2.23012471953278, "learning_rate": 1.4727367900342258e-06, "loss": 0.2767, "step": 1281 }, { "epoch": 0.36, "grad_norm": 2.272102233160078, "learning_rate": 1.4719278317673654e-06, "loss": 0.3001, "step": 1282 }, { "epoch": 0.36, "grad_norm": 2.39901218259605, "learning_rate": 1.4711184759781953e-06, "loss": 0.2574, "step": 1283 }, { "epoch": 0.36, "grad_norm": 2.5600352664145354, "learning_rate": 1.4703087233484659e-06, "loss": 0.3206, "step": 1284 }, { "epoch": 0.36, "grad_norm": 3.9198388613999184, "learning_rate": 1.469498574560262e-06, "loss": 0.3024, "step": 1285 }, { "epoch": 0.36, "grad_norm": 2.4103965434314567, "learning_rate": 1.4686880302960018e-06, "loss": 0.3014, "step": 1286 }, { "epoch": 0.36, "grad_norm": 2.209789038709257, "learning_rate": 1.4678770912384368e-06, "loss": 0.249, "step": 1287 }, { "epoch": 0.36, "grad_norm": 3.3618412034454384, "learning_rate": 1.467065758070651e-06, "loss": 0.2966, "step": 1288 }, { "epoch": 0.37, "grad_norm": 2.3019549434185347, "learning_rate": 1.4662540314760605e-06, "loss": 0.2788, "step": 1289 }, { "epoch": 0.37, "grad_norm": 2.4662661882754224, "learning_rate": 1.4654419121384126e-06, "loss": 0.2831, "step": 1290 }, { "epoch": 0.37, "grad_norm": 2.454442520748841, "learning_rate": 1.4646294007417856e-06, "loss": 0.302, "step": 1291 }, { "epoch": 0.37, "grad_norm": 2.340806467057121, "learning_rate": 1.463816497970588e-06, "loss": 0.329, "step": 1292 }, { "epoch": 0.37, "grad_norm": 2.449099525994602, "learning_rate": 1.4630032045095579e-06, "loss": 0.3047, "step": 1293 }, { "epoch": 0.37, "grad_norm": 2.6306339717083382, "learning_rate": 1.4621895210437625e-06, "loss": 0.3269, "step": 1294 }, { "epoch": 0.37, "grad_norm": 2.2548608028685004, "learning_rate": 1.4613754482585977e-06, "loss": 0.2985, "step": 1295 }, { "epoch": 0.37, "grad_norm": 2.37377254194116, "learning_rate": 1.4605609868397872e-06, "loss": 0.2989, "step": 1296 }, { "epoch": 0.37, "grad_norm": 2.2530252554578647, "learning_rate": 1.4597461374733815e-06, "loss": 0.3155, "step": 1297 }, { "epoch": 0.37, "grad_norm": 2.6798879482553373, "learning_rate": 1.4589309008457594e-06, "loss": 0.2738, "step": 1298 }, { "epoch": 0.37, "grad_norm": 2.4502889632471945, "learning_rate": 1.4581152776436238e-06, "loss": 0.3192, "step": 1299 }, { "epoch": 0.37, "grad_norm": 2.6298976641211786, "learning_rate": 1.4572992685540056e-06, "loss": 0.3167, "step": 1300 }, { "epoch": 0.37, "grad_norm": 2.3641713607691983, "learning_rate": 1.4564828742642583e-06, "loss": 0.289, "step": 1301 }, { "epoch": 0.37, "grad_norm": 2.367238462783568, "learning_rate": 1.455666095462062e-06, "loss": 0.2841, "step": 1302 }, { "epoch": 0.37, "grad_norm": 2.494597823178741, "learning_rate": 1.4548489328354194e-06, "loss": 0.3052, "step": 1303 }, { "epoch": 0.37, "grad_norm": 2.549571518755505, "learning_rate": 1.4540313870726568e-06, "loss": 0.3206, "step": 1304 }, { "epoch": 0.37, "grad_norm": 2.34529117759076, "learning_rate": 1.4532134588624233e-06, "loss": 0.2947, "step": 1305 }, { "epoch": 0.37, "grad_norm": 2.3384051528527823, "learning_rate": 1.4523951488936903e-06, "loss": 0.3108, "step": 1306 }, { "epoch": 0.37, "grad_norm": 2.223749125445581, "learning_rate": 1.451576457855751e-06, "loss": 0.2693, "step": 1307 }, { "epoch": 0.37, "grad_norm": 2.466587040514915, "learning_rate": 1.4507573864382186e-06, "loss": 0.2897, "step": 1308 }, { "epoch": 0.37, "grad_norm": 2.4013650330596032, "learning_rate": 1.4499379353310272e-06, "loss": 0.3095, "step": 1309 }, { "epoch": 0.37, "grad_norm": 2.329526517251253, "learning_rate": 1.4491181052244315e-06, "loss": 0.3053, "step": 1310 }, { "epoch": 0.37, "grad_norm": 2.2325386851804963, "learning_rate": 1.4482978968090043e-06, "loss": 0.2799, "step": 1311 }, { "epoch": 0.37, "grad_norm": 2.739730141975504, "learning_rate": 1.4474773107756378e-06, "loss": 0.2819, "step": 1312 }, { "epoch": 0.37, "grad_norm": 2.4059311840397424, "learning_rate": 1.446656347815542e-06, "loss": 0.3184, "step": 1313 }, { "epoch": 0.37, "grad_norm": 2.5043427586314624, "learning_rate": 1.4458350086202442e-06, "loss": 0.3358, "step": 1314 }, { "epoch": 0.37, "grad_norm": 2.5523646241136073, "learning_rate": 1.4450132938815893e-06, "loss": 0.324, "step": 1315 }, { "epoch": 0.37, "grad_norm": 2.6696043592655574, "learning_rate": 1.4441912042917378e-06, "loss": 0.3198, "step": 1316 }, { "epoch": 0.37, "grad_norm": 2.4038993670802613, "learning_rate": 1.4433687405431661e-06, "loss": 0.3094, "step": 1317 }, { "epoch": 0.37, "grad_norm": 2.2708458215823537, "learning_rate": 1.4425459033286663e-06, "loss": 0.2929, "step": 1318 }, { "epoch": 0.37, "grad_norm": 2.228885189442795, "learning_rate": 1.4417226933413445e-06, "loss": 0.2684, "step": 1319 }, { "epoch": 0.37, "grad_norm": 3.3560738300099113, "learning_rate": 1.4408991112746209e-06, "loss": 0.3066, "step": 1320 }, { "epoch": 0.37, "grad_norm": 2.3347511327984707, "learning_rate": 1.4400751578222293e-06, "loss": 0.2655, "step": 1321 }, { "epoch": 0.37, "grad_norm": 2.4636897945354117, "learning_rate": 1.4392508336782165e-06, "loss": 0.2594, "step": 1322 }, { "epoch": 0.37, "grad_norm": 2.3437567986546166, "learning_rate": 1.4384261395369405e-06, "loss": 0.2944, "step": 1323 }, { "epoch": 0.38, "grad_norm": 2.430515865063001, "learning_rate": 1.4376010760930727e-06, "loss": 0.3145, "step": 1324 }, { "epoch": 0.38, "grad_norm": 2.648183336932284, "learning_rate": 1.436775644041594e-06, "loss": 0.3171, "step": 1325 }, { "epoch": 0.38, "grad_norm": 2.1948854220827525, "learning_rate": 1.4359498440777969e-06, "loss": 0.2954, "step": 1326 }, { "epoch": 0.38, "grad_norm": 2.424356777798155, "learning_rate": 1.4351236768972827e-06, "loss": 0.295, "step": 1327 }, { "epoch": 0.38, "grad_norm": 2.3561806922810296, "learning_rate": 1.4342971431959633e-06, "loss": 0.2942, "step": 1328 }, { "epoch": 0.38, "grad_norm": 2.2325216270315376, "learning_rate": 1.4334702436700582e-06, "loss": 0.2743, "step": 1329 }, { "epoch": 0.38, "grad_norm": 2.388209633531802, "learning_rate": 1.4326429790160957e-06, "loss": 0.3036, "step": 1330 }, { "epoch": 0.38, "grad_norm": 2.518564073310508, "learning_rate": 1.4318153499309115e-06, "loss": 0.2809, "step": 1331 }, { "epoch": 0.38, "grad_norm": 2.3524718365011186, "learning_rate": 1.4309873571116484e-06, "loss": 0.2934, "step": 1332 }, { "epoch": 0.38, "grad_norm": 2.4008119110054373, "learning_rate": 1.4301590012557552e-06, "loss": 0.2948, "step": 1333 }, { "epoch": 0.38, "grad_norm": 2.657182854931203, "learning_rate": 1.4293302830609869e-06, "loss": 0.2982, "step": 1334 }, { "epoch": 0.38, "grad_norm": 2.4741787245997418, "learning_rate": 1.4285012032254033e-06, "loss": 0.3052, "step": 1335 }, { "epoch": 0.38, "grad_norm": 2.266535481706884, "learning_rate": 1.4276717624473695e-06, "loss": 0.2751, "step": 1336 }, { "epoch": 0.38, "grad_norm": 2.499984914973482, "learning_rate": 1.4268419614255543e-06, "loss": 0.3218, "step": 1337 }, { "epoch": 0.38, "grad_norm": 2.167267266773434, "learning_rate": 1.4260118008589293e-06, "loss": 0.2849, "step": 1338 }, { "epoch": 0.38, "grad_norm": 2.3809186742123614, "learning_rate": 1.42518128144677e-06, "loss": 0.3186, "step": 1339 }, { "epoch": 0.38, "grad_norm": 2.276436866987727, "learning_rate": 1.4243504038886528e-06, "loss": 0.2769, "step": 1340 }, { "epoch": 0.38, "grad_norm": 2.7771729871088713, "learning_rate": 1.4235191688844583e-06, "loss": 0.3143, "step": 1341 }, { "epoch": 0.38, "grad_norm": 2.5976626344842813, "learning_rate": 1.4226875771343654e-06, "loss": 0.3003, "step": 1342 }, { "epoch": 0.38, "grad_norm": 2.3154780781114837, "learning_rate": 1.4218556293388547e-06, "loss": 0.2912, "step": 1343 }, { "epoch": 0.38, "grad_norm": 2.3516866583648466, "learning_rate": 1.4210233261987069e-06, "loss": 0.2951, "step": 1344 }, { "epoch": 0.38, "grad_norm": 2.6133293177904156, "learning_rate": 1.4201906684150019e-06, "loss": 0.32, "step": 1345 }, { "epoch": 0.38, "grad_norm": 2.349221085671353, "learning_rate": 1.4193576566891179e-06, "loss": 0.2934, "step": 1346 }, { "epoch": 0.38, "grad_norm": 2.3372859553928187, "learning_rate": 1.418524291722732e-06, "loss": 0.2875, "step": 1347 }, { "epoch": 0.38, "grad_norm": 2.9969867158207175, "learning_rate": 1.4176905742178178e-06, "loss": 0.2858, "step": 1348 }, { "epoch": 0.38, "grad_norm": 2.469367670387917, "learning_rate": 1.4168565048766473e-06, "loss": 0.3216, "step": 1349 }, { "epoch": 0.38, "grad_norm": 2.433107326687296, "learning_rate": 1.4160220844017873e-06, "loss": 0.3011, "step": 1350 }, { "epoch": 0.38, "grad_norm": 2.44869944836565, "learning_rate": 1.4151873134961011e-06, "loss": 0.2823, "step": 1351 }, { "epoch": 0.38, "grad_norm": 2.3356654813249977, "learning_rate": 1.4143521928627477e-06, "loss": 0.2743, "step": 1352 }, { "epoch": 0.38, "grad_norm": 2.4618631831211477, "learning_rate": 1.41351672320518e-06, "loss": 0.3103, "step": 1353 }, { "epoch": 0.38, "grad_norm": 2.3728132538728643, "learning_rate": 1.4126809052271451e-06, "loss": 0.2894, "step": 1354 }, { "epoch": 0.38, "grad_norm": 2.1921637488839747, "learning_rate": 1.411844739632683e-06, "loss": 0.2991, "step": 1355 }, { "epoch": 0.38, "grad_norm": 2.2401465864522305, "learning_rate": 1.4110082271261277e-06, "loss": 0.2916, "step": 1356 }, { "epoch": 0.38, "grad_norm": 2.1374788781683236, "learning_rate": 1.410171368412104e-06, "loss": 0.27, "step": 1357 }, { "epoch": 0.38, "grad_norm": 2.2226597845776324, "learning_rate": 1.4093341641955296e-06, "loss": 0.313, "step": 1358 }, { "epoch": 0.39, "grad_norm": 2.2209502544829482, "learning_rate": 1.4084966151816122e-06, "loss": 0.3006, "step": 1359 }, { "epoch": 0.39, "grad_norm": 2.2293956956621597, "learning_rate": 1.4076587220758508e-06, "loss": 0.2945, "step": 1360 }, { "epoch": 0.39, "grad_norm": 2.469630659844368, "learning_rate": 1.4068204855840336e-06, "loss": 0.2862, "step": 1361 }, { "epoch": 0.39, "grad_norm": 2.7961374088455777, "learning_rate": 1.405981906412238e-06, "loss": 0.3437, "step": 1362 }, { "epoch": 0.39, "grad_norm": 2.422097199933655, "learning_rate": 1.4051429852668311e-06, "loss": 0.2894, "step": 1363 }, { "epoch": 0.39, "grad_norm": 2.6518743875649915, "learning_rate": 1.4043037228544665e-06, "loss": 0.3017, "step": 1364 }, { "epoch": 0.39, "grad_norm": 2.5012891328415634, "learning_rate": 1.4034641198820865e-06, "loss": 0.3169, "step": 1365 }, { "epoch": 0.39, "grad_norm": 2.3793187249527525, "learning_rate": 1.4026241770569196e-06, "loss": 0.2922, "step": 1366 }, { "epoch": 0.39, "grad_norm": 2.4635007290354376, "learning_rate": 1.4017838950864806e-06, "loss": 0.3042, "step": 1367 }, { "epoch": 0.39, "grad_norm": 2.4776118195788914, "learning_rate": 1.4009432746785709e-06, "loss": 0.2756, "step": 1368 }, { "epoch": 0.39, "grad_norm": 2.3652753400904305, "learning_rate": 1.4001023165412753e-06, "loss": 0.3089, "step": 1369 }, { "epoch": 0.39, "grad_norm": 2.4321634002156793, "learning_rate": 1.3992610213829648e-06, "loss": 0.3024, "step": 1370 }, { "epoch": 0.39, "grad_norm": 2.3618306833159903, "learning_rate": 1.3984193899122932e-06, "loss": 0.2753, "step": 1371 }, { "epoch": 0.39, "grad_norm": 2.462533985172125, "learning_rate": 1.3975774228381974e-06, "loss": 0.3393, "step": 1372 }, { "epoch": 0.39, "grad_norm": 2.3130478385308115, "learning_rate": 1.3967351208698982e-06, "loss": 0.2861, "step": 1373 }, { "epoch": 0.39, "grad_norm": 2.4529100901465632, "learning_rate": 1.3958924847168977e-06, "loss": 0.3029, "step": 1374 }, { "epoch": 0.39, "grad_norm": 2.424126592675069, "learning_rate": 1.3950495150889793e-06, "loss": 0.3359, "step": 1375 }, { "epoch": 0.39, "grad_norm": 2.3408333339200533, "learning_rate": 1.3942062126962075e-06, "loss": 0.2858, "step": 1376 }, { "epoch": 0.39, "grad_norm": 2.5154163908242535, "learning_rate": 1.3933625782489274e-06, "loss": 0.3013, "step": 1377 }, { "epoch": 0.39, "grad_norm": 2.1942197379970936, "learning_rate": 1.3925186124577637e-06, "loss": 0.2779, "step": 1378 }, { "epoch": 0.39, "grad_norm": 2.573288722683661, "learning_rate": 1.3916743160336195e-06, "loss": 0.2878, "step": 1379 }, { "epoch": 0.39, "grad_norm": 2.3597427022892417, "learning_rate": 1.3908296896876776e-06, "loss": 0.2953, "step": 1380 }, { "epoch": 0.39, "grad_norm": 2.6721873029691863, "learning_rate": 1.389984734131398e-06, "loss": 0.2873, "step": 1381 }, { "epoch": 0.39, "grad_norm": 3.1997325497798492, "learning_rate": 1.389139450076518e-06, "loss": 0.2886, "step": 1382 }, { "epoch": 0.39, "grad_norm": 2.829582003955937, "learning_rate": 1.388293838235051e-06, "loss": 0.2966, "step": 1383 }, { "epoch": 0.39, "grad_norm": 2.407857097243499, "learning_rate": 1.3874478993192885e-06, "loss": 0.2855, "step": 1384 }, { "epoch": 0.39, "grad_norm": 2.3886683252466736, "learning_rate": 1.3866016340417951e-06, "loss": 0.2789, "step": 1385 }, { "epoch": 0.39, "grad_norm": 2.5464389702961645, "learning_rate": 1.385755043115412e-06, "loss": 0.3101, "step": 1386 }, { "epoch": 0.39, "grad_norm": 2.316650930929904, "learning_rate": 1.3849081272532544e-06, "loss": 0.3025, "step": 1387 }, { "epoch": 0.39, "grad_norm": 2.4580772632167855, "learning_rate": 1.3840608871687102e-06, "loss": 0.2894, "step": 1388 }, { "epoch": 0.39, "grad_norm": 2.5338692072902522, "learning_rate": 1.3832133235754415e-06, "loss": 0.2948, "step": 1389 }, { "epoch": 0.39, "grad_norm": 2.417527611985879, "learning_rate": 1.3823654371873825e-06, "loss": 0.2873, "step": 1390 }, { "epoch": 0.39, "grad_norm": 2.4989649967624388, "learning_rate": 1.3815172287187393e-06, "loss": 0.2881, "step": 1391 }, { "epoch": 0.39, "grad_norm": 2.5528812441155635, "learning_rate": 1.3806686988839896e-06, "loss": 0.2971, "step": 1392 }, { "epoch": 0.39, "grad_norm": 2.5574316457528656, "learning_rate": 1.3798198483978813e-06, "loss": 0.2848, "step": 1393 }, { "epoch": 0.39, "grad_norm": 2.50493766439196, "learning_rate": 1.3789706779754324e-06, "loss": 0.308, "step": 1394 }, { "epoch": 0.4, "grad_norm": 2.2752842924157215, "learning_rate": 1.3781211883319312e-06, "loss": 0.2848, "step": 1395 }, { "epoch": 0.4, "grad_norm": 2.381552220394106, "learning_rate": 1.3772713801829336e-06, "loss": 0.2649, "step": 1396 }, { "epoch": 0.4, "grad_norm": 2.319700989497983, "learning_rate": 1.3764212542442655e-06, "loss": 0.2681, "step": 1397 }, { "epoch": 0.4, "grad_norm": 2.3685340252808373, "learning_rate": 1.3755708112320185e-06, "loss": 0.3195, "step": 1398 }, { "epoch": 0.4, "grad_norm": 2.356275959928643, "learning_rate": 1.3747200518625529e-06, "loss": 0.3112, "step": 1399 }, { "epoch": 0.4, "grad_norm": 2.4613648289880636, "learning_rate": 1.3738689768524944e-06, "loss": 0.2868, "step": 1400 }, { "epoch": 0.4, "grad_norm": 2.636826673880524, "learning_rate": 1.3730175869187356e-06, "loss": 0.3307, "step": 1401 }, { "epoch": 0.4, "grad_norm": 2.185141789325747, "learning_rate": 1.3721658827784333e-06, "loss": 0.277, "step": 1402 }, { "epoch": 0.4, "grad_norm": 2.284884901630232, "learning_rate": 1.37131386514901e-06, "loss": 0.2662, "step": 1403 }, { "epoch": 0.4, "grad_norm": 2.5056304950275905, "learning_rate": 1.370461534748151e-06, "loss": 0.2736, "step": 1404 }, { "epoch": 0.4, "grad_norm": 2.6112929606416824, "learning_rate": 1.3696088922938063e-06, "loss": 0.3041, "step": 1405 }, { "epoch": 0.4, "grad_norm": 2.518370343635079, "learning_rate": 1.3687559385041883e-06, "loss": 0.2924, "step": 1406 }, { "epoch": 0.4, "grad_norm": 2.452505799257892, "learning_rate": 1.3679026740977716e-06, "loss": 0.3219, "step": 1407 }, { "epoch": 0.4, "grad_norm": 2.3448228996764517, "learning_rate": 1.367049099793292e-06, "loss": 0.2965, "step": 1408 }, { "epoch": 0.4, "grad_norm": 2.5390555374689074, "learning_rate": 1.3661952163097472e-06, "loss": 0.2962, "step": 1409 }, { "epoch": 0.4, "grad_norm": 2.481322721652436, "learning_rate": 1.3653410243663951e-06, "loss": 0.2905, "step": 1410 }, { "epoch": 0.4, "grad_norm": 2.686838626482608, "learning_rate": 1.3644865246827527e-06, "loss": 0.3123, "step": 1411 }, { "epoch": 0.4, "grad_norm": 2.384974185059098, "learning_rate": 1.363631717978597e-06, "loss": 0.2874, "step": 1412 }, { "epoch": 0.4, "grad_norm": 2.5990639729189606, "learning_rate": 1.3627766049739633e-06, "loss": 0.3079, "step": 1413 }, { "epoch": 0.4, "grad_norm": 2.3712178342874344, "learning_rate": 1.3619211863891456e-06, "loss": 0.3038, "step": 1414 }, { "epoch": 0.4, "grad_norm": 2.524361510330734, "learning_rate": 1.3610654629446936e-06, "loss": 0.336, "step": 1415 }, { "epoch": 0.4, "grad_norm": 2.0728242696264614, "learning_rate": 1.3602094353614158e-06, "loss": 0.2532, "step": 1416 }, { "epoch": 0.4, "grad_norm": 2.3929540560511, "learning_rate": 1.3593531043603755e-06, "loss": 0.2928, "step": 1417 }, { "epoch": 0.4, "grad_norm": 2.2288066754224736, "learning_rate": 1.3584964706628921e-06, "loss": 0.2806, "step": 1418 }, { "epoch": 0.4, "grad_norm": 2.3635592025311642, "learning_rate": 1.35763953499054e-06, "loss": 0.2875, "step": 1419 }, { "epoch": 0.4, "grad_norm": 2.3988417976914302, "learning_rate": 1.356782298065148e-06, "loss": 0.2665, "step": 1420 }, { "epoch": 0.4, "grad_norm": 2.3297542920898744, "learning_rate": 1.3559247606087984e-06, "loss": 0.2878, "step": 1421 }, { "epoch": 0.4, "grad_norm": 2.326689143317108, "learning_rate": 1.355066923343827e-06, "loss": 0.2892, "step": 1422 }, { "epoch": 0.4, "grad_norm": 2.3010274507614974, "learning_rate": 1.3542087869928213e-06, "loss": 0.2607, "step": 1423 }, { "epoch": 0.4, "grad_norm": 2.6613557163218498, "learning_rate": 1.3533503522786223e-06, "loss": 0.3039, "step": 1424 }, { "epoch": 0.4, "grad_norm": 2.4466499629846106, "learning_rate": 1.3524916199243208e-06, "loss": 0.3095, "step": 1425 }, { "epoch": 0.4, "grad_norm": 2.547045510727559, "learning_rate": 1.351632590653259e-06, "loss": 0.28, "step": 1426 }, { "epoch": 0.4, "grad_norm": 2.2667498039894647, "learning_rate": 1.3507732651890291e-06, "loss": 0.2992, "step": 1427 }, { "epoch": 0.4, "grad_norm": 2.49008724136304, "learning_rate": 1.349913644255473e-06, "loss": 0.279, "step": 1428 }, { "epoch": 0.4, "grad_norm": 2.719503341700533, "learning_rate": 1.3490537285766808e-06, "loss": 0.2948, "step": 1429 }, { "epoch": 0.41, "grad_norm": 2.4092554173724507, "learning_rate": 1.3481935188769917e-06, "loss": 0.2996, "step": 1430 }, { "epoch": 0.41, "grad_norm": 2.4216625872037585, "learning_rate": 1.3473330158809924e-06, "loss": 0.282, "step": 1431 }, { "epoch": 0.41, "grad_norm": 2.1808475970040537, "learning_rate": 1.346472220313516e-06, "loss": 0.2887, "step": 1432 }, { "epoch": 0.41, "grad_norm": 2.3738790822075817, "learning_rate": 1.3456111328996428e-06, "loss": 0.2629, "step": 1433 }, { "epoch": 0.41, "grad_norm": 2.461742161075914, "learning_rate": 1.344749754364699e-06, "loss": 0.3071, "step": 1434 }, { "epoch": 0.41, "grad_norm": 2.458297651337914, "learning_rate": 1.343888085434255e-06, "loss": 0.3058, "step": 1435 }, { "epoch": 0.41, "grad_norm": 2.3528781827973866, "learning_rate": 1.343026126834127e-06, "loss": 0.2796, "step": 1436 }, { "epoch": 0.41, "grad_norm": 2.4443019851876113, "learning_rate": 1.3421638792903743e-06, "loss": 0.3051, "step": 1437 }, { "epoch": 0.41, "grad_norm": 2.4157361534282247, "learning_rate": 1.3413013435293002e-06, "loss": 0.3129, "step": 1438 }, { "epoch": 0.41, "grad_norm": 2.6256696001134663, "learning_rate": 1.3404385202774504e-06, "loss": 0.3422, "step": 1439 }, { "epoch": 0.41, "grad_norm": 2.494653539752748, "learning_rate": 1.3395754102616133e-06, "loss": 0.3017, "step": 1440 }, { "epoch": 0.41, "grad_norm": 2.533547772461133, "learning_rate": 1.338712014208818e-06, "loss": 0.3079, "step": 1441 }, { "epoch": 0.41, "grad_norm": 2.2545374483944847, "learning_rate": 1.3378483328463351e-06, "loss": 0.2881, "step": 1442 }, { "epoch": 0.41, "grad_norm": 2.2384615081575245, "learning_rate": 1.3369843669016756e-06, "loss": 0.2581, "step": 1443 }, { "epoch": 0.41, "grad_norm": 2.3973511771959712, "learning_rate": 1.33612011710259e-06, "loss": 0.2767, "step": 1444 }, { "epoch": 0.41, "grad_norm": 2.6139147945528984, "learning_rate": 1.335255584177068e-06, "loss": 0.3135, "step": 1445 }, { "epoch": 0.41, "grad_norm": 2.5360027379877477, "learning_rate": 1.3343907688533375e-06, "loss": 0.2835, "step": 1446 }, { "epoch": 0.41, "grad_norm": 2.338124638027161, "learning_rate": 1.333525671859865e-06, "loss": 0.2852, "step": 1447 }, { "epoch": 0.41, "grad_norm": 2.4812362246871094, "learning_rate": 1.332660293925353e-06, "loss": 0.3102, "step": 1448 }, { "epoch": 0.41, "grad_norm": 2.444894602358509, "learning_rate": 1.3317946357787424e-06, "loss": 0.302, "step": 1449 }, { "epoch": 0.41, "grad_norm": 2.3811549830187424, "learning_rate": 1.3309286981492082e-06, "loss": 0.2827, "step": 1450 }, { "epoch": 0.41, "grad_norm": 4.15688641411048, "learning_rate": 1.3300624817661626e-06, "loss": 0.2674, "step": 1451 }, { "epoch": 0.41, "grad_norm": 2.365390398206374, "learning_rate": 1.3291959873592507e-06, "loss": 0.2953, "step": 1452 }, { "epoch": 0.41, "grad_norm": 2.3143132163507927, "learning_rate": 1.328329215658354e-06, "loss": 0.2969, "step": 1453 }, { "epoch": 0.41, "grad_norm": 2.3680841714427667, "learning_rate": 1.327462167393586e-06, "loss": 0.2858, "step": 1454 }, { "epoch": 0.41, "grad_norm": 2.4333096082055556, "learning_rate": 1.3265948432952934e-06, "loss": 0.3119, "step": 1455 }, { "epoch": 0.41, "grad_norm": 2.3979577441652404, "learning_rate": 1.3257272440940556e-06, "loss": 0.2716, "step": 1456 }, { "epoch": 0.41, "grad_norm": 2.780283482227355, "learning_rate": 1.3248593705206837e-06, "loss": 0.3097, "step": 1457 }, { "epoch": 0.41, "grad_norm": 2.4727707069996874, "learning_rate": 1.3239912233062195e-06, "loss": 0.2949, "step": 1458 }, { "epoch": 0.41, "grad_norm": 2.605144018890558, "learning_rate": 1.3231228031819358e-06, "loss": 0.3038, "step": 1459 }, { "epoch": 0.41, "grad_norm": 2.30781291279654, "learning_rate": 1.322254110879335e-06, "loss": 0.3085, "step": 1460 }, { "epoch": 0.41, "grad_norm": 2.4557382603995013, "learning_rate": 1.321385147130149e-06, "loss": 0.3176, "step": 1461 }, { "epoch": 0.41, "grad_norm": 2.480506129867659, "learning_rate": 1.320515912666338e-06, "loss": 0.3305, "step": 1462 }, { "epoch": 0.41, "grad_norm": 2.5727163800612325, "learning_rate": 1.3196464082200901e-06, "loss": 0.2969, "step": 1463 }, { "epoch": 0.41, "grad_norm": 2.1255081785167396, "learning_rate": 1.318776634523822e-06, "loss": 0.2373, "step": 1464 }, { "epoch": 0.42, "grad_norm": 2.5116755100727595, "learning_rate": 1.3179065923101757e-06, "loss": 0.2783, "step": 1465 }, { "epoch": 0.42, "grad_norm": 2.2406852442419862, "learning_rate": 1.3170362823120202e-06, "loss": 0.2865, "step": 1466 }, { "epoch": 0.42, "grad_norm": 2.357655391327485, "learning_rate": 1.3161657052624496e-06, "loss": 0.2699, "step": 1467 }, { "epoch": 0.42, "grad_norm": 2.4125721643155313, "learning_rate": 1.3152948618947836e-06, "loss": 0.315, "step": 1468 }, { "epoch": 0.42, "grad_norm": 2.5164883344248845, "learning_rate": 1.3144237529425652e-06, "loss": 0.3061, "step": 1469 }, { "epoch": 0.42, "grad_norm": 2.3188123708680113, "learning_rate": 1.313552379139563e-06, "loss": 0.2862, "step": 1470 }, { "epoch": 0.42, "grad_norm": 2.3745493691138244, "learning_rate": 1.3126807412197664e-06, "loss": 0.3067, "step": 1471 }, { "epoch": 0.42, "grad_norm": 2.50456059069696, "learning_rate": 1.3118088399173886e-06, "loss": 0.2483, "step": 1472 }, { "epoch": 0.42, "grad_norm": 2.4560490015824437, "learning_rate": 1.3109366759668646e-06, "loss": 0.2723, "step": 1473 }, { "epoch": 0.42, "grad_norm": 2.5134191311678173, "learning_rate": 1.31006425010285e-06, "loss": 0.3037, "step": 1474 }, { "epoch": 0.42, "grad_norm": 2.3293536029879425, "learning_rate": 1.3091915630602222e-06, "loss": 0.2851, "step": 1475 }, { "epoch": 0.42, "grad_norm": 2.3190242087963773, "learning_rate": 1.308318615574077e-06, "loss": 0.2658, "step": 1476 }, { "epoch": 0.42, "grad_norm": 2.3274324770049466, "learning_rate": 1.3074454083797307e-06, "loss": 0.2773, "step": 1477 }, { "epoch": 0.42, "grad_norm": 2.4771135962414292, "learning_rate": 1.3065719422127185e-06, "loss": 0.302, "step": 1478 }, { "epoch": 0.42, "grad_norm": 2.3441652217311866, "learning_rate": 1.3056982178087933e-06, "loss": 0.291, "step": 1479 }, { "epoch": 0.42, "grad_norm": 2.5380690988427403, "learning_rate": 1.3048242359039247e-06, "loss": 0.3169, "step": 1480 }, { "epoch": 0.42, "grad_norm": 2.2385674340764274, "learning_rate": 1.303949997234301e-06, "loss": 0.2914, "step": 1481 }, { "epoch": 0.42, "grad_norm": 2.5364022580181365, "learning_rate": 1.3030755025363255e-06, "loss": 0.306, "step": 1482 }, { "epoch": 0.42, "grad_norm": 2.446531539254504, "learning_rate": 1.3022007525466179e-06, "loss": 0.336, "step": 1483 }, { "epoch": 0.42, "grad_norm": 2.4545408603051793, "learning_rate": 1.3013257480020114e-06, "loss": 0.2919, "step": 1484 }, { "epoch": 0.42, "grad_norm": 2.3489981204127623, "learning_rate": 1.3004504896395562e-06, "loss": 0.2909, "step": 1485 }, { "epoch": 0.42, "grad_norm": 3.3814888503667264, "learning_rate": 1.2995749781965136e-06, "loss": 0.2893, "step": 1486 }, { "epoch": 0.42, "grad_norm": 2.419910649136688, "learning_rate": 1.2986992144103606e-06, "loss": 0.2879, "step": 1487 }, { "epoch": 0.42, "grad_norm": 2.619557910339679, "learning_rate": 1.2978231990187847e-06, "loss": 0.2802, "step": 1488 }, { "epoch": 0.42, "grad_norm": 2.2204889549475744, "learning_rate": 1.2969469327596859e-06, "loss": 0.2897, "step": 1489 }, { "epoch": 0.42, "grad_norm": 2.38645753292471, "learning_rate": 1.2960704163711766e-06, "loss": 0.2963, "step": 1490 }, { "epoch": 0.42, "grad_norm": 2.47586048779542, "learning_rate": 1.2951936505915781e-06, "loss": 0.2775, "step": 1491 }, { "epoch": 0.42, "grad_norm": 2.406803415941365, "learning_rate": 1.294316636159424e-06, "loss": 0.2906, "step": 1492 }, { "epoch": 0.42, "grad_norm": 2.3808072486060525, "learning_rate": 1.2934393738134546e-06, "loss": 0.299, "step": 1493 }, { "epoch": 0.42, "grad_norm": 2.5512356401014133, "learning_rate": 1.2925618642926218e-06, "loss": 0.3172, "step": 1494 }, { "epoch": 0.42, "grad_norm": 2.160246048496873, "learning_rate": 1.2916841083360834e-06, "loss": 0.268, "step": 1495 }, { "epoch": 0.42, "grad_norm": 2.537746401420283, "learning_rate": 1.2908061066832063e-06, "loss": 0.2998, "step": 1496 }, { "epoch": 0.42, "grad_norm": 2.522608636766397, "learning_rate": 1.289927860073564e-06, "loss": 0.2946, "step": 1497 }, { "epoch": 0.42, "grad_norm": 2.371012661692452, "learning_rate": 1.2890493692469356e-06, "loss": 0.3024, "step": 1498 }, { "epoch": 0.42, "grad_norm": 2.450688539105884, "learning_rate": 1.2881706349433067e-06, "loss": 0.2836, "step": 1499 }, { "epoch": 0.43, "grad_norm": 2.4682318267739607, "learning_rate": 1.2872916579028684e-06, "loss": 0.3102, "step": 1500 }, { "epoch": 0.43, "grad_norm": 2.43757520685359, "learning_rate": 1.2864124388660146e-06, "loss": 0.2905, "step": 1501 }, { "epoch": 0.43, "grad_norm": 2.790709881138362, "learning_rate": 1.2855329785733452e-06, "loss": 0.3042, "step": 1502 }, { "epoch": 0.43, "grad_norm": 2.4479829710987713, "learning_rate": 1.2846532777656612e-06, "loss": 0.2789, "step": 1503 }, { "epoch": 0.43, "grad_norm": 2.375342138833745, "learning_rate": 1.2837733371839678e-06, "loss": 0.2797, "step": 1504 }, { "epoch": 0.43, "grad_norm": 2.2951167449970327, "learning_rate": 1.2828931575694718e-06, "loss": 0.2895, "step": 1505 }, { "epoch": 0.43, "grad_norm": 2.4220238351531425, "learning_rate": 1.2820127396635801e-06, "loss": 0.2767, "step": 1506 }, { "epoch": 0.43, "grad_norm": 2.390590180143157, "learning_rate": 1.2811320842079026e-06, "loss": 0.2845, "step": 1507 }, { "epoch": 0.43, "grad_norm": 2.227092280010057, "learning_rate": 1.2802511919442468e-06, "loss": 0.2727, "step": 1508 }, { "epoch": 0.43, "grad_norm": 2.9350175829138374, "learning_rate": 1.279370063614622e-06, "loss": 0.3165, "step": 1509 }, { "epoch": 0.43, "grad_norm": 2.3556655543461025, "learning_rate": 1.2784886999612347e-06, "loss": 0.2972, "step": 1510 }, { "epoch": 0.43, "grad_norm": 2.4145727803902792, "learning_rate": 1.2776071017264907e-06, "loss": 0.3229, "step": 1511 }, { "epoch": 0.43, "grad_norm": 2.2701438676974655, "learning_rate": 1.276725269652992e-06, "loss": 0.2972, "step": 1512 }, { "epoch": 0.43, "grad_norm": 2.279322336462779, "learning_rate": 1.275843204483539e-06, "loss": 0.2831, "step": 1513 }, { "epoch": 0.43, "grad_norm": 2.2549254803317544, "learning_rate": 1.274960906961128e-06, "loss": 0.2961, "step": 1514 }, { "epoch": 0.43, "grad_norm": 2.410217984272276, "learning_rate": 1.2740783778289505e-06, "loss": 0.3046, "step": 1515 }, { "epoch": 0.43, "grad_norm": 2.5502149712168185, "learning_rate": 1.273195617830394e-06, "loss": 0.2885, "step": 1516 }, { "epoch": 0.43, "grad_norm": 2.1287324402745083, "learning_rate": 1.2723126277090395e-06, "loss": 0.2714, "step": 1517 }, { "epoch": 0.43, "grad_norm": 2.5024724957081674, "learning_rate": 1.2714294082086627e-06, "loss": 0.3027, "step": 1518 }, { "epoch": 0.43, "grad_norm": 2.403173741361718, "learning_rate": 1.2705459600732317e-06, "loss": 0.3082, "step": 1519 }, { "epoch": 0.43, "grad_norm": 2.338355201173731, "learning_rate": 1.2696622840469081e-06, "loss": 0.2848, "step": 1520 }, { "epoch": 0.43, "grad_norm": 2.3525510876983664, "learning_rate": 1.2687783808740448e-06, "loss": 0.2715, "step": 1521 }, { "epoch": 0.43, "grad_norm": 2.280452801928687, "learning_rate": 1.2678942512991864e-06, "loss": 0.272, "step": 1522 }, { "epoch": 0.43, "grad_norm": 2.2859179747537994, "learning_rate": 1.2670098960670674e-06, "loss": 0.2747, "step": 1523 }, { "epoch": 0.43, "grad_norm": 2.303075929644119, "learning_rate": 1.2661253159226138e-06, "loss": 0.2929, "step": 1524 }, { "epoch": 0.43, "grad_norm": 2.3018200914697458, "learning_rate": 1.2652405116109393e-06, "loss": 0.3046, "step": 1525 }, { "epoch": 0.43, "grad_norm": 2.8220611835937754, "learning_rate": 1.2643554838773486e-06, "loss": 0.3014, "step": 1526 }, { "epoch": 0.43, "grad_norm": 2.3777860887932234, "learning_rate": 1.263470233467332e-06, "loss": 0.3024, "step": 1527 }, { "epoch": 0.43, "grad_norm": 2.397804821536525, "learning_rate": 1.26258476112657e-06, "loss": 0.3062, "step": 1528 }, { "epoch": 0.43, "grad_norm": 2.218905959145441, "learning_rate": 1.261699067600928e-06, "loss": 0.2794, "step": 1529 }, { "epoch": 0.43, "grad_norm": 2.463662208750161, "learning_rate": 1.2608131536364589e-06, "loss": 0.285, "step": 1530 }, { "epoch": 0.43, "grad_norm": 2.336920618973469, "learning_rate": 1.2599270199794006e-06, "loss": 0.2871, "step": 1531 }, { "epoch": 0.43, "grad_norm": 2.4511676946774426, "learning_rate": 1.259040667376176e-06, "loss": 0.2744, "step": 1532 }, { "epoch": 0.43, "grad_norm": 2.445524281717222, "learning_rate": 1.2581540965733939e-06, "loss": 0.3044, "step": 1533 }, { "epoch": 0.43, "grad_norm": 2.481605303885608, "learning_rate": 1.2572673083178447e-06, "loss": 0.2677, "step": 1534 }, { "epoch": 0.43, "grad_norm": 2.3038645377681823, "learning_rate": 1.2563803033565032e-06, "loss": 0.2978, "step": 1535 }, { "epoch": 0.44, "grad_norm": 2.521409954147198, "learning_rate": 1.255493082436527e-06, "loss": 0.3202, "step": 1536 }, { "epoch": 0.44, "grad_norm": 2.3247669770557176, "learning_rate": 1.2546056463052548e-06, "loss": 0.2797, "step": 1537 }, { "epoch": 0.44, "grad_norm": 2.284580868757689, "learning_rate": 1.2537179957102074e-06, "loss": 0.2746, "step": 1538 }, { "epoch": 0.44, "grad_norm": 3.3220609927468523, "learning_rate": 1.2528301313990853e-06, "loss": 0.2953, "step": 1539 }, { "epoch": 0.44, "grad_norm": 2.3514220186854273, "learning_rate": 1.2519420541197693e-06, "loss": 0.2994, "step": 1540 }, { "epoch": 0.44, "grad_norm": 2.6177756144523845, "learning_rate": 1.2510537646203207e-06, "loss": 0.2957, "step": 1541 }, { "epoch": 0.44, "grad_norm": 2.3045538164229953, "learning_rate": 1.2501652636489778e-06, "loss": 0.2742, "step": 1542 }, { "epoch": 0.44, "grad_norm": 2.5292210544986977, "learning_rate": 1.249276551954159e-06, "loss": 0.2926, "step": 1543 }, { "epoch": 0.44, "grad_norm": 2.397955383256873, "learning_rate": 1.2483876302844578e-06, "loss": 0.31, "step": 1544 }, { "epoch": 0.44, "grad_norm": 2.4427886972425847, "learning_rate": 1.2474984993886465e-06, "loss": 0.3102, "step": 1545 }, { "epoch": 0.44, "grad_norm": 2.494204157300474, "learning_rate": 1.2466091600156733e-06, "loss": 0.2729, "step": 1546 }, { "epoch": 0.44, "grad_norm": 2.332220076619459, "learning_rate": 1.2457196129146615e-06, "loss": 0.2446, "step": 1547 }, { "epoch": 0.44, "grad_norm": 2.2296539060987866, "learning_rate": 1.2448298588349096e-06, "loss": 0.2756, "step": 1548 }, { "epoch": 0.44, "grad_norm": 2.4852006453223505, "learning_rate": 1.2439398985258897e-06, "loss": 0.3134, "step": 1549 }, { "epoch": 0.44, "grad_norm": 2.4450105242836138, "learning_rate": 1.24304973273725e-06, "loss": 0.2894, "step": 1550 }, { "epoch": 0.44, "grad_norm": 2.1789569148926184, "learning_rate": 1.2421593622188086e-06, "loss": 0.2926, "step": 1551 }, { "epoch": 0.44, "grad_norm": 2.125580836250774, "learning_rate": 1.2412687877205585e-06, "loss": 0.2825, "step": 1552 }, { "epoch": 0.44, "grad_norm": 2.419826962755061, "learning_rate": 1.2403780099926633e-06, "loss": 0.2975, "step": 1553 }, { "epoch": 0.44, "grad_norm": 2.335424495673667, "learning_rate": 1.2394870297854581e-06, "loss": 0.2841, "step": 1554 }, { "epoch": 0.44, "grad_norm": 2.1971134981537985, "learning_rate": 1.2385958478494484e-06, "loss": 0.3257, "step": 1555 }, { "epoch": 0.44, "grad_norm": 2.6420619435757744, "learning_rate": 1.2377044649353102e-06, "loss": 0.3272, "step": 1556 }, { "epoch": 0.44, "grad_norm": 2.6924042663801266, "learning_rate": 1.2368128817938882e-06, "loss": 0.2752, "step": 1557 }, { "epoch": 0.44, "grad_norm": 2.3389639420764436, "learning_rate": 1.2359210991761956e-06, "loss": 0.2989, "step": 1558 }, { "epoch": 0.44, "grad_norm": 2.466397462305249, "learning_rate": 1.2350291178334144e-06, "loss": 0.2997, "step": 1559 }, { "epoch": 0.44, "grad_norm": 2.448220145119163, "learning_rate": 1.2341369385168935e-06, "loss": 0.2712, "step": 1560 }, { "epoch": 0.44, "grad_norm": 2.45491872341095, "learning_rate": 1.2332445619781489e-06, "loss": 0.2982, "step": 1561 }, { "epoch": 0.44, "grad_norm": 2.397589962456345, "learning_rate": 1.2323519889688614e-06, "loss": 0.2792, "step": 1562 }, { "epoch": 0.44, "grad_norm": 2.3142801661017898, "learning_rate": 1.2314592202408795e-06, "loss": 0.2755, "step": 1563 }, { "epoch": 0.44, "grad_norm": 2.527326221140043, "learning_rate": 1.2305662565462144e-06, "loss": 0.2994, "step": 1564 }, { "epoch": 0.44, "grad_norm": 2.3697314137556154, "learning_rate": 1.2296730986370436e-06, "loss": 0.2737, "step": 1565 }, { "epoch": 0.44, "grad_norm": 2.8244358950556676, "learning_rate": 1.2287797472657063e-06, "loss": 0.2652, "step": 1566 }, { "epoch": 0.44, "grad_norm": 2.5992271577632464, "learning_rate": 1.2278862031847059e-06, "loss": 0.3089, "step": 1567 }, { "epoch": 0.44, "grad_norm": 2.2422228745894803, "learning_rate": 1.2269924671467073e-06, "loss": 0.2793, "step": 1568 }, { "epoch": 0.44, "grad_norm": 2.6203714518020855, "learning_rate": 1.226098539904538e-06, "loss": 0.317, "step": 1569 }, { "epoch": 0.44, "grad_norm": 2.616453062567964, "learning_rate": 1.2252044222111857e-06, "loss": 0.2953, "step": 1570 }, { "epoch": 0.45, "grad_norm": 2.160665725191548, "learning_rate": 1.2243101148197989e-06, "loss": 0.2658, "step": 1571 }, { "epoch": 0.45, "grad_norm": 2.2669549608244024, "learning_rate": 1.223415618483686e-06, "loss": 0.2762, "step": 1572 }, { "epoch": 0.45, "grad_norm": 2.1309593576379817, "learning_rate": 1.2225209339563143e-06, "loss": 0.2691, "step": 1573 }, { "epoch": 0.45, "grad_norm": 2.502580046773122, "learning_rate": 1.22162606199131e-06, "loss": 0.3064, "step": 1574 }, { "epoch": 0.45, "grad_norm": 2.183455320755513, "learning_rate": 1.2207310033424566e-06, "loss": 0.2644, "step": 1575 }, { "epoch": 0.45, "grad_norm": 2.0960195827035477, "learning_rate": 1.2198357587636956e-06, "loss": 0.2653, "step": 1576 }, { "epoch": 0.45, "grad_norm": 2.8250176006360355, "learning_rate": 1.2189403290091244e-06, "loss": 0.3265, "step": 1577 }, { "epoch": 0.45, "grad_norm": 2.3678674723497974, "learning_rate": 1.218044714832997e-06, "loss": 0.2833, "step": 1578 }, { "epoch": 0.45, "grad_norm": 2.2569556247463827, "learning_rate": 1.2171489169897215e-06, "loss": 0.2895, "step": 1579 }, { "epoch": 0.45, "grad_norm": 2.812494247020307, "learning_rate": 1.2162529362338631e-06, "loss": 0.3053, "step": 1580 }, { "epoch": 0.45, "grad_norm": 2.2127372881951244, "learning_rate": 1.2153567733201383e-06, "loss": 0.2867, "step": 1581 }, { "epoch": 0.45, "grad_norm": 2.426303976237553, "learning_rate": 1.214460429003419e-06, "loss": 0.2647, "step": 1582 }, { "epoch": 0.45, "grad_norm": 2.39029073034326, "learning_rate": 1.213563904038729e-06, "loss": 0.3349, "step": 1583 }, { "epoch": 0.45, "grad_norm": 2.3123384990964833, "learning_rate": 1.2126671991812447e-06, "loss": 0.2903, "step": 1584 }, { "epoch": 0.45, "grad_norm": 2.3166156240565745, "learning_rate": 1.2117703151862939e-06, "loss": 0.2993, "step": 1585 }, { "epoch": 0.45, "grad_norm": 2.4677299405381614, "learning_rate": 1.2108732528093549e-06, "loss": 0.3073, "step": 1586 }, { "epoch": 0.45, "grad_norm": 2.4192821491062335, "learning_rate": 1.209976012806057e-06, "loss": 0.299, "step": 1587 }, { "epoch": 0.45, "grad_norm": 2.4357059330636432, "learning_rate": 1.2090785959321781e-06, "loss": 0.2646, "step": 1588 }, { "epoch": 0.45, "grad_norm": 5.522395881502279, "learning_rate": 1.2081810029436468e-06, "loss": 0.2937, "step": 1589 }, { "epoch": 0.45, "grad_norm": 2.321652531861596, "learning_rate": 1.207283234596538e-06, "loss": 0.3025, "step": 1590 }, { "epoch": 0.45, "grad_norm": 2.446193398181088, "learning_rate": 1.2063852916470753e-06, "loss": 0.3041, "step": 1591 }, { "epoch": 0.45, "grad_norm": 2.551171821649562, "learning_rate": 1.20548717485163e-06, "loss": 0.3013, "step": 1592 }, { "epoch": 0.45, "grad_norm": 2.2392588623142875, "learning_rate": 1.2045888849667185e-06, "loss": 0.2993, "step": 1593 }, { "epoch": 0.45, "grad_norm": 2.377514804296935, "learning_rate": 1.2036904227490041e-06, "loss": 0.3237, "step": 1594 }, { "epoch": 0.45, "grad_norm": 2.4836421658846763, "learning_rate": 1.202791788955295e-06, "loss": 0.2805, "step": 1595 }, { "epoch": 0.45, "grad_norm": 2.4812149257453022, "learning_rate": 1.2018929843425427e-06, "loss": 0.29, "step": 1596 }, { "epoch": 0.45, "grad_norm": 2.7616563865201336, "learning_rate": 1.200994009667845e-06, "loss": 0.2956, "step": 1597 }, { "epoch": 0.45, "grad_norm": 2.75907227360528, "learning_rate": 1.2000948656884407e-06, "loss": 0.3059, "step": 1598 }, { "epoch": 0.45, "grad_norm": 2.4106964339523826, "learning_rate": 1.1991955531617123e-06, "loss": 0.304, "step": 1599 }, { "epoch": 0.45, "grad_norm": 2.4572321494745086, "learning_rate": 1.1982960728451845e-06, "loss": 0.3002, "step": 1600 }, { "epoch": 0.45, "grad_norm": 2.3147931678781486, "learning_rate": 1.1973964254965223e-06, "loss": 0.2984, "step": 1601 }, { "epoch": 0.45, "grad_norm": 2.162842212080869, "learning_rate": 1.196496611873533e-06, "loss": 0.2708, "step": 1602 }, { "epoch": 0.45, "grad_norm": 2.439223299123657, "learning_rate": 1.1955966327341613e-06, "loss": 0.2816, "step": 1603 }, { "epoch": 0.45, "grad_norm": 2.270140418104315, "learning_rate": 1.1946964888364947e-06, "loss": 0.2834, "step": 1604 }, { "epoch": 0.45, "grad_norm": 2.2872796726584594, "learning_rate": 1.1937961809387567e-06, "loss": 0.2927, "step": 1605 }, { "epoch": 0.46, "grad_norm": 2.430768331384572, "learning_rate": 1.192895709799311e-06, "loss": 0.2615, "step": 1606 }, { "epoch": 0.46, "grad_norm": 2.5802386722827864, "learning_rate": 1.1919950761766567e-06, "loss": 0.2601, "step": 1607 }, { "epoch": 0.46, "grad_norm": 2.303315690496668, "learning_rate": 1.1910942808294313e-06, "loss": 0.3003, "step": 1608 }, { "epoch": 0.46, "grad_norm": 2.401745013844011, "learning_rate": 1.1901933245164084e-06, "loss": 0.2738, "step": 1609 }, { "epoch": 0.46, "grad_norm": 2.2269512971196703, "learning_rate": 1.189292207996497e-06, "loss": 0.2859, "step": 1610 }, { "epoch": 0.46, "grad_norm": 2.5530381476933846, "learning_rate": 1.1883909320287403e-06, "loss": 0.3276, "step": 1611 }, { "epoch": 0.46, "grad_norm": 2.552181480306632, "learning_rate": 1.1874894973723171e-06, "loss": 0.2945, "step": 1612 }, { "epoch": 0.46, "grad_norm": 2.4277394972003403, "learning_rate": 1.1865879047865389e-06, "loss": 0.3273, "step": 1613 }, { "epoch": 0.46, "grad_norm": 2.304486641989715, "learning_rate": 1.1856861550308506e-06, "loss": 0.2731, "step": 1614 }, { "epoch": 0.46, "grad_norm": 2.6788843226324546, "learning_rate": 1.1847842488648294e-06, "loss": 0.3032, "step": 1615 }, { "epoch": 0.46, "grad_norm": 2.3709010265119104, "learning_rate": 1.1838821870481846e-06, "loss": 0.2952, "step": 1616 }, { "epoch": 0.46, "grad_norm": 2.3812997004982766, "learning_rate": 1.1829799703407562e-06, "loss": 0.2854, "step": 1617 }, { "epoch": 0.46, "grad_norm": 2.4542133115233544, "learning_rate": 1.1820775995025146e-06, "loss": 0.3001, "step": 1618 }, { "epoch": 0.46, "grad_norm": 2.2291860735070586, "learning_rate": 1.1811750752935604e-06, "loss": 0.2829, "step": 1619 }, { "epoch": 0.46, "grad_norm": 2.3315720405872233, "learning_rate": 1.1802723984741227e-06, "loss": 0.2916, "step": 1620 }, { "epoch": 0.46, "grad_norm": 2.284866993118778, "learning_rate": 1.1793695698045605e-06, "loss": 0.2921, "step": 1621 }, { "epoch": 0.46, "grad_norm": 2.38603967727056, "learning_rate": 1.1784665900453592e-06, "loss": 0.3035, "step": 1622 }, { "epoch": 0.46, "grad_norm": 2.3086528982281633, "learning_rate": 1.1775634599571325e-06, "loss": 0.2809, "step": 1623 }, { "epoch": 0.46, "grad_norm": 2.355332925064751, "learning_rate": 1.1766601803006201e-06, "loss": 0.2996, "step": 1624 }, { "epoch": 0.46, "grad_norm": 2.897899160265012, "learning_rate": 1.1757567518366883e-06, "loss": 0.2976, "step": 1625 }, { "epoch": 0.46, "grad_norm": 2.366508867197264, "learning_rate": 1.174853175326328e-06, "loss": 0.2954, "step": 1626 }, { "epoch": 0.46, "grad_norm": 2.4458650264482884, "learning_rate": 1.1739494515306552e-06, "loss": 0.2743, "step": 1627 }, { "epoch": 0.46, "grad_norm": 3.878447090985886, "learning_rate": 1.17304558121091e-06, "loss": 0.2948, "step": 1628 }, { "epoch": 0.46, "grad_norm": 2.3919404467561116, "learning_rate": 1.1721415651284564e-06, "loss": 0.296, "step": 1629 }, { "epoch": 0.46, "grad_norm": 2.4843121301414333, "learning_rate": 1.1712374040447801e-06, "loss": 0.2939, "step": 1630 }, { "epoch": 0.46, "grad_norm": 2.443680664040836, "learning_rate": 1.1703330987214896e-06, "loss": 0.2894, "step": 1631 }, { "epoch": 0.46, "grad_norm": 2.4771653111875067, "learning_rate": 1.1694286499203147e-06, "loss": 0.3024, "step": 1632 }, { "epoch": 0.46, "grad_norm": 2.492661747284556, "learning_rate": 1.1685240584031067e-06, "loss": 0.2502, "step": 1633 }, { "epoch": 0.46, "grad_norm": 2.422281617887777, "learning_rate": 1.1676193249318358e-06, "loss": 0.2729, "step": 1634 }, { "epoch": 0.46, "grad_norm": 2.2196613564768586, "learning_rate": 1.166714450268593e-06, "loss": 0.261, "step": 1635 }, { "epoch": 0.46, "grad_norm": 2.410642664529523, "learning_rate": 1.165809435175588e-06, "loss": 0.2686, "step": 1636 }, { "epoch": 0.46, "grad_norm": 2.4144309428272397, "learning_rate": 1.164904280415148e-06, "loss": 0.2752, "step": 1637 }, { "epoch": 0.46, "grad_norm": 3.6694333385746716, "learning_rate": 1.163998986749719e-06, "loss": 0.2774, "step": 1638 }, { "epoch": 0.46, "grad_norm": 2.304174111118199, "learning_rate": 1.1630935549418626e-06, "loss": 0.2836, "step": 1639 }, { "epoch": 0.46, "grad_norm": 2.4381870026370938, "learning_rate": 1.1621879857542585e-06, "loss": 0.281, "step": 1640 }, { "epoch": 0.46, "grad_norm": 3.435408423357395, "learning_rate": 1.1612822799497005e-06, "loss": 0.2797, "step": 1641 }, { "epoch": 0.47, "grad_norm": 2.1859105113726507, "learning_rate": 1.1603764382910988e-06, "loss": 0.2905, "step": 1642 }, { "epoch": 0.47, "grad_norm": 2.478183699921075, "learning_rate": 1.1594704615414768e-06, "loss": 0.2683, "step": 1643 }, { "epoch": 0.47, "grad_norm": 2.3890470350734123, "learning_rate": 1.1585643504639725e-06, "loss": 0.2967, "step": 1644 }, { "epoch": 0.47, "grad_norm": 2.5253388824450242, "learning_rate": 1.1576581058218372e-06, "loss": 0.3048, "step": 1645 }, { "epoch": 0.47, "grad_norm": 2.295525160535059, "learning_rate": 1.1567517283784343e-06, "loss": 0.2689, "step": 1646 }, { "epoch": 0.47, "grad_norm": 2.4418315746970265, "learning_rate": 1.1558452188972384e-06, "loss": 0.2702, "step": 1647 }, { "epoch": 0.47, "grad_norm": 2.4257726301731095, "learning_rate": 1.154938578141837e-06, "loss": 0.2566, "step": 1648 }, { "epoch": 0.47, "grad_norm": 2.4880540461286063, "learning_rate": 1.1540318068759268e-06, "loss": 0.2707, "step": 1649 }, { "epoch": 0.47, "grad_norm": 2.180860676840754, "learning_rate": 1.1531249058633147e-06, "loss": 0.2975, "step": 1650 }, { "epoch": 0.47, "grad_norm": 2.3279730202534044, "learning_rate": 1.152217875867917e-06, "loss": 0.3109, "step": 1651 }, { "epoch": 0.47, "grad_norm": 2.2238195956771283, "learning_rate": 1.151310717653759e-06, "loss": 0.2528, "step": 1652 }, { "epoch": 0.47, "grad_norm": 2.443841308625575, "learning_rate": 1.150403431984974e-06, "loss": 0.3063, "step": 1653 }, { "epoch": 0.47, "grad_norm": 2.6189577221458014, "learning_rate": 1.1494960196258015e-06, "loss": 0.2925, "step": 1654 }, { "epoch": 0.47, "grad_norm": 2.382589203135882, "learning_rate": 1.1485884813405891e-06, "loss": 0.3003, "step": 1655 }, { "epoch": 0.47, "grad_norm": 2.4163619522350466, "learning_rate": 1.1476808178937898e-06, "loss": 0.3021, "step": 1656 }, { "epoch": 0.47, "grad_norm": 2.801855906017032, "learning_rate": 1.1467730300499624e-06, "loss": 0.2966, "step": 1657 }, { "epoch": 0.47, "grad_norm": 2.072560421336879, "learning_rate": 1.1458651185737702e-06, "loss": 0.2517, "step": 1658 }, { "epoch": 0.47, "grad_norm": 2.3776984976754605, "learning_rate": 1.1449570842299803e-06, "loss": 0.2902, "step": 1659 }, { "epoch": 0.47, "grad_norm": 2.272720150972345, "learning_rate": 1.1440489277834645e-06, "loss": 0.2967, "step": 1660 }, { "epoch": 0.47, "grad_norm": 3.1113527639819614, "learning_rate": 1.1431406499991953e-06, "loss": 0.2833, "step": 1661 }, { "epoch": 0.47, "grad_norm": 2.124494330550425, "learning_rate": 1.1422322516422505e-06, "loss": 0.2549, "step": 1662 }, { "epoch": 0.47, "grad_norm": 2.2187918477352206, "learning_rate": 1.1413237334778064e-06, "loss": 0.2724, "step": 1663 }, { "epoch": 0.47, "grad_norm": 2.338783727753721, "learning_rate": 1.1404150962711416e-06, "loss": 0.2768, "step": 1664 }, { "epoch": 0.47, "grad_norm": 2.268969273911241, "learning_rate": 1.1395063407876358e-06, "loss": 0.307, "step": 1665 }, { "epoch": 0.47, "grad_norm": 2.3939662990886927, "learning_rate": 1.1385974677927665e-06, "loss": 0.2903, "step": 1666 }, { "epoch": 0.47, "grad_norm": 2.3441718507444174, "learning_rate": 1.1376884780521116e-06, "loss": 0.2631, "step": 1667 }, { "epoch": 0.47, "grad_norm": 2.5386815434886256, "learning_rate": 1.1367793723313468e-06, "loss": 0.3002, "step": 1668 }, { "epoch": 0.47, "grad_norm": 2.9721204904382397, "learning_rate": 1.1358701513962454e-06, "loss": 0.2851, "step": 1669 }, { "epoch": 0.47, "grad_norm": 2.961021970694507, "learning_rate": 1.1349608160126783e-06, "loss": 0.3089, "step": 1670 }, { "epoch": 0.47, "grad_norm": 2.281055601400845, "learning_rate": 1.1340513669466119e-06, "loss": 0.2629, "step": 1671 }, { "epoch": 0.47, "grad_norm": 2.3781198642027723, "learning_rate": 1.133141804964109e-06, "loss": 0.3255, "step": 1672 }, { "epoch": 0.47, "grad_norm": 2.2183111482419444, "learning_rate": 1.1322321308313277e-06, "loss": 0.2706, "step": 1673 }, { "epoch": 0.47, "grad_norm": 2.251887744888779, "learning_rate": 1.13132234531452e-06, "loss": 0.2522, "step": 1674 }, { "epoch": 0.47, "grad_norm": 2.3499624185539907, "learning_rate": 1.130412449180032e-06, "loss": 0.306, "step": 1675 }, { "epoch": 0.47, "grad_norm": 2.428096073028077, "learning_rate": 1.1295024431943028e-06, "loss": 0.2778, "step": 1676 }, { "epoch": 0.48, "grad_norm": 2.2110536927365874, "learning_rate": 1.1285923281238646e-06, "loss": 0.2743, "step": 1677 }, { "epoch": 0.48, "grad_norm": 2.634455891084864, "learning_rate": 1.1276821047353401e-06, "loss": 0.3085, "step": 1678 }, { "epoch": 0.48, "grad_norm": 2.8355465814347784, "learning_rate": 1.1267717737954458e-06, "loss": 0.2992, "step": 1679 }, { "epoch": 0.48, "grad_norm": 2.3347658200547374, "learning_rate": 1.1258613360709858e-06, "loss": 0.2813, "step": 1680 }, { "epoch": 0.48, "grad_norm": 2.491899736342909, "learning_rate": 1.1249507923288561e-06, "loss": 0.2773, "step": 1681 }, { "epoch": 0.48, "grad_norm": 2.4424959895263525, "learning_rate": 1.1240401433360417e-06, "loss": 0.2641, "step": 1682 }, { "epoch": 0.48, "grad_norm": 2.425874752306347, "learning_rate": 1.1231293898596153e-06, "loss": 0.2927, "step": 1683 }, { "epoch": 0.48, "grad_norm": 2.373837340184451, "learning_rate": 1.1222185326667387e-06, "loss": 0.2625, "step": 1684 }, { "epoch": 0.48, "grad_norm": 2.329312562974794, "learning_rate": 1.121307572524661e-06, "loss": 0.2582, "step": 1685 }, { "epoch": 0.48, "grad_norm": 2.3640722972664494, "learning_rate": 1.1203965102007173e-06, "loss": 0.2603, "step": 1686 }, { "epoch": 0.48, "grad_norm": 2.396425754577803, "learning_rate": 1.1194853464623293e-06, "loss": 0.2633, "step": 1687 }, { "epoch": 0.48, "grad_norm": 2.381954406906367, "learning_rate": 1.118574082077004e-06, "loss": 0.2834, "step": 1688 }, { "epoch": 0.48, "grad_norm": 2.3566730696052085, "learning_rate": 1.117662717812333e-06, "loss": 0.2863, "step": 1689 }, { "epoch": 0.48, "grad_norm": 2.569134886147644, "learning_rate": 1.1167512544359927e-06, "loss": 0.3116, "step": 1690 }, { "epoch": 0.48, "grad_norm": 2.2502650418462724, "learning_rate": 1.115839692715742e-06, "loss": 0.2557, "step": 1691 }, { "epoch": 0.48, "grad_norm": 2.124319603820594, "learning_rate": 1.1149280334194235e-06, "loss": 0.2509, "step": 1692 }, { "epoch": 0.48, "grad_norm": 2.3197689998035793, "learning_rate": 1.114016277314961e-06, "loss": 0.2843, "step": 1693 }, { "epoch": 0.48, "grad_norm": 2.3081780964531617, "learning_rate": 1.1131044251703615e-06, "loss": 0.2712, "step": 1694 }, { "epoch": 0.48, "grad_norm": 2.6681397012058037, "learning_rate": 1.1121924777537107e-06, "loss": 0.2887, "step": 1695 }, { "epoch": 0.48, "grad_norm": 2.570603638430538, "learning_rate": 1.1112804358331765e-06, "loss": 0.2973, "step": 1696 }, { "epoch": 0.48, "grad_norm": 2.3482739754496023, "learning_rate": 1.1103683001770055e-06, "loss": 0.3073, "step": 1697 }, { "epoch": 0.48, "grad_norm": 2.5409530486963754, "learning_rate": 1.109456071553523e-06, "loss": 0.2721, "step": 1698 }, { "epoch": 0.48, "grad_norm": 2.515926218025391, "learning_rate": 1.1085437507311338e-06, "loss": 0.3055, "step": 1699 }, { "epoch": 0.48, "grad_norm": 2.297386792787777, "learning_rate": 1.1076313384783182e-06, "loss": 0.2715, "step": 1700 }, { "epoch": 0.48, "grad_norm": 2.2796020266024217, "learning_rate": 1.1067188355636366e-06, "loss": 0.2703, "step": 1701 }, { "epoch": 0.48, "grad_norm": 2.324970919155172, "learning_rate": 1.1058062427557228e-06, "loss": 0.2629, "step": 1702 }, { "epoch": 0.48, "grad_norm": 2.832132949007624, "learning_rate": 1.1048935608232878e-06, "loss": 0.3345, "step": 1703 }, { "epoch": 0.48, "grad_norm": 2.5880321537200777, "learning_rate": 1.1039807905351176e-06, "loss": 0.2845, "step": 1704 }, { "epoch": 0.48, "grad_norm": 2.444606820574314, "learning_rate": 1.1030679326600725e-06, "loss": 0.2943, "step": 1705 }, { "epoch": 0.48, "grad_norm": 2.216027659045018, "learning_rate": 1.1021549879670864e-06, "loss": 0.277, "step": 1706 }, { "epoch": 0.48, "grad_norm": 2.36616902049424, "learning_rate": 1.1012419572251663e-06, "loss": 0.2683, "step": 1707 }, { "epoch": 0.48, "grad_norm": 2.520596375978376, "learning_rate": 1.1003288412033923e-06, "loss": 0.2901, "step": 1708 }, { "epoch": 0.48, "grad_norm": 2.3720873886683127, "learning_rate": 1.0994156406709153e-06, "loss": 0.2916, "step": 1709 }, { "epoch": 0.48, "grad_norm": 2.550717392273682, "learning_rate": 1.0985023563969584e-06, "loss": 0.2813, "step": 1710 }, { "epoch": 0.48, "grad_norm": 2.2948039992926934, "learning_rate": 1.0975889891508147e-06, "loss": 0.2802, "step": 1711 }, { "epoch": 0.49, "grad_norm": 2.302247130619214, "learning_rate": 1.0966755397018472e-06, "loss": 0.2835, "step": 1712 }, { "epoch": 0.49, "grad_norm": 2.1751534784301647, "learning_rate": 1.0957620088194883e-06, "loss": 0.2757, "step": 1713 }, { "epoch": 0.49, "grad_norm": 2.4138394233404137, "learning_rate": 1.0948483972732395e-06, "loss": 0.2824, "step": 1714 }, { "epoch": 0.49, "grad_norm": 2.203928367452892, "learning_rate": 1.0939347058326681e-06, "loss": 0.2812, "step": 1715 }, { "epoch": 0.49, "grad_norm": 2.388362231374219, "learning_rate": 1.0930209352674123e-06, "loss": 0.3166, "step": 1716 }, { "epoch": 0.49, "grad_norm": 2.3369407790264196, "learning_rate": 1.0921070863471732e-06, "loss": 0.2883, "step": 1717 }, { "epoch": 0.49, "grad_norm": 2.463710226139802, "learning_rate": 1.0911931598417209e-06, "loss": 0.2929, "step": 1718 }, { "epoch": 0.49, "grad_norm": 2.5950571987671425, "learning_rate": 1.0902791565208886e-06, "loss": 0.2898, "step": 1719 }, { "epoch": 0.49, "grad_norm": 2.4638825288289783, "learning_rate": 1.0893650771545756e-06, "loss": 0.2853, "step": 1720 }, { "epoch": 0.49, "grad_norm": 2.4506770789310464, "learning_rate": 1.0884509225127451e-06, "loss": 0.3009, "step": 1721 }, { "epoch": 0.49, "grad_norm": 2.2545658493991563, "learning_rate": 1.0875366933654231e-06, "loss": 0.2552, "step": 1722 }, { "epoch": 0.49, "grad_norm": 2.568482399258855, "learning_rate": 1.0866223904826989e-06, "loss": 0.256, "step": 1723 }, { "epoch": 0.49, "grad_norm": 2.3860221317855297, "learning_rate": 1.0857080146347236e-06, "loss": 0.2599, "step": 1724 }, { "epoch": 0.49, "grad_norm": 2.324595881561982, "learning_rate": 1.0847935665917098e-06, "loss": 0.2739, "step": 1725 }, { "epoch": 0.49, "grad_norm": 2.8118642447032007, "learning_rate": 1.0838790471239311e-06, "loss": 0.3026, "step": 1726 }, { "epoch": 0.49, "grad_norm": 2.384500504570064, "learning_rate": 1.0829644570017211e-06, "loss": 0.2669, "step": 1727 }, { "epoch": 0.49, "grad_norm": 2.523962612237143, "learning_rate": 1.0820497969954731e-06, "loss": 0.295, "step": 1728 }, { "epoch": 0.49, "grad_norm": 2.281525461610133, "learning_rate": 1.0811350678756391e-06, "loss": 0.2788, "step": 1729 }, { "epoch": 0.49, "grad_norm": 2.2065386887892777, "learning_rate": 1.0802202704127292e-06, "loss": 0.2514, "step": 1730 }, { "epoch": 0.49, "grad_norm": 2.335004256597714, "learning_rate": 1.0793054053773117e-06, "loss": 0.2886, "step": 1731 }, { "epoch": 0.49, "grad_norm": 2.422474441053778, "learning_rate": 1.0783904735400102e-06, "loss": 0.2989, "step": 1732 }, { "epoch": 0.49, "grad_norm": 2.3667128250142393, "learning_rate": 1.0774754756715071e-06, "loss": 0.2674, "step": 1733 }, { "epoch": 0.49, "grad_norm": 2.4721285965550446, "learning_rate": 1.0765604125425381e-06, "loss": 0.275, "step": 1734 }, { "epoch": 0.49, "grad_norm": 2.6296653925230213, "learning_rate": 1.0756452849238953e-06, "loss": 0.2941, "step": 1735 }, { "epoch": 0.49, "grad_norm": 2.3439950039178763, "learning_rate": 1.0747300935864243e-06, "loss": 0.2858, "step": 1736 }, { "epoch": 0.49, "grad_norm": 2.417870316163827, "learning_rate": 1.0738148393010249e-06, "loss": 0.2902, "step": 1737 }, { "epoch": 0.49, "grad_norm": 2.3262702827912864, "learning_rate": 1.0728995228386495e-06, "loss": 0.275, "step": 1738 }, { "epoch": 0.49, "grad_norm": 2.58841745004525, "learning_rate": 1.0719841449703033e-06, "loss": 0.2811, "step": 1739 }, { "epoch": 0.49, "grad_norm": 2.380777299893438, "learning_rate": 1.071068706467043e-06, "loss": 0.2861, "step": 1740 }, { "epoch": 0.49, "grad_norm": 2.4556940889456893, "learning_rate": 1.070153208099976e-06, "loss": 0.3134, "step": 1741 }, { "epoch": 0.49, "grad_norm": 2.6412055647062096, "learning_rate": 1.0692376506402613e-06, "loss": 0.3369, "step": 1742 }, { "epoch": 0.49, "grad_norm": 2.560785153064596, "learning_rate": 1.068322034859106e-06, "loss": 0.29, "step": 1743 }, { "epoch": 0.49, "grad_norm": 2.531656496883173, "learning_rate": 1.067406361527768e-06, "loss": 0.2713, "step": 1744 }, { "epoch": 0.49, "grad_norm": 2.5381548769556077, "learning_rate": 1.0664906314175524e-06, "loss": 0.305, "step": 1745 }, { "epoch": 0.49, "grad_norm": 2.5206232944748304, "learning_rate": 1.0655748452998127e-06, "loss": 0.2925, "step": 1746 }, { "epoch": 0.5, "grad_norm": 2.592877659877174, "learning_rate": 1.0646590039459499e-06, "loss": 0.3254, "step": 1747 }, { "epoch": 0.5, "grad_norm": 2.415535301875715, "learning_rate": 1.0637431081274107e-06, "loss": 0.2762, "step": 1748 }, { "epoch": 0.5, "grad_norm": 2.450107923002262, "learning_rate": 1.0628271586156878e-06, "loss": 0.2704, "step": 1749 }, { "epoch": 0.5, "grad_norm": 2.414014331879564, "learning_rate": 1.0619111561823206e-06, "loss": 0.2876, "step": 1750 }, { "epoch": 0.5, "grad_norm": 2.3011402092648536, "learning_rate": 1.0609951015988904e-06, "loss": 0.2916, "step": 1751 }, { "epoch": 0.5, "grad_norm": 2.296789517012798, "learning_rate": 1.0600789956370253e-06, "loss": 0.285, "step": 1752 }, { "epoch": 0.5, "grad_norm": 2.1437783622474975, "learning_rate": 1.0591628390683945e-06, "loss": 0.259, "step": 1753 }, { "epoch": 0.5, "grad_norm": 2.396711139395835, "learning_rate": 1.0582466326647109e-06, "loss": 0.2865, "step": 1754 }, { "epoch": 0.5, "grad_norm": 2.411830911114145, "learning_rate": 1.0573303771977288e-06, "loss": 0.2833, "step": 1755 }, { "epoch": 0.5, "grad_norm": 2.40099593001122, "learning_rate": 1.0564140734392445e-06, "loss": 0.2596, "step": 1756 }, { "epoch": 0.5, "grad_norm": 2.3296913537453428, "learning_rate": 1.0554977221610948e-06, "loss": 0.2708, "step": 1757 }, { "epoch": 0.5, "grad_norm": 2.536138383027427, "learning_rate": 1.0545813241351558e-06, "loss": 0.2789, "step": 1758 }, { "epoch": 0.5, "grad_norm": 2.3747634534579163, "learning_rate": 1.053664880133344e-06, "loss": 0.2982, "step": 1759 }, { "epoch": 0.5, "grad_norm": 2.5618665119525854, "learning_rate": 1.0527483909276142e-06, "loss": 0.317, "step": 1760 }, { "epoch": 0.5, "grad_norm": 2.4399883429176024, "learning_rate": 1.051831857289959e-06, "loss": 0.2937, "step": 1761 }, { "epoch": 0.5, "grad_norm": 2.466100915953789, "learning_rate": 1.0509152799924084e-06, "loss": 0.3105, "step": 1762 }, { "epoch": 0.5, "grad_norm": 2.2869814441583154, "learning_rate": 1.0499986598070301e-06, "loss": 0.2699, "step": 1763 }, { "epoch": 0.5, "grad_norm": 2.3649098477306936, "learning_rate": 1.0490819975059267e-06, "loss": 0.2624, "step": 1764 }, { "epoch": 0.5, "grad_norm": 2.2203716949643275, "learning_rate": 1.0481652938612372e-06, "loss": 0.2749, "step": 1765 }, { "epoch": 0.5, "grad_norm": 2.218764342181374, "learning_rate": 1.0472485496451347e-06, "loss": 0.2587, "step": 1766 }, { "epoch": 0.5, "grad_norm": 2.1979106434585445, "learning_rate": 1.0463317656298272e-06, "loss": 0.2564, "step": 1767 }, { "epoch": 0.5, "grad_norm": 2.598649617483703, "learning_rate": 1.0454149425875558e-06, "loss": 0.307, "step": 1768 }, { "epoch": 0.5, "grad_norm": 2.4256848758871676, "learning_rate": 1.0444980812905944e-06, "loss": 0.2598, "step": 1769 }, { "epoch": 0.5, "grad_norm": 2.799918173511259, "learning_rate": 1.0435811825112496e-06, "loss": 0.296, "step": 1770 }, { "epoch": 0.5, "grad_norm": 2.825152276633744, "learning_rate": 1.0426642470218585e-06, "loss": 0.2572, "step": 1771 }, { "epoch": 0.5, "grad_norm": 2.2875491695406134, "learning_rate": 1.0417472755947908e-06, "loss": 0.2697, "step": 1772 }, { "epoch": 0.5, "grad_norm": 2.338928267426616, "learning_rate": 1.0408302690024446e-06, "loss": 0.2905, "step": 1773 }, { "epoch": 0.5, "grad_norm": 2.377571283990241, "learning_rate": 1.0399132280172493e-06, "loss": 0.2888, "step": 1774 }, { "epoch": 0.5, "grad_norm": 2.494627386822755, "learning_rate": 1.038996153411662e-06, "loss": 0.3092, "step": 1775 }, { "epoch": 0.5, "grad_norm": 2.4803657304664886, "learning_rate": 1.0380790459581694e-06, "loss": 0.2933, "step": 1776 }, { "epoch": 0.5, "grad_norm": 2.599391249742488, "learning_rate": 1.0371619064292842e-06, "loss": 0.2987, "step": 1777 }, { "epoch": 0.5, "grad_norm": 2.4867542348412894, "learning_rate": 1.0362447355975475e-06, "loss": 0.2618, "step": 1778 }, { "epoch": 0.5, "grad_norm": 2.345277800331825, "learning_rate": 1.0353275342355262e-06, "loss": 0.2586, "step": 1779 }, { "epoch": 0.5, "grad_norm": 2.2953003810671726, "learning_rate": 1.034410303115813e-06, "loss": 0.2999, "step": 1780 }, { "epoch": 0.5, "grad_norm": 2.40069451279197, "learning_rate": 1.0334930430110256e-06, "loss": 0.2897, "step": 1781 }, { "epoch": 0.5, "grad_norm": 2.4034742727959304, "learning_rate": 1.0325757546938066e-06, "loss": 0.3252, "step": 1782 }, { "epoch": 0.51, "grad_norm": 2.6249771671947753, "learning_rate": 1.0316584389368212e-06, "loss": 0.2917, "step": 1783 }, { "epoch": 0.51, "grad_norm": 2.521429682335649, "learning_rate": 1.0307410965127594e-06, "loss": 0.266, "step": 1784 }, { "epoch": 0.51, "grad_norm": 2.4288395224203296, "learning_rate": 1.029823728194332e-06, "loss": 0.2769, "step": 1785 }, { "epoch": 0.51, "grad_norm": 2.278568446029088, "learning_rate": 1.0289063347542726e-06, "loss": 0.2921, "step": 1786 }, { "epoch": 0.51, "grad_norm": 2.3818620321327395, "learning_rate": 1.0279889169653359e-06, "loss": 0.2805, "step": 1787 }, { "epoch": 0.51, "grad_norm": 2.438023465912851, "learning_rate": 1.0270714756002965e-06, "loss": 0.3057, "step": 1788 }, { "epoch": 0.51, "grad_norm": 2.2677418993032314, "learning_rate": 1.0261540114319497e-06, "loss": 0.3, "step": 1789 }, { "epoch": 0.51, "grad_norm": 2.204566597722765, "learning_rate": 1.0252365252331092e-06, "loss": 0.2801, "step": 1790 }, { "epoch": 0.51, "grad_norm": 2.4901371783556514, "learning_rate": 1.0243190177766084e-06, "loss": 0.2966, "step": 1791 }, { "epoch": 0.51, "grad_norm": 2.3665442696463894, "learning_rate": 1.0234014898352965e-06, "loss": 0.2915, "step": 1792 }, { "epoch": 0.51, "grad_norm": 2.3177504690621924, "learning_rate": 1.0224839421820426e-06, "loss": 0.2512, "step": 1793 }, { "epoch": 0.51, "grad_norm": 2.423196847728315, "learning_rate": 1.0215663755897306e-06, "loss": 0.2942, "step": 1794 }, { "epoch": 0.51, "grad_norm": 2.3565389077019434, "learning_rate": 1.0206487908312607e-06, "loss": 0.2896, "step": 1795 }, { "epoch": 0.51, "grad_norm": 2.5055170626548535, "learning_rate": 1.0197311886795485e-06, "loss": 0.2973, "step": 1796 }, { "epoch": 0.51, "grad_norm": 2.303580120594449, "learning_rate": 1.018813569907525e-06, "loss": 0.2792, "step": 1797 }, { "epoch": 0.51, "grad_norm": 2.2318800840205935, "learning_rate": 1.0178959352881335e-06, "loss": 0.2664, "step": 1798 }, { "epoch": 0.51, "grad_norm": 2.404582126143902, "learning_rate": 1.0169782855943326e-06, "loss": 0.28, "step": 1799 }, { "epoch": 0.51, "grad_norm": 2.3223672277058265, "learning_rate": 1.016060621599092e-06, "loss": 0.2784, "step": 1800 }, { "epoch": 0.51, "grad_norm": 2.211504993025335, "learning_rate": 1.0151429440753948e-06, "loss": 0.2583, "step": 1801 }, { "epoch": 0.51, "grad_norm": 2.1652312860171645, "learning_rate": 1.0142252537962338e-06, "loss": 0.2483, "step": 1802 }, { "epoch": 0.51, "grad_norm": 2.5690040507347525, "learning_rate": 1.0133075515346147e-06, "loss": 0.3247, "step": 1803 }, { "epoch": 0.51, "grad_norm": 2.3512850501355937, "learning_rate": 1.0123898380635514e-06, "loss": 0.2846, "step": 1804 }, { "epoch": 0.51, "grad_norm": 2.5307997781444462, "learning_rate": 1.0114721141560678e-06, "loss": 0.3021, "step": 1805 }, { "epoch": 0.51, "grad_norm": 2.3755600706842928, "learning_rate": 1.0105543805851975e-06, "loss": 0.2707, "step": 1806 }, { "epoch": 0.51, "grad_norm": 2.510828516893168, "learning_rate": 1.0096366381239806e-06, "loss": 0.2939, "step": 1807 }, { "epoch": 0.51, "grad_norm": 2.2898278280269504, "learning_rate": 1.0087188875454668e-06, "loss": 0.2451, "step": 1808 }, { "epoch": 0.51, "grad_norm": 2.449184623548092, "learning_rate": 1.0078011296227103e-06, "loss": 0.2667, "step": 1809 }, { "epoch": 0.51, "grad_norm": 2.4197885773422607, "learning_rate": 1.0068833651287733e-06, "loss": 0.2951, "step": 1810 }, { "epoch": 0.51, "grad_norm": 2.536061699800912, "learning_rate": 1.0059655948367228e-06, "loss": 0.2738, "step": 1811 }, { "epoch": 0.51, "grad_norm": 2.29823211800711, "learning_rate": 1.0050478195196302e-06, "loss": 0.3026, "step": 1812 }, { "epoch": 0.51, "grad_norm": 2.131609519143086, "learning_rate": 1.0041300399505724e-06, "loss": 0.2666, "step": 1813 }, { "epoch": 0.51, "grad_norm": 2.2837693911189096, "learning_rate": 1.0032122569026281e-06, "loss": 0.2944, "step": 1814 }, { "epoch": 0.51, "grad_norm": 2.382827289671159, "learning_rate": 1.0022944711488816e-06, "loss": 0.2907, "step": 1815 }, { "epoch": 0.51, "grad_norm": 2.3398481320326656, "learning_rate": 1.0013766834624167e-06, "loss": 0.2928, "step": 1816 }, { "epoch": 0.51, "grad_norm": 2.3567810281368526, "learning_rate": 1.0004588946163202e-06, "loss": 0.278, "step": 1817 }, { "epoch": 0.52, "grad_norm": 2.3774697747428126, "learning_rate": 9.995411053836797e-07, "loss": 0.317, "step": 1818 }, { "epoch": 0.52, "grad_norm": 2.5589790947949296, "learning_rate": 9.986233165375836e-07, "loss": 0.2779, "step": 1819 }, { "epoch": 0.52, "grad_norm": 2.408172535398655, "learning_rate": 9.977055288511181e-07, "loss": 0.2812, "step": 1820 }, { "epoch": 0.52, "grad_norm": 3.1184684834613012, "learning_rate": 9.967877430973716e-07, "loss": 0.296, "step": 1821 }, { "epoch": 0.52, "grad_norm": 2.3555209555113175, "learning_rate": 9.958699600494277e-07, "loss": 0.3002, "step": 1822 }, { "epoch": 0.52, "grad_norm": 2.2792496340161654, "learning_rate": 9.949521804803697e-07, "loss": 0.2675, "step": 1823 }, { "epoch": 0.52, "grad_norm": 2.444601373370277, "learning_rate": 9.940344051632776e-07, "loss": 0.2925, "step": 1824 }, { "epoch": 0.52, "grad_norm": 2.4852374720528982, "learning_rate": 9.931166348712266e-07, "loss": 0.2627, "step": 1825 }, { "epoch": 0.52, "grad_norm": 2.7780433494306975, "learning_rate": 9.921988703772896e-07, "loss": 0.3006, "step": 1826 }, { "epoch": 0.52, "grad_norm": 2.446401074333341, "learning_rate": 9.912811124545332e-07, "loss": 0.2667, "step": 1827 }, { "epoch": 0.52, "grad_norm": 2.3409932053933957, "learning_rate": 9.903633618760193e-07, "loss": 0.2746, "step": 1828 }, { "epoch": 0.52, "grad_norm": 2.407708241012865, "learning_rate": 9.894456194148028e-07, "loss": 0.2602, "step": 1829 }, { "epoch": 0.52, "grad_norm": 2.22125022141639, "learning_rate": 9.885278858439321e-07, "loss": 0.2698, "step": 1830 }, { "epoch": 0.52, "grad_norm": 2.6680271799964284, "learning_rate": 9.876101619364487e-07, "loss": 0.2795, "step": 1831 }, { "epoch": 0.52, "grad_norm": 2.4811890516289075, "learning_rate": 9.866924484653855e-07, "loss": 0.2979, "step": 1832 }, { "epoch": 0.52, "grad_norm": 2.2735370919447107, "learning_rate": 9.85774746203766e-07, "loss": 0.2601, "step": 1833 }, { "epoch": 0.52, "grad_norm": 2.3429662880966524, "learning_rate": 9.848570559246053e-07, "loss": 0.2813, "step": 1834 }, { "epoch": 0.52, "grad_norm": 2.449821577681452, "learning_rate": 9.839393784009076e-07, "loss": 0.2968, "step": 1835 }, { "epoch": 0.52, "grad_norm": 2.302224969895475, "learning_rate": 9.830217144056673e-07, "loss": 0.2868, "step": 1836 }, { "epoch": 0.52, "grad_norm": 3.6831658109868886, "learning_rate": 9.821040647118664e-07, "loss": 0.2874, "step": 1837 }, { "epoch": 0.52, "grad_norm": 2.3434640511937035, "learning_rate": 9.811864300924752e-07, "loss": 0.2616, "step": 1838 }, { "epoch": 0.52, "grad_norm": 2.337482263937847, "learning_rate": 9.802688113204516e-07, "loss": 0.2761, "step": 1839 }, { "epoch": 0.52, "grad_norm": 3.089170981209531, "learning_rate": 9.793512091687394e-07, "loss": 0.2862, "step": 1840 }, { "epoch": 0.52, "grad_norm": 2.4252580066175775, "learning_rate": 9.784336244102695e-07, "loss": 0.3009, "step": 1841 }, { "epoch": 0.52, "grad_norm": 2.259680761735452, "learning_rate": 9.775160578179573e-07, "loss": 0.297, "step": 1842 }, { "epoch": 0.52, "grad_norm": 2.325015796073353, "learning_rate": 9.765985101647034e-07, "loss": 0.3074, "step": 1843 }, { "epoch": 0.52, "grad_norm": 2.298833983394814, "learning_rate": 9.75680982223392e-07, "loss": 0.2627, "step": 1844 }, { "epoch": 0.52, "grad_norm": 2.5713981868206135, "learning_rate": 9.747634747668905e-07, "loss": 0.2899, "step": 1845 }, { "epoch": 0.52, "grad_norm": 2.419747640232926, "learning_rate": 9.738459885680502e-07, "loss": 0.283, "step": 1846 }, { "epoch": 0.52, "grad_norm": 2.355604327545243, "learning_rate": 9.729285243997036e-07, "loss": 0.2893, "step": 1847 }, { "epoch": 0.52, "grad_norm": 2.8604074410743534, "learning_rate": 9.720110830346642e-07, "loss": 0.2752, "step": 1848 }, { "epoch": 0.52, "grad_norm": 2.351280307326712, "learning_rate": 9.710936652457275e-07, "loss": 0.2744, "step": 1849 }, { "epoch": 0.52, "grad_norm": 2.392326574811654, "learning_rate": 9.70176271805668e-07, "loss": 0.2886, "step": 1850 }, { "epoch": 0.52, "grad_norm": 2.4560054450403612, "learning_rate": 9.692589034872408e-07, "loss": 0.2949, "step": 1851 }, { "epoch": 0.52, "grad_norm": 2.333345087894509, "learning_rate": 9.683415610631787e-07, "loss": 0.3006, "step": 1852 }, { "epoch": 0.53, "grad_norm": 2.384442083807228, "learning_rate": 9.674242453061935e-07, "loss": 0.2591, "step": 1853 }, { "epoch": 0.53, "grad_norm": 2.3865601767906246, "learning_rate": 9.66506956988974e-07, "loss": 0.2539, "step": 1854 }, { "epoch": 0.53, "grad_norm": 2.3401087584534266, "learning_rate": 9.655896968841872e-07, "loss": 0.2816, "step": 1855 }, { "epoch": 0.53, "grad_norm": 2.4510382444906633, "learning_rate": 9.64672465764474e-07, "loss": 0.2776, "step": 1856 }, { "epoch": 0.53, "grad_norm": 2.6222410730947283, "learning_rate": 9.637552644024526e-07, "loss": 0.2724, "step": 1857 }, { "epoch": 0.53, "grad_norm": 2.0489271774956777, "learning_rate": 9.62838093570716e-07, "loss": 0.2194, "step": 1858 }, { "epoch": 0.53, "grad_norm": 2.4784055906636544, "learning_rate": 9.619209540418306e-07, "loss": 0.2921, "step": 1859 }, { "epoch": 0.53, "grad_norm": 2.1713605870866304, "learning_rate": 9.610038465883376e-07, "loss": 0.2496, "step": 1860 }, { "epoch": 0.53, "grad_norm": 2.39693101648829, "learning_rate": 9.600867719827506e-07, "loss": 0.2992, "step": 1861 }, { "epoch": 0.53, "grad_norm": 2.229116705886729, "learning_rate": 9.591697309975555e-07, "loss": 0.2797, "step": 1862 }, { "epoch": 0.53, "grad_norm": 2.4326025652791423, "learning_rate": 9.582527244052094e-07, "loss": 0.2756, "step": 1863 }, { "epoch": 0.53, "grad_norm": 2.3303058887008707, "learning_rate": 9.573357529781414e-07, "loss": 0.2905, "step": 1864 }, { "epoch": 0.53, "grad_norm": 2.234063548100955, "learning_rate": 9.564188174887503e-07, "loss": 0.266, "step": 1865 }, { "epoch": 0.53, "grad_norm": 2.2828800276675207, "learning_rate": 9.555019187094057e-07, "loss": 0.2989, "step": 1866 }, { "epoch": 0.53, "grad_norm": 2.6430152997452225, "learning_rate": 9.545850574124443e-07, "loss": 0.2702, "step": 1867 }, { "epoch": 0.53, "grad_norm": 2.4975260565196784, "learning_rate": 9.536682343701728e-07, "loss": 0.29, "step": 1868 }, { "epoch": 0.53, "grad_norm": 2.2169356324072207, "learning_rate": 9.527514503548651e-07, "loss": 0.2595, "step": 1869 }, { "epoch": 0.53, "grad_norm": 2.4171320151902154, "learning_rate": 9.518347061387627e-07, "loss": 0.288, "step": 1870 }, { "epoch": 0.53, "grad_norm": 2.4017512468814295, "learning_rate": 9.509180024940734e-07, "loss": 0.3166, "step": 1871 }, { "epoch": 0.53, "grad_norm": 2.417658702684582, "learning_rate": 9.500013401929701e-07, "loss": 0.2598, "step": 1872 }, { "epoch": 0.53, "grad_norm": 2.4955418694217633, "learning_rate": 9.490847200075917e-07, "loss": 0.3007, "step": 1873 }, { "epoch": 0.53, "grad_norm": 2.407401323867513, "learning_rate": 9.48168142710041e-07, "loss": 0.2828, "step": 1874 }, { "epoch": 0.53, "grad_norm": 2.3317221443277116, "learning_rate": 9.472516090723859e-07, "loss": 0.268, "step": 1875 }, { "epoch": 0.53, "grad_norm": 2.633088826167416, "learning_rate": 9.463351198666559e-07, "loss": 0.3279, "step": 1876 }, { "epoch": 0.53, "grad_norm": 2.4694512144741725, "learning_rate": 9.454186758648443e-07, "loss": 0.3348, "step": 1877 }, { "epoch": 0.53, "grad_norm": 2.427311040283762, "learning_rate": 9.445022778389056e-07, "loss": 0.2883, "step": 1878 }, { "epoch": 0.53, "grad_norm": 2.254665154293914, "learning_rate": 9.435859265607554e-07, "loss": 0.2615, "step": 1879 }, { "epoch": 0.53, "grad_norm": 2.4736029632574907, "learning_rate": 9.426696228022713e-07, "loss": 0.2653, "step": 1880 }, { "epoch": 0.53, "grad_norm": 2.358484562573563, "learning_rate": 9.417533673352893e-07, "loss": 0.2898, "step": 1881 }, { "epoch": 0.53, "grad_norm": 2.3562987141421656, "learning_rate": 9.408371609316058e-07, "loss": 0.2452, "step": 1882 }, { "epoch": 0.53, "grad_norm": 2.256556790697905, "learning_rate": 9.39921004362975e-07, "loss": 0.2823, "step": 1883 }, { "epoch": 0.53, "grad_norm": 2.5861784387547755, "learning_rate": 9.390048984011094e-07, "loss": 0.2502, "step": 1884 }, { "epoch": 0.53, "grad_norm": 2.5766242409891094, "learning_rate": 9.380888438176795e-07, "loss": 0.3105, "step": 1885 }, { "epoch": 0.53, "grad_norm": 2.4315420005305888, "learning_rate": 9.37172841384312e-07, "loss": 0.282, "step": 1886 }, { "epoch": 0.53, "grad_norm": 2.7588805558007934, "learning_rate": 9.362568918725895e-07, "loss": 0.277, "step": 1887 }, { "epoch": 0.53, "grad_norm": 2.420971784915736, "learning_rate": 9.353409960540505e-07, "loss": 0.3012, "step": 1888 }, { "epoch": 0.54, "grad_norm": 2.2747877360470765, "learning_rate": 9.344251547001871e-07, "loss": 0.2797, "step": 1889 }, { "epoch": 0.54, "grad_norm": 2.4181692064226716, "learning_rate": 9.335093685824476e-07, "loss": 0.2631, "step": 1890 }, { "epoch": 0.54, "grad_norm": 2.340474100192766, "learning_rate": 9.325936384722321e-07, "loss": 0.2648, "step": 1891 }, { "epoch": 0.54, "grad_norm": 2.4228559382516854, "learning_rate": 9.316779651408939e-07, "loss": 0.2925, "step": 1892 }, { "epoch": 0.54, "grad_norm": 2.523273040640516, "learning_rate": 9.307623493597387e-07, "loss": 0.2793, "step": 1893 }, { "epoch": 0.54, "grad_norm": 2.7145037184668444, "learning_rate": 9.29846791900024e-07, "loss": 0.315, "step": 1894 }, { "epoch": 0.54, "grad_norm": 2.4016932182830613, "learning_rate": 9.289312935329572e-07, "loss": 0.2689, "step": 1895 }, { "epoch": 0.54, "grad_norm": 2.5004609561755666, "learning_rate": 9.280158550296968e-07, "loss": 0.2841, "step": 1896 }, { "epoch": 0.54, "grad_norm": 2.2912353853003617, "learning_rate": 9.271004771613508e-07, "loss": 0.2632, "step": 1897 }, { "epoch": 0.54, "grad_norm": 2.490373208906252, "learning_rate": 9.261851606989753e-07, "loss": 0.2796, "step": 1898 }, { "epoch": 0.54, "grad_norm": 2.3708068137158524, "learning_rate": 9.252699064135758e-07, "loss": 0.2935, "step": 1899 }, { "epoch": 0.54, "grad_norm": 2.211291701138569, "learning_rate": 9.243547150761046e-07, "loss": 0.2753, "step": 1900 }, { "epoch": 0.54, "grad_norm": 2.310505963420102, "learning_rate": 9.23439587457462e-07, "loss": 0.2807, "step": 1901 }, { "epoch": 0.54, "grad_norm": 2.2820307173012457, "learning_rate": 9.22524524328493e-07, "loss": 0.2828, "step": 1902 }, { "epoch": 0.54, "grad_norm": 2.1476374326857575, "learning_rate": 9.216095264599894e-07, "loss": 0.2673, "step": 1903 }, { "epoch": 0.54, "grad_norm": 2.4323240408845486, "learning_rate": 9.206945946226883e-07, "loss": 0.29, "step": 1904 }, { "epoch": 0.54, "grad_norm": 2.6627412851874475, "learning_rate": 9.197797295872708e-07, "loss": 0.2769, "step": 1905 }, { "epoch": 0.54, "grad_norm": 2.530655011578067, "learning_rate": 9.188649321243609e-07, "loss": 0.3114, "step": 1906 }, { "epoch": 0.54, "grad_norm": 2.3937515846923914, "learning_rate": 9.179502030045269e-07, "loss": 0.2785, "step": 1907 }, { "epoch": 0.54, "grad_norm": 2.323366450374401, "learning_rate": 9.170355429982787e-07, "loss": 0.282, "step": 1908 }, { "epoch": 0.54, "grad_norm": 2.354831302609529, "learning_rate": 9.161209528760689e-07, "loss": 0.2577, "step": 1909 }, { "epoch": 0.54, "grad_norm": 2.71004074994221, "learning_rate": 9.152064334082903e-07, "loss": 0.2879, "step": 1910 }, { "epoch": 0.54, "grad_norm": 2.4742529942991918, "learning_rate": 9.142919853652765e-07, "loss": 0.2716, "step": 1911 }, { "epoch": 0.54, "grad_norm": 2.1048299871316023, "learning_rate": 9.133776095173013e-07, "loss": 0.2557, "step": 1912 }, { "epoch": 0.54, "grad_norm": 2.578248531071737, "learning_rate": 9.124633066345768e-07, "loss": 0.2945, "step": 1913 }, { "epoch": 0.54, "grad_norm": 2.916797832424238, "learning_rate": 9.115490774872549e-07, "loss": 0.2802, "step": 1914 }, { "epoch": 0.54, "grad_norm": 2.392195902194485, "learning_rate": 9.106349228454242e-07, "loss": 0.287, "step": 1915 }, { "epoch": 0.54, "grad_norm": 2.37893310147143, "learning_rate": 9.097208434791116e-07, "loss": 0.2834, "step": 1916 }, { "epoch": 0.54, "grad_norm": 2.287075599664048, "learning_rate": 9.088068401582795e-07, "loss": 0.2554, "step": 1917 }, { "epoch": 0.54, "grad_norm": 2.416346774146093, "learning_rate": 9.078929136528267e-07, "loss": 0.2581, "step": 1918 }, { "epoch": 0.54, "grad_norm": 2.353920820538036, "learning_rate": 9.069790647325878e-07, "loss": 0.2793, "step": 1919 }, { "epoch": 0.54, "grad_norm": 2.3798498329467104, "learning_rate": 9.060652941673317e-07, "loss": 0.2813, "step": 1920 }, { "epoch": 0.54, "grad_norm": 2.4914236732800843, "learning_rate": 9.05151602726761e-07, "loss": 0.3002, "step": 1921 }, { "epoch": 0.54, "grad_norm": 2.4316041066701413, "learning_rate": 9.042379911805116e-07, "loss": 0.2804, "step": 1922 }, { "epoch": 0.54, "grad_norm": 2.639715234179413, "learning_rate": 9.033244602981525e-07, "loss": 0.2862, "step": 1923 }, { "epoch": 0.55, "grad_norm": 2.241581435626847, "learning_rate": 9.024110108491853e-07, "loss": 0.2774, "step": 1924 }, { "epoch": 0.55, "grad_norm": 2.856997759615627, "learning_rate": 9.014976436030416e-07, "loss": 0.2824, "step": 1925 }, { "epoch": 0.55, "grad_norm": 2.2585837498129946, "learning_rate": 9.005843593290847e-07, "loss": 0.2812, "step": 1926 }, { "epoch": 0.55, "grad_norm": 2.6046519639500647, "learning_rate": 8.996711587966077e-07, "loss": 0.2876, "step": 1927 }, { "epoch": 0.55, "grad_norm": 2.4420328746980484, "learning_rate": 8.987580427748335e-07, "loss": 0.2839, "step": 1928 }, { "epoch": 0.55, "grad_norm": 2.3624950251203214, "learning_rate": 8.978450120329137e-07, "loss": 0.3064, "step": 1929 }, { "epoch": 0.55, "grad_norm": 2.3272518802728523, "learning_rate": 8.969320673399276e-07, "loss": 0.2786, "step": 1930 }, { "epoch": 0.55, "grad_norm": 2.2731877501718016, "learning_rate": 8.960192094648826e-07, "loss": 0.2765, "step": 1931 }, { "epoch": 0.55, "grad_norm": 2.419747659121121, "learning_rate": 8.951064391767119e-07, "loss": 0.2528, "step": 1932 }, { "epoch": 0.55, "grad_norm": 2.442953839847466, "learning_rate": 8.941937572442773e-07, "loss": 0.2231, "step": 1933 }, { "epoch": 0.55, "grad_norm": 2.387408496361725, "learning_rate": 8.932811644363635e-07, "loss": 0.2664, "step": 1934 }, { "epoch": 0.55, "grad_norm": 2.4922207588724437, "learning_rate": 8.923686615216816e-07, "loss": 0.321, "step": 1935 }, { "epoch": 0.55, "grad_norm": 2.4252013919135114, "learning_rate": 8.914562492688666e-07, "loss": 0.3129, "step": 1936 }, { "epoch": 0.55, "grad_norm": 2.1682085963308153, "learning_rate": 8.905439284464769e-07, "loss": 0.2873, "step": 1937 }, { "epoch": 0.55, "grad_norm": 2.416834129768717, "learning_rate": 8.896316998229946e-07, "loss": 0.2461, "step": 1938 }, { "epoch": 0.55, "grad_norm": 2.195335513474169, "learning_rate": 8.887195641668234e-07, "loss": 0.2644, "step": 1939 }, { "epoch": 0.55, "grad_norm": 2.2883878593039366, "learning_rate": 8.878075222462895e-07, "loss": 0.2704, "step": 1940 }, { "epoch": 0.55, "grad_norm": 2.787774022686844, "learning_rate": 8.86895574829639e-07, "loss": 0.2796, "step": 1941 }, { "epoch": 0.55, "grad_norm": 2.3051247213959707, "learning_rate": 8.859837226850388e-07, "loss": 0.2558, "step": 1942 }, { "epoch": 0.55, "grad_norm": 2.3063454413012825, "learning_rate": 8.850719665805766e-07, "loss": 0.2837, "step": 1943 }, { "epoch": 0.55, "grad_norm": 2.3273022288303054, "learning_rate": 8.841603072842581e-07, "loss": 0.2783, "step": 1944 }, { "epoch": 0.55, "grad_norm": 2.255821650288142, "learning_rate": 8.832487455640074e-07, "loss": 0.2783, "step": 1945 }, { "epoch": 0.55, "grad_norm": 2.4717382690009164, "learning_rate": 8.823372821876671e-07, "loss": 0.2977, "step": 1946 }, { "epoch": 0.55, "grad_norm": 2.257346400197328, "learning_rate": 8.814259179229959e-07, "loss": 0.2843, "step": 1947 }, { "epoch": 0.55, "grad_norm": 2.222491787939881, "learning_rate": 8.805146535376708e-07, "loss": 0.2756, "step": 1948 }, { "epoch": 0.55, "grad_norm": 2.7443046514917997, "learning_rate": 8.796034897992828e-07, "loss": 0.3251, "step": 1949 }, { "epoch": 0.55, "grad_norm": 2.3189270403797373, "learning_rate": 8.78692427475339e-07, "loss": 0.2681, "step": 1950 }, { "epoch": 0.55, "grad_norm": 2.319905410278452, "learning_rate": 8.777814673332614e-07, "loss": 0.2999, "step": 1951 }, { "epoch": 0.55, "grad_norm": 2.3315871825933216, "learning_rate": 8.768706101403847e-07, "loss": 0.2713, "step": 1952 }, { "epoch": 0.55, "grad_norm": 2.585690032496667, "learning_rate": 8.759598566639586e-07, "loss": 0.3387, "step": 1953 }, { "epoch": 0.55, "grad_norm": 2.2257813847959205, "learning_rate": 8.750492076711439e-07, "loss": 0.2633, "step": 1954 }, { "epoch": 0.55, "grad_norm": 2.3438552082983675, "learning_rate": 8.741386639290144e-07, "loss": 0.2721, "step": 1955 }, { "epoch": 0.55, "grad_norm": 2.381209291439057, "learning_rate": 8.732282262045545e-07, "loss": 0.294, "step": 1956 }, { "epoch": 0.55, "grad_norm": 2.383293998015045, "learning_rate": 8.723178952646595e-07, "loss": 0.2625, "step": 1957 }, { "epoch": 0.55, "grad_norm": 2.3888215246027644, "learning_rate": 8.714076718761355e-07, "loss": 0.2833, "step": 1958 }, { "epoch": 0.56, "grad_norm": 2.484679974607535, "learning_rate": 8.704975568056974e-07, "loss": 0.2762, "step": 1959 }, { "epoch": 0.56, "grad_norm": 2.7262266793563943, "learning_rate": 8.695875508199682e-07, "loss": 0.2734, "step": 1960 }, { "epoch": 0.56, "grad_norm": 2.4748011500503755, "learning_rate": 8.686776546854799e-07, "loss": 0.2844, "step": 1961 }, { "epoch": 0.56, "grad_norm": 2.4158474466530744, "learning_rate": 8.677678691686721e-07, "loss": 0.2933, "step": 1962 }, { "epoch": 0.56, "grad_norm": 2.365395676527773, "learning_rate": 8.668581950358909e-07, "loss": 0.2819, "step": 1963 }, { "epoch": 0.56, "grad_norm": 2.308616132611635, "learning_rate": 8.659486330533881e-07, "loss": 0.2717, "step": 1964 }, { "epoch": 0.56, "grad_norm": 2.1834999486117583, "learning_rate": 8.650391839873217e-07, "loss": 0.2787, "step": 1965 }, { "epoch": 0.56, "grad_norm": 2.461144597727408, "learning_rate": 8.641298486037543e-07, "loss": 0.3018, "step": 1966 }, { "epoch": 0.56, "grad_norm": 2.80762751416142, "learning_rate": 8.632206276686532e-07, "loss": 0.292, "step": 1967 }, { "epoch": 0.56, "grad_norm": 2.250135073102463, "learning_rate": 8.623115219478884e-07, "loss": 0.2788, "step": 1968 }, { "epoch": 0.56, "grad_norm": 2.248508050847324, "learning_rate": 8.614025322072336e-07, "loss": 0.2968, "step": 1969 }, { "epoch": 0.56, "grad_norm": 2.1565106717006195, "learning_rate": 8.604936592123646e-07, "loss": 0.2758, "step": 1970 }, { "epoch": 0.56, "grad_norm": 2.3513480691328814, "learning_rate": 8.595849037288581e-07, "loss": 0.25, "step": 1971 }, { "epoch": 0.56, "grad_norm": 2.564839835756159, "learning_rate": 8.586762665221938e-07, "loss": 0.2919, "step": 1972 }, { "epoch": 0.56, "grad_norm": 2.5031125040120625, "learning_rate": 8.577677483577496e-07, "loss": 0.3025, "step": 1973 }, { "epoch": 0.56, "grad_norm": 2.3757684711077345, "learning_rate": 8.568593500008046e-07, "loss": 0.2957, "step": 1974 }, { "epoch": 0.56, "grad_norm": 2.4814182838221437, "learning_rate": 8.559510722165359e-07, "loss": 0.296, "step": 1975 }, { "epoch": 0.56, "grad_norm": 2.1058424660903623, "learning_rate": 8.550429157700195e-07, "loss": 0.2587, "step": 1976 }, { "epoch": 0.56, "grad_norm": 2.3051111370602984, "learning_rate": 8.541348814262297e-07, "loss": 0.2773, "step": 1977 }, { "epoch": 0.56, "grad_norm": 2.379656401763318, "learning_rate": 8.532269699500376e-07, "loss": 0.278, "step": 1978 }, { "epoch": 0.56, "grad_norm": 2.128141672834981, "learning_rate": 8.523191821062101e-07, "loss": 0.2422, "step": 1979 }, { "epoch": 0.56, "grad_norm": 2.4154798555989068, "learning_rate": 8.51411518659411e-07, "loss": 0.3067, "step": 1980 }, { "epoch": 0.56, "grad_norm": 2.4147085149757053, "learning_rate": 8.505039803741985e-07, "loss": 0.285, "step": 1981 }, { "epoch": 0.56, "grad_norm": 2.260022512654864, "learning_rate": 8.49596568015026e-07, "loss": 0.2586, "step": 1982 }, { "epoch": 0.56, "grad_norm": 2.327455245309484, "learning_rate": 8.486892823462409e-07, "loss": 0.286, "step": 1983 }, { "epoch": 0.56, "grad_norm": 2.500033776529244, "learning_rate": 8.47782124132083e-07, "loss": 0.2992, "step": 1984 }, { "epoch": 0.56, "grad_norm": 2.444891884703191, "learning_rate": 8.468750941366858e-07, "loss": 0.2911, "step": 1985 }, { "epoch": 0.56, "grad_norm": 2.284667592281283, "learning_rate": 8.459681931240732e-07, "loss": 0.2759, "step": 1986 }, { "epoch": 0.56, "grad_norm": 2.4878491891950203, "learning_rate": 8.45061421858163e-07, "loss": 0.2881, "step": 1987 }, { "epoch": 0.56, "grad_norm": 2.3148743275286594, "learning_rate": 8.441547811027614e-07, "loss": 0.2717, "step": 1988 }, { "epoch": 0.56, "grad_norm": 2.246090838636193, "learning_rate": 8.432482716215661e-07, "loss": 0.2686, "step": 1989 }, { "epoch": 0.56, "grad_norm": 2.4017844104732218, "learning_rate": 8.423418941781628e-07, "loss": 0.2761, "step": 1990 }, { "epoch": 0.56, "grad_norm": 2.4218579296866674, "learning_rate": 8.414356495360273e-07, "loss": 0.3065, "step": 1991 }, { "epoch": 0.56, "grad_norm": 2.341440126862159, "learning_rate": 8.405295384585231e-07, "loss": 0.2606, "step": 1992 }, { "epoch": 0.56, "grad_norm": 2.292885993151981, "learning_rate": 8.396235617089012e-07, "loss": 0.2555, "step": 1993 }, { "epoch": 0.56, "grad_norm": 2.341814398483769, "learning_rate": 8.387177200502995e-07, "loss": 0.266, "step": 1994 }, { "epoch": 0.57, "grad_norm": 2.296872307085636, "learning_rate": 8.378120142457414e-07, "loss": 0.2583, "step": 1995 }, { "epoch": 0.57, "grad_norm": 2.3580787443847817, "learning_rate": 8.369064450581372e-07, "loss": 0.3061, "step": 1996 }, { "epoch": 0.57, "grad_norm": 2.3394656647930487, "learning_rate": 8.360010132502811e-07, "loss": 0.2603, "step": 1997 }, { "epoch": 0.57, "grad_norm": 2.326461942946112, "learning_rate": 8.35095719584852e-07, "loss": 0.2937, "step": 1998 }, { "epoch": 0.57, "grad_norm": 2.9975048170259715, "learning_rate": 8.34190564824412e-07, "loss": 0.2967, "step": 1999 }, { "epoch": 0.57, "grad_norm": 2.1577062774756857, "learning_rate": 8.332855497314066e-07, "loss": 0.249, "step": 2000 }, { "epoch": 0.57, "grad_norm": 2.2756120324360114, "learning_rate": 8.32380675068164e-07, "loss": 0.2726, "step": 2001 }, { "epoch": 0.57, "grad_norm": 2.6199877483436027, "learning_rate": 8.314759415968935e-07, "loss": 0.2773, "step": 2002 }, { "epoch": 0.57, "grad_norm": 2.4368100866555142, "learning_rate": 8.305713500796851e-07, "loss": 0.2852, "step": 2003 }, { "epoch": 0.57, "grad_norm": 2.3744985075641285, "learning_rate": 8.296669012785104e-07, "loss": 0.2776, "step": 2004 }, { "epoch": 0.57, "grad_norm": 5.917189729911316, "learning_rate": 8.287625959552198e-07, "loss": 0.2689, "step": 2005 }, { "epoch": 0.57, "grad_norm": 2.355544152202805, "learning_rate": 8.278584348715436e-07, "loss": 0.2588, "step": 2006 }, { "epoch": 0.57, "grad_norm": 2.355190741627796, "learning_rate": 8.269544187890898e-07, "loss": 0.2782, "step": 2007 }, { "epoch": 0.57, "grad_norm": 2.4253597317377102, "learning_rate": 8.260505484693448e-07, "loss": 0.2703, "step": 2008 }, { "epoch": 0.57, "grad_norm": 2.433583526241199, "learning_rate": 8.251468246736724e-07, "loss": 0.2638, "step": 2009 }, { "epoch": 0.57, "grad_norm": 2.199948379924113, "learning_rate": 8.242432481633118e-07, "loss": 0.261, "step": 2010 }, { "epoch": 0.57, "grad_norm": 2.5886116191736344, "learning_rate": 8.233398196993798e-07, "loss": 0.2925, "step": 2011 }, { "epoch": 0.57, "grad_norm": 2.2534242765762365, "learning_rate": 8.224365400428674e-07, "loss": 0.2674, "step": 2012 }, { "epoch": 0.57, "grad_norm": 2.272714618279941, "learning_rate": 8.215334099546409e-07, "loss": 0.29, "step": 2013 }, { "epoch": 0.57, "grad_norm": 2.430180054552849, "learning_rate": 8.206304301954396e-07, "loss": 0.2953, "step": 2014 }, { "epoch": 0.57, "grad_norm": 2.352628694363435, "learning_rate": 8.197276015258772e-07, "loss": 0.26, "step": 2015 }, { "epoch": 0.57, "grad_norm": 2.416566753780975, "learning_rate": 8.188249247064398e-07, "loss": 0.3085, "step": 2016 }, { "epoch": 0.57, "grad_norm": 2.3952528996210893, "learning_rate": 8.179224004974856e-07, "loss": 0.27, "step": 2017 }, { "epoch": 0.57, "grad_norm": 2.4678308055327314, "learning_rate": 8.17020029659244e-07, "loss": 0.2753, "step": 2018 }, { "epoch": 0.57, "grad_norm": 2.2562591329701824, "learning_rate": 8.161178129518154e-07, "loss": 0.2535, "step": 2019 }, { "epoch": 0.57, "grad_norm": 2.429259179192542, "learning_rate": 8.152157511351703e-07, "loss": 0.2736, "step": 2020 }, { "epoch": 0.57, "grad_norm": 2.4260375660546294, "learning_rate": 8.143138449691495e-07, "loss": 0.2932, "step": 2021 }, { "epoch": 0.57, "grad_norm": 2.3616449561041235, "learning_rate": 8.134120952134613e-07, "loss": 0.2741, "step": 2022 }, { "epoch": 0.57, "grad_norm": 2.3250813299091413, "learning_rate": 8.125105026276831e-07, "loss": 0.2833, "step": 2023 }, { "epoch": 0.57, "grad_norm": 2.252788985570452, "learning_rate": 8.116090679712599e-07, "loss": 0.2829, "step": 2024 }, { "epoch": 0.57, "grad_norm": 2.5183270412019643, "learning_rate": 8.107077920035031e-07, "loss": 0.2796, "step": 2025 }, { "epoch": 0.57, "grad_norm": 2.37778294908015, "learning_rate": 8.098066754835915e-07, "loss": 0.2816, "step": 2026 }, { "epoch": 0.57, "grad_norm": 2.300737037482741, "learning_rate": 8.089057191705686e-07, "loss": 0.258, "step": 2027 }, { "epoch": 0.57, "grad_norm": 2.679227795168687, "learning_rate": 8.080049238233438e-07, "loss": 0.2379, "step": 2028 }, { "epoch": 0.57, "grad_norm": 2.7613834098675243, "learning_rate": 8.071042902006895e-07, "loss": 0.328, "step": 2029 }, { "epoch": 0.58, "grad_norm": 2.3468717263559764, "learning_rate": 8.06203819061243e-07, "loss": 0.2647, "step": 2030 }, { "epoch": 0.58, "grad_norm": 2.3531096490494323, "learning_rate": 8.053035111635053e-07, "loss": 0.2864, "step": 2031 }, { "epoch": 0.58, "grad_norm": 2.414012788939637, "learning_rate": 8.044033672658386e-07, "loss": 0.292, "step": 2032 }, { "epoch": 0.58, "grad_norm": 2.2777315812282772, "learning_rate": 8.035033881264674e-07, "loss": 0.2596, "step": 2033 }, { "epoch": 0.58, "grad_norm": 2.4517357307262144, "learning_rate": 8.026035745034773e-07, "loss": 0.2746, "step": 2034 }, { "epoch": 0.58, "grad_norm": 2.4070537452132297, "learning_rate": 8.017039271548154e-07, "loss": 0.2971, "step": 2035 }, { "epoch": 0.58, "grad_norm": 2.247094822024942, "learning_rate": 8.008044468382876e-07, "loss": 0.2794, "step": 2036 }, { "epoch": 0.58, "grad_norm": 2.306978962744317, "learning_rate": 7.999051343115595e-07, "loss": 0.2655, "step": 2037 }, { "epoch": 0.58, "grad_norm": 2.2123055046356663, "learning_rate": 7.990059903321552e-07, "loss": 0.2664, "step": 2038 }, { "epoch": 0.58, "grad_norm": 2.2889931885830315, "learning_rate": 7.981070156574571e-07, "loss": 0.2636, "step": 2039 }, { "epoch": 0.58, "grad_norm": 2.478118432621666, "learning_rate": 7.972082110447051e-07, "loss": 0.276, "step": 2040 }, { "epoch": 0.58, "grad_norm": 2.2579090871085845, "learning_rate": 7.963095772509959e-07, "loss": 0.2579, "step": 2041 }, { "epoch": 0.58, "grad_norm": 2.236066241421122, "learning_rate": 7.954111150332814e-07, "loss": 0.244, "step": 2042 }, { "epoch": 0.58, "grad_norm": 2.3660082996037533, "learning_rate": 7.945128251483702e-07, "loss": 0.2734, "step": 2043 }, { "epoch": 0.58, "grad_norm": 2.359454646610675, "learning_rate": 7.936147083529243e-07, "loss": 0.2573, "step": 2044 }, { "epoch": 0.58, "grad_norm": 2.236496257553659, "learning_rate": 7.927167654034621e-07, "loss": 0.2801, "step": 2045 }, { "epoch": 0.58, "grad_norm": 3.133062542389784, "learning_rate": 7.918189970563534e-07, "loss": 0.2848, "step": 2046 }, { "epoch": 0.58, "grad_norm": 2.2905808901519253, "learning_rate": 7.909214040678219e-07, "loss": 0.3012, "step": 2047 }, { "epoch": 0.58, "grad_norm": 2.3380895198629887, "learning_rate": 7.900239871939434e-07, "loss": 0.278, "step": 2048 }, { "epoch": 0.58, "grad_norm": 2.2480017735021103, "learning_rate": 7.891267471906451e-07, "loss": 0.2631, "step": 2049 }, { "epoch": 0.58, "grad_norm": 2.4311189635822004, "learning_rate": 7.882296848137063e-07, "loss": 0.2801, "step": 2050 }, { "epoch": 0.58, "grad_norm": 2.5144487494732086, "learning_rate": 7.873328008187553e-07, "loss": 0.3047, "step": 2051 }, { "epoch": 0.58, "grad_norm": 2.1876903102504475, "learning_rate": 7.864360959612713e-07, "loss": 0.2638, "step": 2052 }, { "epoch": 0.58, "grad_norm": 2.2277035817996653, "learning_rate": 7.855395709965813e-07, "loss": 0.2514, "step": 2053 }, { "epoch": 0.58, "grad_norm": 2.491217536073013, "learning_rate": 7.846432266798618e-07, "loss": 0.2838, "step": 2054 }, { "epoch": 0.58, "grad_norm": 2.3602798122236064, "learning_rate": 7.83747063766137e-07, "loss": 0.2715, "step": 2055 }, { "epoch": 0.58, "grad_norm": 2.5464354937602813, "learning_rate": 7.828510830102784e-07, "loss": 0.2894, "step": 2056 }, { "epoch": 0.58, "grad_norm": 2.2870974956544288, "learning_rate": 7.819552851670032e-07, "loss": 0.2697, "step": 2057 }, { "epoch": 0.58, "grad_norm": 2.3234172641559288, "learning_rate": 7.810596709908758e-07, "loss": 0.2723, "step": 2058 }, { "epoch": 0.58, "grad_norm": 2.414991920888712, "learning_rate": 7.801642412363041e-07, "loss": 0.274, "step": 2059 }, { "epoch": 0.58, "grad_norm": 2.444712542812734, "learning_rate": 7.792689966575432e-07, "loss": 0.2814, "step": 2060 }, { "epoch": 0.58, "grad_norm": 2.47079382630738, "learning_rate": 7.7837393800869e-07, "loss": 0.2675, "step": 2061 }, { "epoch": 0.58, "grad_norm": 2.1684754178540895, "learning_rate": 7.774790660436857e-07, "loss": 0.2669, "step": 2062 }, { "epoch": 0.58, "grad_norm": 2.460092218537868, "learning_rate": 7.765843815163142e-07, "loss": 0.2804, "step": 2063 }, { "epoch": 0.58, "grad_norm": 2.433323079334218, "learning_rate": 7.756898851802012e-07, "loss": 0.2714, "step": 2064 }, { "epoch": 0.59, "grad_norm": 2.537186553166969, "learning_rate": 7.747955777888144e-07, "loss": 0.2474, "step": 2065 }, { "epoch": 0.59, "grad_norm": 2.267636696071962, "learning_rate": 7.739014600954621e-07, "loss": 0.2699, "step": 2066 }, { "epoch": 0.59, "grad_norm": 2.430074544000473, "learning_rate": 7.730075328532929e-07, "loss": 0.3004, "step": 2067 }, { "epoch": 0.59, "grad_norm": 2.4201031125203274, "learning_rate": 7.721137968152943e-07, "loss": 0.2731, "step": 2068 }, { "epoch": 0.59, "grad_norm": 2.290956179149082, "learning_rate": 7.712202527342936e-07, "loss": 0.2916, "step": 2069 }, { "epoch": 0.59, "grad_norm": 2.4089321122557297, "learning_rate": 7.703269013629563e-07, "loss": 0.2728, "step": 2070 }, { "epoch": 0.59, "grad_norm": 2.276232550384149, "learning_rate": 7.694337434537855e-07, "loss": 0.2448, "step": 2071 }, { "epoch": 0.59, "grad_norm": 2.496023360365651, "learning_rate": 7.685407797591207e-07, "loss": 0.3067, "step": 2072 }, { "epoch": 0.59, "grad_norm": 2.4890362420157524, "learning_rate": 7.676480110311384e-07, "loss": 0.2688, "step": 2073 }, { "epoch": 0.59, "grad_norm": 2.5525509171952554, "learning_rate": 7.667554380218512e-07, "loss": 0.2882, "step": 2074 }, { "epoch": 0.59, "grad_norm": 2.4611262943609726, "learning_rate": 7.658630614831064e-07, "loss": 0.2698, "step": 2075 }, { "epoch": 0.59, "grad_norm": 2.4864295885236416, "learning_rate": 7.649708821665855e-07, "loss": 0.2881, "step": 2076 }, { "epoch": 0.59, "grad_norm": 2.2472701497969103, "learning_rate": 7.640789008238044e-07, "loss": 0.2778, "step": 2077 }, { "epoch": 0.59, "grad_norm": 2.195989671921942, "learning_rate": 7.631871182061117e-07, "loss": 0.2975, "step": 2078 }, { "epoch": 0.59, "grad_norm": 2.476679902794854, "learning_rate": 7.622955350646898e-07, "loss": 0.2998, "step": 2079 }, { "epoch": 0.59, "grad_norm": 2.696686446155788, "learning_rate": 7.614041521505517e-07, "loss": 0.3025, "step": 2080 }, { "epoch": 0.59, "grad_norm": 2.3976222679749406, "learning_rate": 7.605129702145421e-07, "loss": 0.2662, "step": 2081 }, { "epoch": 0.59, "grad_norm": 2.3738204956727182, "learning_rate": 7.59621990007337e-07, "loss": 0.2548, "step": 2082 }, { "epoch": 0.59, "grad_norm": 2.3700727964945068, "learning_rate": 7.587312122794413e-07, "loss": 0.3058, "step": 2083 }, { "epoch": 0.59, "grad_norm": 2.265968246116298, "learning_rate": 7.578406377811914e-07, "loss": 0.3043, "step": 2084 }, { "epoch": 0.59, "grad_norm": 2.3290623178260126, "learning_rate": 7.569502672627502e-07, "loss": 0.2747, "step": 2085 }, { "epoch": 0.59, "grad_norm": 2.233856682563935, "learning_rate": 7.560601014741101e-07, "loss": 0.271, "step": 2086 }, { "epoch": 0.59, "grad_norm": 2.394115009432413, "learning_rate": 7.551701411650908e-07, "loss": 0.2599, "step": 2087 }, { "epoch": 0.59, "grad_norm": 2.313562738069459, "learning_rate": 7.542803870853385e-07, "loss": 0.2798, "step": 2088 }, { "epoch": 0.59, "grad_norm": 2.4791031646621047, "learning_rate": 7.533908399843265e-07, "loss": 0.3062, "step": 2089 }, { "epoch": 0.59, "grad_norm": 2.2719449030756977, "learning_rate": 7.525015006113536e-07, "loss": 0.2697, "step": 2090 }, { "epoch": 0.59, "grad_norm": 2.288872717976602, "learning_rate": 7.516123697155423e-07, "loss": 0.2292, "step": 2091 }, { "epoch": 0.59, "grad_norm": 2.3190728757400314, "learning_rate": 7.507234480458413e-07, "loss": 0.2436, "step": 2092 }, { "epoch": 0.59, "grad_norm": 2.437882538102759, "learning_rate": 7.498347363510219e-07, "loss": 0.2779, "step": 2093 }, { "epoch": 0.59, "grad_norm": 2.2839625441224283, "learning_rate": 7.489462353796792e-07, "loss": 0.2652, "step": 2094 }, { "epoch": 0.59, "grad_norm": 2.461952777175989, "learning_rate": 7.480579458802307e-07, "loss": 0.2964, "step": 2095 }, { "epoch": 0.59, "grad_norm": 2.2354629485291837, "learning_rate": 7.471698686009149e-07, "loss": 0.2695, "step": 2096 }, { "epoch": 0.59, "grad_norm": 2.4767741728643218, "learning_rate": 7.46282004289793e-07, "loss": 0.2797, "step": 2097 }, { "epoch": 0.59, "grad_norm": 2.435906585416531, "learning_rate": 7.453943536947449e-07, "loss": 0.2383, "step": 2098 }, { "epoch": 0.59, "grad_norm": 2.3346098559962103, "learning_rate": 7.44506917563473e-07, "loss": 0.278, "step": 2099 }, { "epoch": 0.6, "grad_norm": 2.6073668597304467, "learning_rate": 7.436196966434967e-07, "loss": 0.3092, "step": 2100 }, { "epoch": 0.6, "grad_norm": 2.342933109587325, "learning_rate": 7.427326916821557e-07, "loss": 0.2889, "step": 2101 }, { "epoch": 0.6, "grad_norm": 2.410578090840819, "learning_rate": 7.41845903426606e-07, "loss": 0.3085, "step": 2102 }, { "epoch": 0.6, "grad_norm": 2.525339056164691, "learning_rate": 7.409593326238238e-07, "loss": 0.2591, "step": 2103 }, { "epoch": 0.6, "grad_norm": 2.2445227765026416, "learning_rate": 7.400729800205996e-07, "loss": 0.2499, "step": 2104 }, { "epoch": 0.6, "grad_norm": 2.3990383814636984, "learning_rate": 7.391868463635412e-07, "loss": 0.2688, "step": 2105 }, { "epoch": 0.6, "grad_norm": 2.2685711781633047, "learning_rate": 7.383009323990722e-07, "loss": 0.2827, "step": 2106 }, { "epoch": 0.6, "grad_norm": 2.3055301604055773, "learning_rate": 7.3741523887343e-07, "loss": 0.2671, "step": 2107 }, { "epoch": 0.6, "grad_norm": 2.557224320802981, "learning_rate": 7.365297665326677e-07, "loss": 0.2837, "step": 2108 }, { "epoch": 0.6, "grad_norm": 2.534642480368157, "learning_rate": 7.356445161226515e-07, "loss": 0.2691, "step": 2109 }, { "epoch": 0.6, "grad_norm": 2.356329998491783, "learning_rate": 7.347594883890607e-07, "loss": 0.2751, "step": 2110 }, { "epoch": 0.6, "grad_norm": 2.381325655091747, "learning_rate": 7.338746840773865e-07, "loss": 0.2565, "step": 2111 }, { "epoch": 0.6, "grad_norm": 2.329511980402188, "learning_rate": 7.329901039329325e-07, "loss": 0.263, "step": 2112 }, { "epoch": 0.6, "grad_norm": 2.315894476466652, "learning_rate": 7.321057487008135e-07, "loss": 0.3127, "step": 2113 }, { "epoch": 0.6, "grad_norm": 2.2856850537832103, "learning_rate": 7.312216191259551e-07, "loss": 0.2547, "step": 2114 }, { "epoch": 0.6, "grad_norm": 2.421444178537356, "learning_rate": 7.303377159530918e-07, "loss": 0.3091, "step": 2115 }, { "epoch": 0.6, "grad_norm": 2.243237220702003, "learning_rate": 7.294540399267682e-07, "loss": 0.2723, "step": 2116 }, { "epoch": 0.6, "grad_norm": 2.5157378247007247, "learning_rate": 7.285705917913372e-07, "loss": 0.272, "step": 2117 }, { "epoch": 0.6, "grad_norm": 2.453951629479585, "learning_rate": 7.276873722909604e-07, "loss": 0.2891, "step": 2118 }, { "epoch": 0.6, "grad_norm": 2.300751946901452, "learning_rate": 7.268043821696062e-07, "loss": 0.2803, "step": 2119 }, { "epoch": 0.6, "grad_norm": 2.5899121934419096, "learning_rate": 7.259216221710495e-07, "loss": 0.2943, "step": 2120 }, { "epoch": 0.6, "grad_norm": 2.2701438074597586, "learning_rate": 7.250390930388723e-07, "loss": 0.2895, "step": 2121 }, { "epoch": 0.6, "grad_norm": 3.732218819170221, "learning_rate": 7.241567955164609e-07, "loss": 0.2743, "step": 2122 }, { "epoch": 0.6, "grad_norm": 2.3387796014090907, "learning_rate": 7.232747303470081e-07, "loss": 0.2776, "step": 2123 }, { "epoch": 0.6, "grad_norm": 2.5764272321852286, "learning_rate": 7.223928982735095e-07, "loss": 0.2892, "step": 2124 }, { "epoch": 0.6, "grad_norm": 2.383832218103511, "learning_rate": 7.215113000387653e-07, "loss": 0.2517, "step": 2125 }, { "epoch": 0.6, "grad_norm": 2.464831541878814, "learning_rate": 7.206299363853781e-07, "loss": 0.2689, "step": 2126 }, { "epoch": 0.6, "grad_norm": 2.471104492666639, "learning_rate": 7.19748808055753e-07, "loss": 0.2722, "step": 2127 }, { "epoch": 0.6, "grad_norm": 3.329307106617614, "learning_rate": 7.188679157920976e-07, "loss": 0.2766, "step": 2128 }, { "epoch": 0.6, "grad_norm": 2.3363762963465784, "learning_rate": 7.179872603364199e-07, "loss": 0.2662, "step": 2129 }, { "epoch": 0.6, "grad_norm": 2.517632149515168, "learning_rate": 7.171068424305286e-07, "loss": 0.2746, "step": 2130 }, { "epoch": 0.6, "grad_norm": 2.5706870497329652, "learning_rate": 7.162266628160322e-07, "loss": 0.2633, "step": 2131 }, { "epoch": 0.6, "grad_norm": 2.5354944977016265, "learning_rate": 7.153467222343386e-07, "loss": 0.2902, "step": 2132 }, { "epoch": 0.6, "grad_norm": 2.7456345440150387, "learning_rate": 7.144670214266551e-07, "loss": 0.287, "step": 2133 }, { "epoch": 0.6, "grad_norm": 2.3894076576548855, "learning_rate": 7.135875611339853e-07, "loss": 0.3036, "step": 2134 }, { "epoch": 0.6, "grad_norm": 2.569731455695933, "learning_rate": 7.127083420971319e-07, "loss": 0.2747, "step": 2135 }, { "epoch": 0.61, "grad_norm": 2.2928767146566758, "learning_rate": 7.11829365056693e-07, "loss": 0.2583, "step": 2136 }, { "epoch": 0.61, "grad_norm": 2.673719868392139, "learning_rate": 7.109506307530645e-07, "loss": 0.2716, "step": 2137 }, { "epoch": 0.61, "grad_norm": 2.3579020039258674, "learning_rate": 7.100721399264362e-07, "loss": 0.2868, "step": 2138 }, { "epoch": 0.61, "grad_norm": 2.266348724055388, "learning_rate": 7.091938933167936e-07, "loss": 0.2455, "step": 2139 }, { "epoch": 0.61, "grad_norm": 2.227236593653924, "learning_rate": 7.083158916639168e-07, "loss": 0.2457, "step": 2140 }, { "epoch": 0.61, "grad_norm": 2.332130191191915, "learning_rate": 7.074381357073781e-07, "loss": 0.2814, "step": 2141 }, { "epoch": 0.61, "grad_norm": 2.3944392677131745, "learning_rate": 7.065606261865452e-07, "loss": 0.2994, "step": 2142 }, { "epoch": 0.61, "grad_norm": 2.4322935118276092, "learning_rate": 7.056833638405761e-07, "loss": 0.3136, "step": 2143 }, { "epoch": 0.61, "grad_norm": 2.5671930014493745, "learning_rate": 7.048063494084218e-07, "loss": 0.2926, "step": 2144 }, { "epoch": 0.61, "grad_norm": 2.318564278312186, "learning_rate": 7.039295836288237e-07, "loss": 0.2545, "step": 2145 }, { "epoch": 0.61, "grad_norm": 2.464705863159734, "learning_rate": 7.030530672403138e-07, "loss": 0.2708, "step": 2146 }, { "epoch": 0.61, "grad_norm": 2.19609942264482, "learning_rate": 7.021768009812155e-07, "loss": 0.2596, "step": 2147 }, { "epoch": 0.61, "grad_norm": 2.48676163657945, "learning_rate": 7.013007855896396e-07, "loss": 0.2627, "step": 2148 }, { "epoch": 0.61, "grad_norm": 2.3182899968588386, "learning_rate": 7.004250218034863e-07, "loss": 0.279, "step": 2149 }, { "epoch": 0.61, "grad_norm": 2.3442020814482314, "learning_rate": 6.99549510360444e-07, "loss": 0.2704, "step": 2150 }, { "epoch": 0.61, "grad_norm": 2.22906721368112, "learning_rate": 6.986742519979883e-07, "loss": 0.2606, "step": 2151 }, { "epoch": 0.61, "grad_norm": 2.2962396809815453, "learning_rate": 6.977992474533823e-07, "loss": 0.2615, "step": 2152 }, { "epoch": 0.61, "grad_norm": 2.385100652981511, "learning_rate": 6.969244974636744e-07, "loss": 0.3269, "step": 2153 }, { "epoch": 0.61, "grad_norm": 2.5104597172170724, "learning_rate": 6.960500027656989e-07, "loss": 0.2623, "step": 2154 }, { "epoch": 0.61, "grad_norm": 2.343953006786303, "learning_rate": 6.951757640960753e-07, "loss": 0.2832, "step": 2155 }, { "epoch": 0.61, "grad_norm": 2.3828926943026008, "learning_rate": 6.943017821912068e-07, "loss": 0.3251, "step": 2156 }, { "epoch": 0.61, "grad_norm": 2.989832235987735, "learning_rate": 6.934280577872813e-07, "loss": 0.2892, "step": 2157 }, { "epoch": 0.61, "grad_norm": 2.724481526794439, "learning_rate": 6.925545916202691e-07, "loss": 0.2734, "step": 2158 }, { "epoch": 0.61, "grad_norm": 2.428029211150564, "learning_rate": 6.916813844259233e-07, "loss": 0.3051, "step": 2159 }, { "epoch": 0.61, "grad_norm": 2.259379386342519, "learning_rate": 6.908084369397782e-07, "loss": 0.2725, "step": 2160 }, { "epoch": 0.61, "grad_norm": 2.402931976669789, "learning_rate": 6.899357498971499e-07, "loss": 0.3116, "step": 2161 }, { "epoch": 0.61, "grad_norm": 2.316897428090023, "learning_rate": 6.890633240331353e-07, "loss": 0.2725, "step": 2162 }, { "epoch": 0.61, "grad_norm": 2.2528634833638503, "learning_rate": 6.881911600826114e-07, "loss": 0.2364, "step": 2163 }, { "epoch": 0.61, "grad_norm": 2.3508074001500647, "learning_rate": 6.873192587802339e-07, "loss": 0.273, "step": 2164 }, { "epoch": 0.61, "grad_norm": 2.460979430802097, "learning_rate": 6.864476208604373e-07, "loss": 0.2938, "step": 2165 }, { "epoch": 0.61, "grad_norm": 2.2576345269648708, "learning_rate": 6.855762470574344e-07, "loss": 0.2741, "step": 2166 }, { "epoch": 0.61, "grad_norm": 2.4334518552088227, "learning_rate": 6.847051381052165e-07, "loss": 0.2845, "step": 2167 }, { "epoch": 0.61, "grad_norm": 2.3696143758251127, "learning_rate": 6.838342947375506e-07, "loss": 0.2722, "step": 2168 }, { "epoch": 0.61, "grad_norm": 2.32490402082567, "learning_rate": 6.829637176879801e-07, "loss": 0.2925, "step": 2169 }, { "epoch": 0.61, "grad_norm": 2.5248417813499, "learning_rate": 6.820934076898246e-07, "loss": 0.2985, "step": 2170 }, { "epoch": 0.62, "grad_norm": 2.445099598380385, "learning_rate": 6.812233654761779e-07, "loss": 0.3044, "step": 2171 }, { "epoch": 0.62, "grad_norm": 2.214657241657144, "learning_rate": 6.803535917799097e-07, "loss": 0.2414, "step": 2172 }, { "epoch": 0.62, "grad_norm": 2.3655223286007603, "learning_rate": 6.794840873336622e-07, "loss": 0.2806, "step": 2173 }, { "epoch": 0.62, "grad_norm": 2.4011629273315696, "learning_rate": 6.786148528698511e-07, "loss": 0.298, "step": 2174 }, { "epoch": 0.62, "grad_norm": 2.3645904484074682, "learning_rate": 6.777458891206647e-07, "loss": 0.2872, "step": 2175 }, { "epoch": 0.62, "grad_norm": 2.3303417451654598, "learning_rate": 6.768771968180642e-07, "loss": 0.2827, "step": 2176 }, { "epoch": 0.62, "grad_norm": 2.42116804321365, "learning_rate": 6.760087766937806e-07, "loss": 0.287, "step": 2177 }, { "epoch": 0.62, "grad_norm": 2.382779946387148, "learning_rate": 6.751406294793165e-07, "loss": 0.2552, "step": 2178 }, { "epoch": 0.62, "grad_norm": 2.477672356064572, "learning_rate": 6.742727559059447e-07, "loss": 0.3088, "step": 2179 }, { "epoch": 0.62, "grad_norm": 2.455414571828824, "learning_rate": 6.734051567047067e-07, "loss": 0.2795, "step": 2180 }, { "epoch": 0.62, "grad_norm": 2.351414062300336, "learning_rate": 6.72537832606414e-07, "loss": 0.282, "step": 2181 }, { "epoch": 0.62, "grad_norm": 2.22111778868477, "learning_rate": 6.716707843416459e-07, "loss": 0.2564, "step": 2182 }, { "epoch": 0.62, "grad_norm": 2.175827676310216, "learning_rate": 6.708040126407492e-07, "loss": 0.2521, "step": 2183 }, { "epoch": 0.62, "grad_norm": 2.5166909133215754, "learning_rate": 6.699375182338378e-07, "loss": 0.2835, "step": 2184 }, { "epoch": 0.62, "grad_norm": 2.267037067854833, "learning_rate": 6.690713018507916e-07, "loss": 0.262, "step": 2185 }, { "epoch": 0.62, "grad_norm": 2.288265638185063, "learning_rate": 6.682053642212575e-07, "loss": 0.2488, "step": 2186 }, { "epoch": 0.62, "grad_norm": 2.2731622558940687, "learning_rate": 6.673397060746469e-07, "loss": 0.2665, "step": 2187 }, { "epoch": 0.62, "grad_norm": 2.4917477923811013, "learning_rate": 6.664743281401351e-07, "loss": 0.292, "step": 2188 }, { "epoch": 0.62, "grad_norm": 2.3827017764759604, "learning_rate": 6.656092311466623e-07, "loss": 0.2527, "step": 2189 }, { "epoch": 0.62, "grad_norm": 2.224388121091182, "learning_rate": 6.647444158229318e-07, "loss": 0.2426, "step": 2190 }, { "epoch": 0.62, "grad_norm": 2.2232829090554005, "learning_rate": 6.638798828974099e-07, "loss": 0.2419, "step": 2191 }, { "epoch": 0.62, "grad_norm": 2.5867977881897732, "learning_rate": 6.630156330983243e-07, "loss": 0.2956, "step": 2192 }, { "epoch": 0.62, "grad_norm": 2.4204940802285067, "learning_rate": 6.621516671536649e-07, "loss": 0.3057, "step": 2193 }, { "epoch": 0.62, "grad_norm": 3.348152749985618, "learning_rate": 6.612879857911824e-07, "loss": 0.294, "step": 2194 }, { "epoch": 0.62, "grad_norm": 2.3451848527753048, "learning_rate": 6.604245897383869e-07, "loss": 0.2748, "step": 2195 }, { "epoch": 0.62, "grad_norm": 2.8676379377626224, "learning_rate": 6.595614797225496e-07, "loss": 0.2689, "step": 2196 }, { "epoch": 0.62, "grad_norm": 2.3611535849479055, "learning_rate": 6.586986564706998e-07, "loss": 0.2612, "step": 2197 }, { "epoch": 0.62, "grad_norm": 2.7435701756217252, "learning_rate": 6.57836120709626e-07, "loss": 0.3083, "step": 2198 }, { "epoch": 0.62, "grad_norm": 2.529965900786767, "learning_rate": 6.569738731658734e-07, "loss": 0.3059, "step": 2199 }, { "epoch": 0.62, "grad_norm": 2.2625128317474608, "learning_rate": 6.56111914565745e-07, "loss": 0.2622, "step": 2200 }, { "epoch": 0.62, "grad_norm": 2.3355623198620297, "learning_rate": 6.552502456353011e-07, "loss": 0.2772, "step": 2201 }, { "epoch": 0.62, "grad_norm": 2.426170884306247, "learning_rate": 6.543888671003572e-07, "loss": 0.2864, "step": 2202 }, { "epoch": 0.62, "grad_norm": 2.40017155684168, "learning_rate": 6.535277796864841e-07, "loss": 0.2741, "step": 2203 }, { "epoch": 0.62, "grad_norm": 2.52362240442778, "learning_rate": 6.526669841190078e-07, "loss": 0.2956, "step": 2204 }, { "epoch": 0.62, "grad_norm": 2.356002953270973, "learning_rate": 6.518064811230082e-07, "loss": 0.2837, "step": 2205 }, { "epoch": 0.63, "grad_norm": 2.6796883261044893, "learning_rate": 6.509462714233193e-07, "loss": 0.3092, "step": 2206 }, { "epoch": 0.63, "grad_norm": 2.3176863434708137, "learning_rate": 6.500863557445273e-07, "loss": 0.2456, "step": 2207 }, { "epoch": 0.63, "grad_norm": 2.6021904172206427, "learning_rate": 6.49226734810971e-07, "loss": 0.2975, "step": 2208 }, { "epoch": 0.63, "grad_norm": 2.4935328971745925, "learning_rate": 6.483674093467408e-07, "loss": 0.2831, "step": 2209 }, { "epoch": 0.63, "grad_norm": 2.3839197112529766, "learning_rate": 6.475083800756791e-07, "loss": 0.2826, "step": 2210 }, { "epoch": 0.63, "grad_norm": 2.299681443388202, "learning_rate": 6.466496477213776e-07, "loss": 0.2215, "step": 2211 }, { "epoch": 0.63, "grad_norm": 2.2836145958696776, "learning_rate": 6.457912130071785e-07, "loss": 0.2875, "step": 2212 }, { "epoch": 0.63, "grad_norm": 2.437927993707982, "learning_rate": 6.449330766561733e-07, "loss": 0.3069, "step": 2213 }, { "epoch": 0.63, "grad_norm": 2.416185295716576, "learning_rate": 6.440752393912015e-07, "loss": 0.2728, "step": 2214 }, { "epoch": 0.63, "grad_norm": 2.2801588482913173, "learning_rate": 6.43217701934852e-07, "loss": 0.2855, "step": 2215 }, { "epoch": 0.63, "grad_norm": 2.2867858493533832, "learning_rate": 6.4236046500946e-07, "loss": 0.2627, "step": 2216 }, { "epoch": 0.63, "grad_norm": 2.587876434898805, "learning_rate": 6.41503529337108e-07, "loss": 0.293, "step": 2217 }, { "epoch": 0.63, "grad_norm": 2.3088016451229136, "learning_rate": 6.406468956396249e-07, "loss": 0.2776, "step": 2218 }, { "epoch": 0.63, "grad_norm": 2.3788309790734488, "learning_rate": 6.397905646385844e-07, "loss": 0.2834, "step": 2219 }, { "epoch": 0.63, "grad_norm": 2.278441350078684, "learning_rate": 6.389345370553064e-07, "loss": 0.2475, "step": 2220 }, { "epoch": 0.63, "grad_norm": 3.9142417532273233, "learning_rate": 6.380788136108546e-07, "loss": 0.2945, "step": 2221 }, { "epoch": 0.63, "grad_norm": 2.344305602286148, "learning_rate": 6.372233950260367e-07, "loss": 0.2612, "step": 2222 }, { "epoch": 0.63, "grad_norm": 2.33848755529304, "learning_rate": 6.363682820214031e-07, "loss": 0.25, "step": 2223 }, { "epoch": 0.63, "grad_norm": 2.429139743401095, "learning_rate": 6.355134753172473e-07, "loss": 0.2767, "step": 2224 }, { "epoch": 0.63, "grad_norm": 2.5080203198990803, "learning_rate": 6.34658975633605e-07, "loss": 0.293, "step": 2225 }, { "epoch": 0.63, "grad_norm": 2.488305071582111, "learning_rate": 6.338047836902527e-07, "loss": 0.2923, "step": 2226 }, { "epoch": 0.63, "grad_norm": 2.6386594684053755, "learning_rate": 6.329509002067079e-07, "loss": 0.2638, "step": 2227 }, { "epoch": 0.63, "grad_norm": 2.499223534801197, "learning_rate": 6.320973259022286e-07, "loss": 0.2789, "step": 2228 }, { "epoch": 0.63, "grad_norm": 2.349727343761741, "learning_rate": 6.312440614958114e-07, "loss": 0.3011, "step": 2229 }, { "epoch": 0.63, "grad_norm": 2.399789289121343, "learning_rate": 6.303911077061937e-07, "loss": 0.2913, "step": 2230 }, { "epoch": 0.63, "grad_norm": 2.468975784925108, "learning_rate": 6.29538465251849e-07, "loss": 0.2514, "step": 2231 }, { "epoch": 0.63, "grad_norm": 2.3046212635405547, "learning_rate": 6.286861348509902e-07, "loss": 0.267, "step": 2232 }, { "epoch": 0.63, "grad_norm": 2.2861395547404557, "learning_rate": 6.278341172215669e-07, "loss": 0.2329, "step": 2233 }, { "epoch": 0.63, "grad_norm": 2.2936158988880835, "learning_rate": 6.269824130812644e-07, "loss": 0.2568, "step": 2234 }, { "epoch": 0.63, "grad_norm": 2.241563640965623, "learning_rate": 6.261310231475054e-07, "loss": 0.2582, "step": 2235 }, { "epoch": 0.63, "grad_norm": 2.4414301771540305, "learning_rate": 6.252799481374472e-07, "loss": 0.287, "step": 2236 }, { "epoch": 0.63, "grad_norm": 2.2552834975225498, "learning_rate": 6.244291887679818e-07, "loss": 0.2436, "step": 2237 }, { "epoch": 0.63, "grad_norm": 2.380287911860163, "learning_rate": 6.235787457557349e-07, "loss": 0.266, "step": 2238 }, { "epoch": 0.63, "grad_norm": 2.3759044308925366, "learning_rate": 6.227286198170662e-07, "loss": 0.299, "step": 2239 }, { "epoch": 0.63, "grad_norm": 2.477400443290136, "learning_rate": 6.218788116680689e-07, "loss": 0.2634, "step": 2240 }, { "epoch": 0.63, "grad_norm": 2.4706236064839318, "learning_rate": 6.210293220245677e-07, "loss": 0.3339, "step": 2241 }, { "epoch": 0.64, "grad_norm": 2.4793553558938775, "learning_rate": 6.201801516021189e-07, "loss": 0.3025, "step": 2242 }, { "epoch": 0.64, "grad_norm": 2.228421467160145, "learning_rate": 6.193313011160103e-07, "loss": 0.2664, "step": 2243 }, { "epoch": 0.64, "grad_norm": 2.437534621664695, "learning_rate": 6.184827712812603e-07, "loss": 0.2625, "step": 2244 }, { "epoch": 0.64, "grad_norm": 2.3956581186100396, "learning_rate": 6.176345628126175e-07, "loss": 0.2898, "step": 2245 }, { "epoch": 0.64, "grad_norm": 2.239063252739338, "learning_rate": 6.167866764245586e-07, "loss": 0.2541, "step": 2246 }, { "epoch": 0.64, "grad_norm": 2.476757025154476, "learning_rate": 6.159391128312899e-07, "loss": 0.3004, "step": 2247 }, { "epoch": 0.64, "grad_norm": 2.2569840619493933, "learning_rate": 6.150918727467454e-07, "loss": 0.275, "step": 2248 }, { "epoch": 0.64, "grad_norm": 2.2133733010199363, "learning_rate": 6.142449568845877e-07, "loss": 0.287, "step": 2249 }, { "epoch": 0.64, "grad_norm": 2.364274466964281, "learning_rate": 6.133983659582047e-07, "loss": 0.2928, "step": 2250 }, { "epoch": 0.64, "grad_norm": 2.531639474882001, "learning_rate": 6.125521006807115e-07, "loss": 0.2825, "step": 2251 }, { "epoch": 0.64, "grad_norm": 2.300776299226663, "learning_rate": 6.11706161764949e-07, "loss": 0.2593, "step": 2252 }, { "epoch": 0.64, "grad_norm": 2.1737940033605505, "learning_rate": 6.10860549923482e-07, "loss": 0.2513, "step": 2253 }, { "epoch": 0.64, "grad_norm": 2.465815246077779, "learning_rate": 6.10015265868602e-07, "loss": 0.2757, "step": 2254 }, { "epoch": 0.64, "grad_norm": 2.2752963971548117, "learning_rate": 6.091703103123222e-07, "loss": 0.255, "step": 2255 }, { "epoch": 0.64, "grad_norm": 2.501046208396167, "learning_rate": 6.083256839663806e-07, "loss": 0.2861, "step": 2256 }, { "epoch": 0.64, "grad_norm": 2.5397860645465875, "learning_rate": 6.074813875422365e-07, "loss": 0.3324, "step": 2257 }, { "epoch": 0.64, "grad_norm": 2.3053544026215484, "learning_rate": 6.066374217510724e-07, "loss": 0.2752, "step": 2258 }, { "epoch": 0.64, "grad_norm": 2.3995880048556217, "learning_rate": 6.057937873037924e-07, "loss": 0.29, "step": 2259 }, { "epoch": 0.64, "grad_norm": 2.523188766633019, "learning_rate": 6.04950484911021e-07, "loss": 0.2783, "step": 2260 }, { "epoch": 0.64, "grad_norm": 2.228431929782206, "learning_rate": 6.041075152831025e-07, "loss": 0.2707, "step": 2261 }, { "epoch": 0.64, "grad_norm": 2.9934974905244562, "learning_rate": 6.032648791301018e-07, "loss": 0.2906, "step": 2262 }, { "epoch": 0.64, "grad_norm": 2.3233178461924853, "learning_rate": 6.024225771618023e-07, "loss": 0.2635, "step": 2263 }, { "epoch": 0.64, "grad_norm": 2.723255393523626, "learning_rate": 6.015806100877069e-07, "loss": 0.2819, "step": 2264 }, { "epoch": 0.64, "grad_norm": 2.571178926895352, "learning_rate": 6.007389786170354e-07, "loss": 0.3109, "step": 2265 }, { "epoch": 0.64, "grad_norm": 2.2559815923268878, "learning_rate": 5.998976834587246e-07, "loss": 0.2704, "step": 2266 }, { "epoch": 0.64, "grad_norm": 2.49947468441889, "learning_rate": 5.990567253214295e-07, "loss": 0.2818, "step": 2267 }, { "epoch": 0.64, "grad_norm": 2.5920054788059637, "learning_rate": 5.98216104913519e-07, "loss": 0.2952, "step": 2268 }, { "epoch": 0.64, "grad_norm": 3.999502386879307, "learning_rate": 5.973758229430805e-07, "loss": 0.2542, "step": 2269 }, { "epoch": 0.64, "grad_norm": 2.282738274536259, "learning_rate": 5.965358801179137e-07, "loss": 0.2721, "step": 2270 }, { "epoch": 0.64, "grad_norm": 2.2313925933260172, "learning_rate": 5.956962771455337e-07, "loss": 0.2787, "step": 2271 }, { "epoch": 0.64, "grad_norm": 2.2704746180326767, "learning_rate": 5.948570147331692e-07, "loss": 0.2731, "step": 2272 }, { "epoch": 0.64, "grad_norm": 2.5502486483761717, "learning_rate": 5.940180935877619e-07, "loss": 0.3083, "step": 2273 }, { "epoch": 0.64, "grad_norm": 2.4309385928987153, "learning_rate": 5.931795144159665e-07, "loss": 0.2857, "step": 2274 }, { "epoch": 0.64, "grad_norm": 2.1658776156389146, "learning_rate": 5.923412779241492e-07, "loss": 0.2644, "step": 2275 }, { "epoch": 0.64, "grad_norm": 2.494804348637084, "learning_rate": 5.91503384818388e-07, "loss": 0.2412, "step": 2276 }, { "epoch": 0.65, "grad_norm": 2.2382077913458502, "learning_rate": 5.906658358044703e-07, "loss": 0.2652, "step": 2277 }, { "epoch": 0.65, "grad_norm": 2.2962584594206747, "learning_rate": 5.89828631587896e-07, "loss": 0.2807, "step": 2278 }, { "epoch": 0.65, "grad_norm": 2.188276662228928, "learning_rate": 5.889917728738724e-07, "loss": 0.2486, "step": 2279 }, { "epoch": 0.65, "grad_norm": 2.527543431519013, "learning_rate": 5.88155260367317e-07, "loss": 0.2352, "step": 2280 }, { "epoch": 0.65, "grad_norm": 2.302003991338731, "learning_rate": 5.873190947728551e-07, "loss": 0.2543, "step": 2281 }, { "epoch": 0.65, "grad_norm": 2.2480317567146004, "learning_rate": 5.864832767948198e-07, "loss": 0.2514, "step": 2282 }, { "epoch": 0.65, "grad_norm": 2.524084873276555, "learning_rate": 5.85647807137252e-07, "loss": 0.2942, "step": 2283 }, { "epoch": 0.65, "grad_norm": 2.369980013342227, "learning_rate": 5.848126865038989e-07, "loss": 0.2793, "step": 2284 }, { "epoch": 0.65, "grad_norm": 2.8317055902232497, "learning_rate": 5.83977915598213e-07, "loss": 0.2724, "step": 2285 }, { "epoch": 0.65, "grad_norm": 2.4188479265192777, "learning_rate": 5.83143495123353e-07, "loss": 0.2642, "step": 2286 }, { "epoch": 0.65, "grad_norm": 2.6498348586511487, "learning_rate": 5.823094257821821e-07, "loss": 0.2817, "step": 2287 }, { "epoch": 0.65, "grad_norm": 2.568128866402004, "learning_rate": 5.814757082772682e-07, "loss": 0.2934, "step": 2288 }, { "epoch": 0.65, "grad_norm": 2.2940019570209067, "learning_rate": 5.806423433108821e-07, "loss": 0.2577, "step": 2289 }, { "epoch": 0.65, "grad_norm": 2.348630798206626, "learning_rate": 5.798093315849983e-07, "loss": 0.2721, "step": 2290 }, { "epoch": 0.65, "grad_norm": 2.394079405948414, "learning_rate": 5.789766738012931e-07, "loss": 0.2801, "step": 2291 }, { "epoch": 0.65, "grad_norm": 2.6282642371955447, "learning_rate": 5.781443706611454e-07, "loss": 0.289, "step": 2292 }, { "epoch": 0.65, "grad_norm": 2.3702665835924117, "learning_rate": 5.773124228656348e-07, "loss": 0.2578, "step": 2293 }, { "epoch": 0.65, "grad_norm": 2.8168042286446826, "learning_rate": 5.764808311155418e-07, "loss": 0.3191, "step": 2294 }, { "epoch": 0.65, "grad_norm": 2.4400086616895784, "learning_rate": 5.756495961113468e-07, "loss": 0.2866, "step": 2295 }, { "epoch": 0.65, "grad_norm": 2.304434447879119, "learning_rate": 5.748187185532305e-07, "loss": 0.2867, "step": 2296 }, { "epoch": 0.65, "grad_norm": 2.4909981692133747, "learning_rate": 5.739881991410707e-07, "loss": 0.2543, "step": 2297 }, { "epoch": 0.65, "grad_norm": 2.2463457224543433, "learning_rate": 5.731580385744457e-07, "loss": 0.2313, "step": 2298 }, { "epoch": 0.65, "grad_norm": 2.857451260559649, "learning_rate": 5.723282375526302e-07, "loss": 0.2663, "step": 2299 }, { "epoch": 0.65, "grad_norm": 2.2112224349538265, "learning_rate": 5.714987967745967e-07, "loss": 0.2684, "step": 2300 }, { "epoch": 0.65, "grad_norm": 2.3854594089360917, "learning_rate": 5.706697169390134e-07, "loss": 0.2865, "step": 2301 }, { "epoch": 0.65, "grad_norm": 2.6668868136628197, "learning_rate": 5.698409987442448e-07, "loss": 0.257, "step": 2302 }, { "epoch": 0.65, "grad_norm": 2.137828218966821, "learning_rate": 5.690126428883515e-07, "loss": 0.276, "step": 2303 }, { "epoch": 0.65, "grad_norm": 2.486527199626064, "learning_rate": 5.681846500690884e-07, "loss": 0.2888, "step": 2304 }, { "epoch": 0.65, "grad_norm": 2.35238133058172, "learning_rate": 5.673570209839045e-07, "loss": 0.2749, "step": 2305 }, { "epoch": 0.65, "grad_norm": 3.2026123661227297, "learning_rate": 5.66529756329942e-07, "loss": 0.2537, "step": 2306 }, { "epoch": 0.65, "grad_norm": 2.2395741026150864, "learning_rate": 5.657028568040365e-07, "loss": 0.2884, "step": 2307 }, { "epoch": 0.65, "grad_norm": 2.4458748873204863, "learning_rate": 5.64876323102717e-07, "loss": 0.2608, "step": 2308 }, { "epoch": 0.65, "grad_norm": 2.7200316283550663, "learning_rate": 5.640501559222034e-07, "loss": 0.278, "step": 2309 }, { "epoch": 0.65, "grad_norm": 3.209972703940308, "learning_rate": 5.63224355958406e-07, "loss": 0.2916, "step": 2310 }, { "epoch": 0.65, "grad_norm": 2.5998394218333085, "learning_rate": 5.623989239069274e-07, "loss": 0.3057, "step": 2311 }, { "epoch": 0.66, "grad_norm": 2.38565340315908, "learning_rate": 5.615738604630591e-07, "loss": 0.268, "step": 2312 }, { "epoch": 0.66, "grad_norm": 2.568508776646103, "learning_rate": 5.607491663217838e-07, "loss": 0.3047, "step": 2313 }, { "epoch": 0.66, "grad_norm": 2.3544146184023838, "learning_rate": 5.599248421777707e-07, "loss": 0.292, "step": 2314 }, { "epoch": 0.66, "grad_norm": 2.511187292747781, "learning_rate": 5.591008887253792e-07, "loss": 0.2809, "step": 2315 }, { "epoch": 0.66, "grad_norm": 2.3207736387674878, "learning_rate": 5.582773066586552e-07, "loss": 0.2698, "step": 2316 }, { "epoch": 0.66, "grad_norm": 2.396898601611306, "learning_rate": 5.574540966713337e-07, "loss": 0.2789, "step": 2317 }, { "epoch": 0.66, "grad_norm": 2.5437105260347734, "learning_rate": 5.566312594568339e-07, "loss": 0.2654, "step": 2318 }, { "epoch": 0.66, "grad_norm": 2.389316829041838, "learning_rate": 5.558087957082623e-07, "loss": 0.2631, "step": 2319 }, { "epoch": 0.66, "grad_norm": 2.3657439718027082, "learning_rate": 5.549867061184108e-07, "loss": 0.2951, "step": 2320 }, { "epoch": 0.66, "grad_norm": 2.613468479680014, "learning_rate": 5.541649913797558e-07, "loss": 0.2997, "step": 2321 }, { "epoch": 0.66, "grad_norm": 2.7800748740297463, "learning_rate": 5.533436521844581e-07, "loss": 0.271, "step": 2322 }, { "epoch": 0.66, "grad_norm": 2.4391679900312337, "learning_rate": 5.525226892243623e-07, "loss": 0.2932, "step": 2323 }, { "epoch": 0.66, "grad_norm": 2.276031842799922, "learning_rate": 5.517021031909958e-07, "loss": 0.2882, "step": 2324 }, { "epoch": 0.66, "grad_norm": 2.400813584246261, "learning_rate": 5.508818947755686e-07, "loss": 0.315, "step": 2325 }, { "epoch": 0.66, "grad_norm": 2.3432150318418765, "learning_rate": 5.500620646689728e-07, "loss": 0.258, "step": 2326 }, { "epoch": 0.66, "grad_norm": 2.34350737859966, "learning_rate": 5.492426135617815e-07, "loss": 0.2588, "step": 2327 }, { "epoch": 0.66, "grad_norm": 2.461391081740737, "learning_rate": 5.484235421442491e-07, "loss": 0.2932, "step": 2328 }, { "epoch": 0.66, "grad_norm": 2.2771199211489286, "learning_rate": 5.476048511063095e-07, "loss": 0.2641, "step": 2329 }, { "epoch": 0.66, "grad_norm": 2.5021933751522076, "learning_rate": 5.467865411375765e-07, "loss": 0.282, "step": 2330 }, { "epoch": 0.66, "grad_norm": 2.32908541394442, "learning_rate": 5.459686129273432e-07, "loss": 0.2632, "step": 2331 }, { "epoch": 0.66, "grad_norm": 2.67185336758561, "learning_rate": 5.451510671645806e-07, "loss": 0.2974, "step": 2332 }, { "epoch": 0.66, "grad_norm": 2.3283227432999998, "learning_rate": 5.443339045379379e-07, "loss": 0.2605, "step": 2333 }, { "epoch": 0.66, "grad_norm": 2.419537722246254, "learning_rate": 5.435171257357416e-07, "loss": 0.2619, "step": 2334 }, { "epoch": 0.66, "grad_norm": 2.3691225216183653, "learning_rate": 5.427007314459948e-07, "loss": 0.2878, "step": 2335 }, { "epoch": 0.66, "grad_norm": 2.4319599179402496, "learning_rate": 5.418847223563761e-07, "loss": 0.2798, "step": 2336 }, { "epoch": 0.66, "grad_norm": 2.3038917195379143, "learning_rate": 5.410690991542407e-07, "loss": 0.2465, "step": 2337 }, { "epoch": 0.66, "grad_norm": 2.3787734383661636, "learning_rate": 5.402538625266183e-07, "loss": 0.2965, "step": 2338 }, { "epoch": 0.66, "grad_norm": 2.2196406147403054, "learning_rate": 5.394390131602132e-07, "loss": 0.2643, "step": 2339 }, { "epoch": 0.66, "grad_norm": 2.4182567726274042, "learning_rate": 5.386245517414026e-07, "loss": 0.245, "step": 2340 }, { "epoch": 0.66, "grad_norm": 2.343449748727217, "learning_rate": 5.378104789562373e-07, "loss": 0.2887, "step": 2341 }, { "epoch": 0.66, "grad_norm": 2.2836474205993706, "learning_rate": 5.36996795490442e-07, "loss": 0.2683, "step": 2342 }, { "epoch": 0.66, "grad_norm": 2.3647325537081225, "learning_rate": 5.361835020294122e-07, "loss": 0.2615, "step": 2343 }, { "epoch": 0.66, "grad_norm": 2.8029499896540897, "learning_rate": 5.353705992582146e-07, "loss": 0.2397, "step": 2344 }, { "epoch": 0.66, "grad_norm": 2.3598025459643845, "learning_rate": 5.345580878615877e-07, "loss": 0.2764, "step": 2345 }, { "epoch": 0.66, "grad_norm": 2.2772605562966164, "learning_rate": 5.337459685239394e-07, "loss": 0.2361, "step": 2346 }, { "epoch": 0.67, "grad_norm": 2.398278736692514, "learning_rate": 5.329342419293488e-07, "loss": 0.277, "step": 2347 }, { "epoch": 0.67, "grad_norm": 2.4037681130844675, "learning_rate": 5.321229087615634e-07, "loss": 0.2763, "step": 2348 }, { "epoch": 0.67, "grad_norm": 2.485457451422128, "learning_rate": 5.313119697039984e-07, "loss": 0.2885, "step": 2349 }, { "epoch": 0.67, "grad_norm": 2.3282359187966373, "learning_rate": 5.305014254397377e-07, "loss": 0.2649, "step": 2350 }, { "epoch": 0.67, "grad_norm": 2.38908486454763, "learning_rate": 5.296912766515338e-07, "loss": 0.2835, "step": 2351 }, { "epoch": 0.67, "grad_norm": 2.592192298182752, "learning_rate": 5.288815240218048e-07, "loss": 0.3013, "step": 2352 }, { "epoch": 0.67, "grad_norm": 2.35159951282308, "learning_rate": 5.280721682326348e-07, "loss": 0.2669, "step": 2353 }, { "epoch": 0.67, "grad_norm": 2.314369465048757, "learning_rate": 5.272632099657743e-07, "loss": 0.2702, "step": 2354 }, { "epoch": 0.67, "grad_norm": 2.5082008674395184, "learning_rate": 5.264546499026387e-07, "loss": 0.2712, "step": 2355 }, { "epoch": 0.67, "grad_norm": 2.1962925179788866, "learning_rate": 5.256464887243094e-07, "loss": 0.2556, "step": 2356 }, { "epoch": 0.67, "grad_norm": 2.243902743791842, "learning_rate": 5.248387271115291e-07, "loss": 0.2622, "step": 2357 }, { "epoch": 0.67, "grad_norm": 2.281956857693016, "learning_rate": 5.240313657447057e-07, "loss": 0.2766, "step": 2358 }, { "epoch": 0.67, "grad_norm": 2.419361403863879, "learning_rate": 5.232244053039099e-07, "loss": 0.2697, "step": 2359 }, { "epoch": 0.67, "grad_norm": 2.3917488210639513, "learning_rate": 5.224178464688741e-07, "loss": 0.2663, "step": 2360 }, { "epoch": 0.67, "grad_norm": 2.331041543408664, "learning_rate": 5.216116899189928e-07, "loss": 0.2658, "step": 2361 }, { "epoch": 0.67, "grad_norm": 2.821521842305285, "learning_rate": 5.208059363333217e-07, "loss": 0.296, "step": 2362 }, { "epoch": 0.67, "grad_norm": 2.868996764673546, "learning_rate": 5.200005863905767e-07, "loss": 0.2982, "step": 2363 }, { "epoch": 0.67, "grad_norm": 2.2425213278054152, "learning_rate": 5.191956407691343e-07, "loss": 0.2369, "step": 2364 }, { "epoch": 0.67, "grad_norm": 2.332277471029029, "learning_rate": 5.183911001470295e-07, "loss": 0.2435, "step": 2365 }, { "epoch": 0.67, "grad_norm": 2.493223595181339, "learning_rate": 5.17586965201957e-07, "loss": 0.268, "step": 2366 }, { "epoch": 0.67, "grad_norm": 2.4352432752453614, "learning_rate": 5.167832366112694e-07, "loss": 0.2768, "step": 2367 }, { "epoch": 0.67, "grad_norm": 2.461278109640111, "learning_rate": 5.159799150519772e-07, "loss": 0.3012, "step": 2368 }, { "epoch": 0.67, "grad_norm": 2.146603860090843, "learning_rate": 5.151770012007479e-07, "loss": 0.2744, "step": 2369 }, { "epoch": 0.67, "grad_norm": 2.2789999167061885, "learning_rate": 5.143744957339056e-07, "loss": 0.2775, "step": 2370 }, { "epoch": 0.67, "grad_norm": 2.1648708799970486, "learning_rate": 5.135723993274303e-07, "loss": 0.2581, "step": 2371 }, { "epoch": 0.67, "grad_norm": 2.40279231393275, "learning_rate": 5.127707126569576e-07, "loss": 0.2625, "step": 2372 }, { "epoch": 0.67, "grad_norm": 2.879779242816951, "learning_rate": 5.11969436397778e-07, "loss": 0.2719, "step": 2373 }, { "epoch": 0.67, "grad_norm": 2.4951831674750053, "learning_rate": 5.111685712248363e-07, "loss": 0.2983, "step": 2374 }, { "epoch": 0.67, "grad_norm": 2.3148481953963143, "learning_rate": 5.103681178127302e-07, "loss": 0.2609, "step": 2375 }, { "epoch": 0.67, "grad_norm": 2.4355470361980087, "learning_rate": 5.095680768357122e-07, "loss": 0.268, "step": 2376 }, { "epoch": 0.67, "grad_norm": 2.957762249807943, "learning_rate": 5.087684489676861e-07, "loss": 0.2723, "step": 2377 }, { "epoch": 0.67, "grad_norm": 2.3268375859011248, "learning_rate": 5.079692348822085e-07, "loss": 0.2763, "step": 2378 }, { "epoch": 0.67, "grad_norm": 3.131960016007764, "learning_rate": 5.071704352524862e-07, "loss": 0.2984, "step": 2379 }, { "epoch": 0.67, "grad_norm": 2.40458029742982, "learning_rate": 5.06372050751378e-07, "loss": 0.2836, "step": 2380 }, { "epoch": 0.67, "grad_norm": 2.7074747538153594, "learning_rate": 5.055740820513932e-07, "loss": 0.2832, "step": 2381 }, { "epoch": 0.67, "grad_norm": 2.2862593178591277, "learning_rate": 5.047765298246907e-07, "loss": 0.2952, "step": 2382 }, { "epoch": 0.68, "grad_norm": 2.3946399410902326, "learning_rate": 5.039793947430773e-07, "loss": 0.2616, "step": 2383 }, { "epoch": 0.68, "grad_norm": 2.3943577455752307, "learning_rate": 5.031826774780097e-07, "loss": 0.2822, "step": 2384 }, { "epoch": 0.68, "grad_norm": 2.3088941950755317, "learning_rate": 5.023863787005929e-07, "loss": 0.2454, "step": 2385 }, { "epoch": 0.68, "grad_norm": 2.28104166201968, "learning_rate": 5.015904990815792e-07, "loss": 0.2674, "step": 2386 }, { "epoch": 0.68, "grad_norm": 2.4601117984407956, "learning_rate": 5.007950392913662e-07, "loss": 0.3097, "step": 2387 }, { "epoch": 0.68, "grad_norm": 2.292624142786809, "learning_rate": 5.000000000000002e-07, "loss": 0.2736, "step": 2388 }, { "epoch": 0.68, "grad_norm": 2.4788897424360155, "learning_rate": 4.992053818771714e-07, "loss": 0.2613, "step": 2389 }, { "epoch": 0.68, "grad_norm": 2.359979008814565, "learning_rate": 4.984111855922176e-07, "loss": 0.2764, "step": 2390 }, { "epoch": 0.68, "grad_norm": 2.2760545050797067, "learning_rate": 4.976174118141185e-07, "loss": 0.2722, "step": 2391 }, { "epoch": 0.68, "grad_norm": 2.4792421447033077, "learning_rate": 4.968240612114995e-07, "loss": 0.2531, "step": 2392 }, { "epoch": 0.68, "grad_norm": 2.3183540813170143, "learning_rate": 4.960311344526292e-07, "loss": 0.2784, "step": 2393 }, { "epoch": 0.68, "grad_norm": 2.286214631829258, "learning_rate": 4.952386322054188e-07, "loss": 0.2646, "step": 2394 }, { "epoch": 0.68, "grad_norm": 2.403792277262232, "learning_rate": 4.944465551374238e-07, "loss": 0.2963, "step": 2395 }, { "epoch": 0.68, "grad_norm": 2.229517392197221, "learning_rate": 4.936549039158385e-07, "loss": 0.2491, "step": 2396 }, { "epoch": 0.68, "grad_norm": 2.344675961661458, "learning_rate": 4.928636792075007e-07, "loss": 0.2838, "step": 2397 }, { "epoch": 0.68, "grad_norm": 2.208281211784219, "learning_rate": 4.920728816788883e-07, "loss": 0.2643, "step": 2398 }, { "epoch": 0.68, "grad_norm": 2.292389280601437, "learning_rate": 4.912825119961194e-07, "loss": 0.2835, "step": 2399 }, { "epoch": 0.68, "grad_norm": 2.458712202984134, "learning_rate": 4.904925708249516e-07, "loss": 0.2845, "step": 2400 }, { "epoch": 0.68, "grad_norm": 2.3774224537052615, "learning_rate": 4.897030588307816e-07, "loss": 0.2813, "step": 2401 }, { "epoch": 0.68, "grad_norm": 2.297106016389134, "learning_rate": 4.889139766786447e-07, "loss": 0.2957, "step": 2402 }, { "epoch": 0.68, "grad_norm": 2.4943961815678883, "learning_rate": 4.881253250332141e-07, "loss": 0.2811, "step": 2403 }, { "epoch": 0.68, "grad_norm": 2.319088781552554, "learning_rate": 4.873371045588001e-07, "loss": 0.2814, "step": 2404 }, { "epoch": 0.68, "grad_norm": 2.352193260486395, "learning_rate": 4.865493159193504e-07, "loss": 0.2689, "step": 2405 }, { "epoch": 0.68, "grad_norm": 2.5413080645396913, "learning_rate": 4.857619597784482e-07, "loss": 0.3134, "step": 2406 }, { "epoch": 0.68, "grad_norm": 2.420124609671713, "learning_rate": 4.84975036799313e-07, "loss": 0.2668, "step": 2407 }, { "epoch": 0.68, "grad_norm": 2.8124452637866133, "learning_rate": 4.841885476447995e-07, "loss": 0.2866, "step": 2408 }, { "epoch": 0.68, "grad_norm": 2.258369093161929, "learning_rate": 4.834024929773956e-07, "loss": 0.2565, "step": 2409 }, { "epoch": 0.68, "grad_norm": 2.2445143082198666, "learning_rate": 4.826168734592253e-07, "loss": 0.2663, "step": 2410 }, { "epoch": 0.68, "grad_norm": 2.439178512532609, "learning_rate": 4.818316897520449e-07, "loss": 0.2866, "step": 2411 }, { "epoch": 0.68, "grad_norm": 2.4628414329677835, "learning_rate": 4.810469425172439e-07, "loss": 0.2673, "step": 2412 }, { "epoch": 0.68, "grad_norm": 2.2814847882421305, "learning_rate": 4.802626324158432e-07, "loss": 0.2663, "step": 2413 }, { "epoch": 0.68, "grad_norm": 2.436509033085469, "learning_rate": 4.794787601084965e-07, "loss": 0.2738, "step": 2414 }, { "epoch": 0.68, "grad_norm": 2.347829927437003, "learning_rate": 4.786953262554891e-07, "loss": 0.2818, "step": 2415 }, { "epoch": 0.68, "grad_norm": 2.2396757844804287, "learning_rate": 4.779123315167361e-07, "loss": 0.2528, "step": 2416 }, { "epoch": 0.68, "grad_norm": 2.3879035459539453, "learning_rate": 4.771297765517833e-07, "loss": 0.264, "step": 2417 }, { "epoch": 0.69, "grad_norm": 2.413721326849401, "learning_rate": 4.763476620198047e-07, "loss": 0.2489, "step": 2418 }, { "epoch": 0.69, "grad_norm": 2.353494934214268, "learning_rate": 4.755659885796054e-07, "loss": 0.2713, "step": 2419 }, { "epoch": 0.69, "grad_norm": 2.2907994697079594, "learning_rate": 4.747847568896177e-07, "loss": 0.2749, "step": 2420 }, { "epoch": 0.69, "grad_norm": 2.340449182606632, "learning_rate": 4.740039676079022e-07, "loss": 0.295, "step": 2421 }, { "epoch": 0.69, "grad_norm": 2.2745448209728005, "learning_rate": 4.73223621392146e-07, "loss": 0.2573, "step": 2422 }, { "epoch": 0.69, "grad_norm": 2.326430648936994, "learning_rate": 4.724437188996637e-07, "loss": 0.2724, "step": 2423 }, { "epoch": 0.69, "grad_norm": 2.574729677370753, "learning_rate": 4.716642607873967e-07, "loss": 0.3077, "step": 2424 }, { "epoch": 0.69, "grad_norm": 2.368767987272835, "learning_rate": 4.708852477119116e-07, "loss": 0.2912, "step": 2425 }, { "epoch": 0.69, "grad_norm": 2.327984474192384, "learning_rate": 4.7010668032939925e-07, "loss": 0.2689, "step": 2426 }, { "epoch": 0.69, "grad_norm": 2.3809594525501567, "learning_rate": 4.6932855929567606e-07, "loss": 0.2723, "step": 2427 }, { "epoch": 0.69, "grad_norm": 2.2477212537841136, "learning_rate": 4.6855088526618204e-07, "loss": 0.2677, "step": 2428 }, { "epoch": 0.69, "grad_norm": 2.8939219591101, "learning_rate": 4.6777365889598176e-07, "loss": 0.2546, "step": 2429 }, { "epoch": 0.69, "grad_norm": 2.3686432161421394, "learning_rate": 4.6699688083976085e-07, "loss": 0.2646, "step": 2430 }, { "epoch": 0.69, "grad_norm": 2.425030952188078, "learning_rate": 4.662205517518286e-07, "loss": 0.2732, "step": 2431 }, { "epoch": 0.69, "grad_norm": 2.50809725901002, "learning_rate": 4.6544467228611584e-07, "loss": 0.2584, "step": 2432 }, { "epoch": 0.69, "grad_norm": 2.479077560409588, "learning_rate": 4.646692430961744e-07, "loss": 0.2749, "step": 2433 }, { "epoch": 0.69, "grad_norm": 2.7200829790390317, "learning_rate": 4.6389426483517736e-07, "loss": 0.2805, "step": 2434 }, { "epoch": 0.69, "grad_norm": 2.3144164267512886, "learning_rate": 4.631197381559173e-07, "loss": 0.2975, "step": 2435 }, { "epoch": 0.69, "grad_norm": 2.459976750375668, "learning_rate": 4.6234566371080697e-07, "loss": 0.2956, "step": 2436 }, { "epoch": 0.69, "grad_norm": 2.4508229207793812, "learning_rate": 4.6157204215187795e-07, "loss": 0.2788, "step": 2437 }, { "epoch": 0.69, "grad_norm": 2.418521332143917, "learning_rate": 4.6079887413078034e-07, "loss": 0.2774, "step": 2438 }, { "epoch": 0.69, "grad_norm": 2.3565178114840957, "learning_rate": 4.6002616029878226e-07, "loss": 0.2461, "step": 2439 }, { "epoch": 0.69, "grad_norm": 2.318714557506511, "learning_rate": 4.5925390130676913e-07, "loss": 0.2673, "step": 2440 }, { "epoch": 0.69, "grad_norm": 2.570508945139977, "learning_rate": 4.584820978052434e-07, "loss": 0.3228, "step": 2441 }, { "epoch": 0.69, "grad_norm": 2.426739836009848, "learning_rate": 4.5771075044432385e-07, "loss": 0.2663, "step": 2442 }, { "epoch": 0.69, "grad_norm": 2.5163005062865373, "learning_rate": 4.5693985987374475e-07, "loss": 0.3013, "step": 2443 }, { "epoch": 0.69, "grad_norm": 2.397302513009581, "learning_rate": 4.5616942674285596e-07, "loss": 0.2689, "step": 2444 }, { "epoch": 0.69, "grad_norm": 2.384358665290794, "learning_rate": 4.553994517006219e-07, "loss": 0.274, "step": 2445 }, { "epoch": 0.69, "grad_norm": 2.3780130465359197, "learning_rate": 4.54629935395621e-07, "loss": 0.2703, "step": 2446 }, { "epoch": 0.69, "grad_norm": 2.41175660090204, "learning_rate": 4.5386087847604583e-07, "loss": 0.2761, "step": 2447 }, { "epoch": 0.69, "grad_norm": 2.2308940957310397, "learning_rate": 4.5309228158970027e-07, "loss": 0.2744, "step": 2448 }, { "epoch": 0.69, "grad_norm": 2.2622794723399196, "learning_rate": 4.523241453840033e-07, "loss": 0.2634, "step": 2449 }, { "epoch": 0.69, "grad_norm": 2.4892472914365382, "learning_rate": 4.51556470505984e-07, "loss": 0.251, "step": 2450 }, { "epoch": 0.69, "grad_norm": 2.5240215459027504, "learning_rate": 4.507892576022838e-07, "loss": 0.2355, "step": 2451 }, { "epoch": 0.69, "grad_norm": 2.206020087049321, "learning_rate": 4.500225073191539e-07, "loss": 0.2829, "step": 2452 }, { "epoch": 0.7, "grad_norm": 2.254017885293891, "learning_rate": 4.4925622030245645e-07, "loss": 0.2649, "step": 2453 }, { "epoch": 0.7, "grad_norm": 2.3498609298835365, "learning_rate": 4.484903971976641e-07, "loss": 0.2857, "step": 2454 }, { "epoch": 0.7, "grad_norm": 2.0971087145254033, "learning_rate": 4.4772503864985813e-07, "loss": 0.2547, "step": 2455 }, { "epoch": 0.7, "grad_norm": 2.321955789347521, "learning_rate": 4.469601453037276e-07, "loss": 0.2653, "step": 2456 }, { "epoch": 0.7, "grad_norm": 2.205208474190725, "learning_rate": 4.4619571780357046e-07, "loss": 0.2622, "step": 2457 }, { "epoch": 0.7, "grad_norm": 2.2349667352343308, "learning_rate": 4.4543175679329337e-07, "loss": 0.2675, "step": 2458 }, { "epoch": 0.7, "grad_norm": 2.49822879969086, "learning_rate": 4.4466826291640867e-07, "loss": 0.2621, "step": 2459 }, { "epoch": 0.7, "grad_norm": 2.338010522791837, "learning_rate": 4.439052368160351e-07, "loss": 0.2782, "step": 2460 }, { "epoch": 0.7, "grad_norm": 2.5150398161628766, "learning_rate": 4.43142679134898e-07, "loss": 0.2697, "step": 2461 }, { "epoch": 0.7, "grad_norm": 2.324998946630601, "learning_rate": 4.4238059051532774e-07, "loss": 0.2486, "step": 2462 }, { "epoch": 0.7, "grad_norm": 2.748258160226059, "learning_rate": 4.4161897159926044e-07, "loss": 0.2896, "step": 2463 }, { "epoch": 0.7, "grad_norm": 2.5017394373991007, "learning_rate": 4.4085782302823604e-07, "loss": 0.2904, "step": 2464 }, { "epoch": 0.7, "grad_norm": 2.378543367007354, "learning_rate": 4.400971454433975e-07, "loss": 0.2693, "step": 2465 }, { "epoch": 0.7, "grad_norm": 2.422935392437184, "learning_rate": 4.39336939485492e-07, "loss": 0.2735, "step": 2466 }, { "epoch": 0.7, "grad_norm": 2.3311674451261903, "learning_rate": 4.3857720579486887e-07, "loss": 0.2516, "step": 2467 }, { "epoch": 0.7, "grad_norm": 2.36415216404044, "learning_rate": 4.3781794501148105e-07, "loss": 0.2804, "step": 2468 }, { "epoch": 0.7, "grad_norm": 2.6607955171263975, "learning_rate": 4.3705915777488113e-07, "loss": 0.2872, "step": 2469 }, { "epoch": 0.7, "grad_norm": 2.4385599819583006, "learning_rate": 4.363008447242239e-07, "loss": 0.3045, "step": 2470 }, { "epoch": 0.7, "grad_norm": 2.398963114273932, "learning_rate": 4.355430064982646e-07, "loss": 0.2633, "step": 2471 }, { "epoch": 0.7, "grad_norm": 2.4263112035637553, "learning_rate": 4.3478564373535844e-07, "loss": 0.2872, "step": 2472 }, { "epoch": 0.7, "grad_norm": 2.2860756043193895, "learning_rate": 4.3402875707346033e-07, "loss": 0.2481, "step": 2473 }, { "epoch": 0.7, "grad_norm": 2.3064939908293614, "learning_rate": 4.3327234715012373e-07, "loss": 0.3014, "step": 2474 }, { "epoch": 0.7, "grad_norm": 2.3342857525505023, "learning_rate": 4.3251641460250086e-07, "loss": 0.2614, "step": 2475 }, { "epoch": 0.7, "grad_norm": 2.3856176203445134, "learning_rate": 4.3176096006734175e-07, "loss": 0.2783, "step": 2476 }, { "epoch": 0.7, "grad_norm": 2.3528075219115725, "learning_rate": 4.3100598418099377e-07, "loss": 0.2615, "step": 2477 }, { "epoch": 0.7, "grad_norm": 2.4978626002403077, "learning_rate": 4.30251487579401e-07, "loss": 0.2893, "step": 2478 }, { "epoch": 0.7, "grad_norm": 2.3772114464772973, "learning_rate": 4.2949747089810407e-07, "loss": 0.277, "step": 2479 }, { "epoch": 0.7, "grad_norm": 2.532502965255537, "learning_rate": 4.2874393477223913e-07, "loss": 0.283, "step": 2480 }, { "epoch": 0.7, "grad_norm": 2.433748637942874, "learning_rate": 4.279908798365378e-07, "loss": 0.2877, "step": 2481 }, { "epoch": 0.7, "grad_norm": 2.3727980561715913, "learning_rate": 4.272383067253253e-07, "loss": 0.2741, "step": 2482 }, { "epoch": 0.7, "grad_norm": 2.3850416527902296, "learning_rate": 4.264862160725229e-07, "loss": 0.2602, "step": 2483 }, { "epoch": 0.7, "grad_norm": 2.513270043584115, "learning_rate": 4.25734608511644e-07, "loss": 0.2897, "step": 2484 }, { "epoch": 0.7, "grad_norm": 2.350862190964246, "learning_rate": 4.2498348467579547e-07, "loss": 0.2748, "step": 2485 }, { "epoch": 0.7, "grad_norm": 2.197139592567514, "learning_rate": 4.2423284519767735e-07, "loss": 0.2445, "step": 2486 }, { "epoch": 0.7, "grad_norm": 2.3065943115838583, "learning_rate": 4.2348269070957977e-07, "loss": 0.2764, "step": 2487 }, { "epoch": 0.7, "grad_norm": 2.3619768477901895, "learning_rate": 4.22733021843387e-07, "loss": 0.3006, "step": 2488 }, { "epoch": 0.71, "grad_norm": 2.207189526645788, "learning_rate": 4.2198383923057224e-07, "loss": 0.273, "step": 2489 }, { "epoch": 0.71, "grad_norm": 2.358662330834118, "learning_rate": 4.212351435022005e-07, "loss": 0.291, "step": 2490 }, { "epoch": 0.71, "grad_norm": 2.721161217426416, "learning_rate": 4.2048693528892455e-07, "loss": 0.2514, "step": 2491 }, { "epoch": 0.71, "grad_norm": 2.41417573342347, "learning_rate": 4.197392152209892e-07, "loss": 0.2955, "step": 2492 }, { "epoch": 0.71, "grad_norm": 2.4500824207202223, "learning_rate": 4.189919839282264e-07, "loss": 0.2735, "step": 2493 }, { "epoch": 0.71, "grad_norm": 2.5618427216529427, "learning_rate": 4.1824524204005706e-07, "loss": 0.2856, "step": 2494 }, { "epoch": 0.71, "grad_norm": 2.380341795132665, "learning_rate": 4.1749899018548885e-07, "loss": 0.2561, "step": 2495 }, { "epoch": 0.71, "grad_norm": 2.144872498829846, "learning_rate": 4.1675322899311736e-07, "loss": 0.2487, "step": 2496 }, { "epoch": 0.71, "grad_norm": 2.2617982886035275, "learning_rate": 4.1600795909112564e-07, "loss": 0.2319, "step": 2497 }, { "epoch": 0.71, "grad_norm": 2.382061749693088, "learning_rate": 4.152631811072822e-07, "loss": 0.3004, "step": 2498 }, { "epoch": 0.71, "grad_norm": 2.4247661850403164, "learning_rate": 4.145188956689405e-07, "loss": 0.3104, "step": 2499 }, { "epoch": 0.71, "grad_norm": 2.3290005093252626, "learning_rate": 4.137751034030399e-07, "loss": 0.2591, "step": 2500 }, { "epoch": 0.71, "grad_norm": 2.276287989665381, "learning_rate": 4.130318049361039e-07, "loss": 0.2786, "step": 2501 }, { "epoch": 0.71, "grad_norm": 2.3270973591183304, "learning_rate": 4.1228900089424155e-07, "loss": 0.2857, "step": 2502 }, { "epoch": 0.71, "grad_norm": 2.456702277321644, "learning_rate": 4.1154669190314307e-07, "loss": 0.2732, "step": 2503 }, { "epoch": 0.71, "grad_norm": 2.306313481934988, "learning_rate": 4.1080487858808334e-07, "loss": 0.2913, "step": 2504 }, { "epoch": 0.71, "grad_norm": 2.1824814739733234, "learning_rate": 4.10063561573919e-07, "loss": 0.2578, "step": 2505 }, { "epoch": 0.71, "grad_norm": 2.3460665199937334, "learning_rate": 4.0932274148508863e-07, "loss": 0.2752, "step": 2506 }, { "epoch": 0.71, "grad_norm": 2.3729008838322247, "learning_rate": 4.085824189456135e-07, "loss": 0.2646, "step": 2507 }, { "epoch": 0.71, "grad_norm": 2.423768847342455, "learning_rate": 4.0784259457909363e-07, "loss": 0.2674, "step": 2508 }, { "epoch": 0.71, "grad_norm": 2.430893115112683, "learning_rate": 4.071032690087111e-07, "loss": 0.2574, "step": 2509 }, { "epoch": 0.71, "grad_norm": 2.4690843164366507, "learning_rate": 4.0636444285722684e-07, "loss": 0.2577, "step": 2510 }, { "epoch": 0.71, "grad_norm": 2.293998049058781, "learning_rate": 4.056261167469818e-07, "loss": 0.2649, "step": 2511 }, { "epoch": 0.71, "grad_norm": 2.295221692402047, "learning_rate": 4.048882912998953e-07, "loss": 0.2805, "step": 2512 }, { "epoch": 0.71, "grad_norm": 2.2638696374900342, "learning_rate": 4.0415096713746523e-07, "loss": 0.249, "step": 2513 }, { "epoch": 0.71, "grad_norm": 2.286203414947763, "learning_rate": 4.0341414488076697e-07, "loss": 0.2508, "step": 2514 }, { "epoch": 0.71, "grad_norm": 2.5829535270097934, "learning_rate": 4.026778251504532e-07, "loss": 0.291, "step": 2515 }, { "epoch": 0.71, "grad_norm": 2.5086684577605354, "learning_rate": 4.0194200856675333e-07, "loss": 0.2999, "step": 2516 }, { "epoch": 0.71, "grad_norm": 2.360608262561292, "learning_rate": 4.0120669574947297e-07, "loss": 0.2708, "step": 2517 }, { "epoch": 0.71, "grad_norm": 2.2937174901208697, "learning_rate": 4.0047188731799343e-07, "loss": 0.265, "step": 2518 }, { "epoch": 0.71, "grad_norm": 2.3099838395487176, "learning_rate": 3.99737583891271e-07, "loss": 0.2728, "step": 2519 }, { "epoch": 0.71, "grad_norm": 2.574811131070446, "learning_rate": 3.9900378608783703e-07, "loss": 0.2842, "step": 2520 }, { "epoch": 0.71, "grad_norm": 2.524084353539817, "learning_rate": 3.982704945257956e-07, "loss": 0.2706, "step": 2521 }, { "epoch": 0.71, "grad_norm": 2.5793952681688364, "learning_rate": 3.9753770982282654e-07, "loss": 0.265, "step": 2522 }, { "epoch": 0.71, "grad_norm": 2.175096353388101, "learning_rate": 3.9680543259618103e-07, "loss": 0.2393, "step": 2523 }, { "epoch": 0.72, "grad_norm": 2.2034119506106253, "learning_rate": 3.960736634626838e-07, "loss": 0.259, "step": 2524 }, { "epoch": 0.72, "grad_norm": 2.187282761189505, "learning_rate": 3.9534240303873e-07, "loss": 0.2573, "step": 2525 }, { "epoch": 0.72, "grad_norm": 2.2560105286323884, "learning_rate": 3.9461165194028854e-07, "loss": 0.2578, "step": 2526 }, { "epoch": 0.72, "grad_norm": 2.2917686683304423, "learning_rate": 3.9388141078289774e-07, "loss": 0.261, "step": 2527 }, { "epoch": 0.72, "grad_norm": 2.4807963920053657, "learning_rate": 3.9315168018166676e-07, "loss": 0.3061, "step": 2528 }, { "epoch": 0.72, "grad_norm": 2.36922156845111, "learning_rate": 3.924224607512753e-07, "loss": 0.2702, "step": 2529 }, { "epoch": 0.72, "grad_norm": 2.3190444175720692, "learning_rate": 3.9169375310597054e-07, "loss": 0.2649, "step": 2530 }, { "epoch": 0.72, "grad_norm": 2.5630193048920256, "learning_rate": 3.909655578595713e-07, "loss": 0.2565, "step": 2531 }, { "epoch": 0.72, "grad_norm": 2.3259427043076393, "learning_rate": 3.9023787562546284e-07, "loss": 0.2595, "step": 2532 }, { "epoch": 0.72, "grad_norm": 2.2907661152770395, "learning_rate": 3.895107070165995e-07, "loss": 0.2744, "step": 2533 }, { "epoch": 0.72, "grad_norm": 2.1958089968134478, "learning_rate": 3.887840526455014e-07, "loss": 0.2606, "step": 2534 }, { "epoch": 0.72, "grad_norm": 2.0772019948377327, "learning_rate": 3.880579131242566e-07, "loss": 0.2445, "step": 2535 }, { "epoch": 0.72, "grad_norm": 2.4373504411076095, "learning_rate": 3.873322890645201e-07, "loss": 0.2693, "step": 2536 }, { "epoch": 0.72, "grad_norm": 2.5830036698453145, "learning_rate": 3.8660718107751176e-07, "loss": 0.2844, "step": 2537 }, { "epoch": 0.72, "grad_norm": 2.484204909542249, "learning_rate": 3.8588258977401636e-07, "loss": 0.2637, "step": 2538 }, { "epoch": 0.72, "grad_norm": 2.3622160811014967, "learning_rate": 3.851585157643844e-07, "loss": 0.2927, "step": 2539 }, { "epoch": 0.72, "grad_norm": 2.3793320717187747, "learning_rate": 3.844349596585298e-07, "loss": 0.2663, "step": 2540 }, { "epoch": 0.72, "grad_norm": 2.361965899246749, "learning_rate": 3.8371192206593174e-07, "loss": 0.2719, "step": 2541 }, { "epoch": 0.72, "grad_norm": 2.384032621212719, "learning_rate": 3.8298940359563057e-07, "loss": 0.2671, "step": 2542 }, { "epoch": 0.72, "grad_norm": 2.6938610355413224, "learning_rate": 3.822674048562309e-07, "loss": 0.2581, "step": 2543 }, { "epoch": 0.72, "grad_norm": 2.3434426912533093, "learning_rate": 3.8154592645589877e-07, "loss": 0.2656, "step": 2544 }, { "epoch": 0.72, "grad_norm": 2.465646994145014, "learning_rate": 3.808249690023624e-07, "loss": 0.2686, "step": 2545 }, { "epoch": 0.72, "grad_norm": 2.3528660319084183, "learning_rate": 3.801045331029108e-07, "loss": 0.2803, "step": 2546 }, { "epoch": 0.72, "grad_norm": 2.3126792038268906, "learning_rate": 3.79384619364394e-07, "loss": 0.2662, "step": 2547 }, { "epoch": 0.72, "grad_norm": 2.381341397933177, "learning_rate": 3.78665228393222e-07, "loss": 0.2654, "step": 2548 }, { "epoch": 0.72, "grad_norm": 2.427083572259114, "learning_rate": 3.7794636079536436e-07, "loss": 0.2747, "step": 2549 }, { "epoch": 0.72, "grad_norm": 2.5762959209498875, "learning_rate": 3.772280171763501e-07, "loss": 0.2803, "step": 2550 }, { "epoch": 0.72, "grad_norm": 2.445020108164999, "learning_rate": 3.765101981412665e-07, "loss": 0.2679, "step": 2551 }, { "epoch": 0.72, "grad_norm": 2.3630588946128537, "learning_rate": 3.757929042947593e-07, "loss": 0.2836, "step": 2552 }, { "epoch": 0.72, "grad_norm": 2.3140210795068104, "learning_rate": 3.7507613624103165e-07, "loss": 0.2908, "step": 2553 }, { "epoch": 0.72, "grad_norm": 2.4066462075449193, "learning_rate": 3.743598945838438e-07, "loss": 0.3071, "step": 2554 }, { "epoch": 0.72, "grad_norm": 2.4619938252669757, "learning_rate": 3.7364417992651266e-07, "loss": 0.2352, "step": 2555 }, { "epoch": 0.72, "grad_norm": 2.3731347309841744, "learning_rate": 3.7292899287191125e-07, "loss": 0.2533, "step": 2556 }, { "epoch": 0.72, "grad_norm": 2.3363844072722357, "learning_rate": 3.7221433402246815e-07, "loss": 0.2865, "step": 2557 }, { "epoch": 0.72, "grad_norm": 2.5048502027566593, "learning_rate": 3.715002039801671e-07, "loss": 0.279, "step": 2558 }, { "epoch": 0.73, "grad_norm": 2.5234443399823583, "learning_rate": 3.707866033465461e-07, "loss": 0.2743, "step": 2559 }, { "epoch": 0.73, "grad_norm": 2.461815368653574, "learning_rate": 3.700735327226976e-07, "loss": 0.2504, "step": 2560 }, { "epoch": 0.73, "grad_norm": 2.4228042349715087, "learning_rate": 3.6936099270926734e-07, "loss": 0.2828, "step": 2561 }, { "epoch": 0.73, "grad_norm": 2.420179888833707, "learning_rate": 3.686489839064543e-07, "loss": 0.2846, "step": 2562 }, { "epoch": 0.73, "grad_norm": 2.4906239765975884, "learning_rate": 3.679375069140099e-07, "loss": 0.309, "step": 2563 }, { "epoch": 0.73, "grad_norm": 2.289665637068872, "learning_rate": 3.6722656233123706e-07, "loss": 0.2536, "step": 2564 }, { "epoch": 0.73, "grad_norm": 2.3349740705759654, "learning_rate": 3.6651615075699137e-07, "loss": 0.2808, "step": 2565 }, { "epoch": 0.73, "grad_norm": 2.4114347304296344, "learning_rate": 3.658062727896788e-07, "loss": 0.2927, "step": 2566 }, { "epoch": 0.73, "grad_norm": 2.5596060525617372, "learning_rate": 3.6509692902725597e-07, "loss": 0.2798, "step": 2567 }, { "epoch": 0.73, "grad_norm": 2.402246854525871, "learning_rate": 3.6438812006722885e-07, "loss": 0.3023, "step": 2568 }, { "epoch": 0.73, "grad_norm": 2.369821125049631, "learning_rate": 3.636798465066536e-07, "loss": 0.2748, "step": 2569 }, { "epoch": 0.73, "grad_norm": 2.2644575106815457, "learning_rate": 3.629721089421359e-07, "loss": 0.2624, "step": 2570 }, { "epoch": 0.73, "grad_norm": 2.2918717960060118, "learning_rate": 3.6226490796982925e-07, "loss": 0.2728, "step": 2571 }, { "epoch": 0.73, "grad_norm": 2.1864303860737917, "learning_rate": 3.615582441854348e-07, "loss": 0.2352, "step": 2572 }, { "epoch": 0.73, "grad_norm": 2.4095803133808533, "learning_rate": 3.6085211818420167e-07, "loss": 0.3267, "step": 2573 }, { "epoch": 0.73, "grad_norm": 2.1223371485411473, "learning_rate": 3.6014653056092593e-07, "loss": 0.2633, "step": 2574 }, { "epoch": 0.73, "grad_norm": 2.207700617626701, "learning_rate": 3.5944148190995073e-07, "loss": 0.2399, "step": 2575 }, { "epoch": 0.73, "grad_norm": 2.1559620057419866, "learning_rate": 3.587369728251647e-07, "loss": 0.2567, "step": 2576 }, { "epoch": 0.73, "grad_norm": 2.403221431935094, "learning_rate": 3.5803300390000133e-07, "loss": 0.246, "step": 2577 }, { "epoch": 0.73, "grad_norm": 2.8409300056102214, "learning_rate": 3.5732957572744e-07, "loss": 0.2835, "step": 2578 }, { "epoch": 0.73, "grad_norm": 2.4152625007808504, "learning_rate": 3.5662668890000415e-07, "loss": 0.3176, "step": 2579 }, { "epoch": 0.73, "grad_norm": 2.307449285104854, "learning_rate": 3.559243440097622e-07, "loss": 0.237, "step": 2580 }, { "epoch": 0.73, "grad_norm": 4.0720001522452485, "learning_rate": 3.5522254164832456e-07, "loss": 0.3037, "step": 2581 }, { "epoch": 0.73, "grad_norm": 4.432606930026203, "learning_rate": 3.5452128240684556e-07, "loss": 0.2782, "step": 2582 }, { "epoch": 0.73, "grad_norm": 2.2833098442405837, "learning_rate": 3.538205668760218e-07, "loss": 0.2605, "step": 2583 }, { "epoch": 0.73, "grad_norm": 2.239400351839413, "learning_rate": 3.53120395646092e-07, "loss": 0.2636, "step": 2584 }, { "epoch": 0.73, "grad_norm": 2.330486004814981, "learning_rate": 3.524207693068364e-07, "loss": 0.284, "step": 2585 }, { "epoch": 0.73, "grad_norm": 2.5464804766981235, "learning_rate": 3.517216884475762e-07, "loss": 0.2748, "step": 2586 }, { "epoch": 0.73, "grad_norm": 2.1444559881500664, "learning_rate": 3.5102315365717303e-07, "loss": 0.2737, "step": 2587 }, { "epoch": 0.73, "grad_norm": 2.167108947304041, "learning_rate": 3.503251655240288e-07, "loss": 0.2714, "step": 2588 }, { "epoch": 0.73, "grad_norm": 2.502123863730012, "learning_rate": 3.4962772463608457e-07, "loss": 0.258, "step": 2589 }, { "epoch": 0.73, "grad_norm": 2.518883768069017, "learning_rate": 3.489308315808209e-07, "loss": 0.2844, "step": 2590 }, { "epoch": 0.73, "grad_norm": 2.340826659734753, "learning_rate": 3.482344869452565e-07, "loss": 0.2684, "step": 2591 }, { "epoch": 0.73, "grad_norm": 2.3794594889726643, "learning_rate": 3.475386913159483e-07, "loss": 0.2825, "step": 2592 }, { "epoch": 0.73, "grad_norm": 2.3529290513987755, "learning_rate": 3.468434452789911e-07, "loss": 0.2599, "step": 2593 }, { "epoch": 0.74, "grad_norm": 2.4524978022353587, "learning_rate": 3.461487494200154e-07, "loss": 0.2631, "step": 2594 }, { "epoch": 0.74, "grad_norm": 2.288189711546269, "learning_rate": 3.4545460432419036e-07, "loss": 0.2626, "step": 2595 }, { "epoch": 0.74, "grad_norm": 2.4783830699080887, "learning_rate": 3.4476101057621966e-07, "loss": 0.3027, "step": 2596 }, { "epoch": 0.74, "grad_norm": 2.3321205098430235, "learning_rate": 3.4406796876034317e-07, "loss": 0.2448, "step": 2597 }, { "epoch": 0.74, "grad_norm": 2.334725211239991, "learning_rate": 3.433754794603355e-07, "loss": 0.2855, "step": 2598 }, { "epoch": 0.74, "grad_norm": 2.259009020940633, "learning_rate": 3.426835432595063e-07, "loss": 0.2452, "step": 2599 }, { "epoch": 0.74, "grad_norm": 2.504960317513778, "learning_rate": 3.4199216074069903e-07, "loss": 0.271, "step": 2600 }, { "epoch": 0.74, "grad_norm": 2.4533948927974816, "learning_rate": 3.4130133248629065e-07, "loss": 0.2929, "step": 2601 }, { "epoch": 0.74, "grad_norm": 2.1717537961063798, "learning_rate": 3.40611059078192e-07, "loss": 0.2325, "step": 2602 }, { "epoch": 0.74, "grad_norm": 2.371460083952491, "learning_rate": 3.399213410978446e-07, "loss": 0.2476, "step": 2603 }, { "epoch": 0.74, "grad_norm": 2.7576954694182683, "learning_rate": 3.392321791262249e-07, "loss": 0.2599, "step": 2604 }, { "epoch": 0.74, "grad_norm": 2.3982745069910436, "learning_rate": 3.3854357374383903e-07, "loss": 0.2699, "step": 2605 }, { "epoch": 0.74, "grad_norm": 2.2094367962380894, "learning_rate": 3.3785552553072517e-07, "loss": 0.2328, "step": 2606 }, { "epoch": 0.74, "grad_norm": 2.304369436989134, "learning_rate": 3.371680350664512e-07, "loss": 0.2697, "step": 2607 }, { "epoch": 0.74, "grad_norm": 2.3650813356848883, "learning_rate": 3.364811029301159e-07, "loss": 0.2831, "step": 2608 }, { "epoch": 0.74, "grad_norm": 2.2515919828201922, "learning_rate": 3.3579472970034814e-07, "loss": 0.2401, "step": 2609 }, { "epoch": 0.74, "grad_norm": 2.2827874062481417, "learning_rate": 3.3510891595530564e-07, "loss": 0.2491, "step": 2610 }, { "epoch": 0.74, "grad_norm": 2.1908715491993274, "learning_rate": 3.3442366227267425e-07, "loss": 0.2541, "step": 2611 }, { "epoch": 0.74, "grad_norm": 2.3717075447613394, "learning_rate": 3.337389692296686e-07, "loss": 0.2761, "step": 2612 }, { "epoch": 0.74, "grad_norm": 2.2872731055219195, "learning_rate": 3.330548374030309e-07, "loss": 0.2577, "step": 2613 }, { "epoch": 0.74, "grad_norm": 2.3441712807359547, "learning_rate": 3.3237126736903166e-07, "loss": 0.2887, "step": 2614 }, { "epoch": 0.74, "grad_norm": 2.490986134566394, "learning_rate": 3.316882597034663e-07, "loss": 0.2656, "step": 2615 }, { "epoch": 0.74, "grad_norm": 2.2382706884208705, "learning_rate": 3.3100581498165783e-07, "loss": 0.2658, "step": 2616 }, { "epoch": 0.74, "grad_norm": 2.324163839691007, "learning_rate": 3.303239337784547e-07, "loss": 0.2477, "step": 2617 }, { "epoch": 0.74, "grad_norm": 2.2028924069980684, "learning_rate": 3.296426166682303e-07, "loss": 0.2943, "step": 2618 }, { "epoch": 0.74, "grad_norm": 2.7641272306318703, "learning_rate": 3.289618642248846e-07, "loss": 0.2767, "step": 2619 }, { "epoch": 0.74, "grad_norm": 2.194964291798905, "learning_rate": 3.282816770218394e-07, "loss": 0.2871, "step": 2620 }, { "epoch": 0.74, "grad_norm": 2.357021323177236, "learning_rate": 3.276020556320419e-07, "loss": 0.2768, "step": 2621 }, { "epoch": 0.74, "grad_norm": 2.654814264416529, "learning_rate": 3.2692300062796254e-07, "loss": 0.3321, "step": 2622 }, { "epoch": 0.74, "grad_norm": 2.29602703091403, "learning_rate": 3.2624451258159447e-07, "loss": 0.2763, "step": 2623 }, { "epoch": 0.74, "grad_norm": 2.4739428126752783, "learning_rate": 3.2556659206445327e-07, "loss": 0.2721, "step": 2624 }, { "epoch": 0.74, "grad_norm": 2.725494800009238, "learning_rate": 3.248892396475765e-07, "loss": 0.2962, "step": 2625 }, { "epoch": 0.74, "grad_norm": 2.2994857273251244, "learning_rate": 3.2421245590152335e-07, "loss": 0.2613, "step": 2626 }, { "epoch": 0.74, "grad_norm": 2.265001633392082, "learning_rate": 3.235362413963738e-07, "loss": 0.2489, "step": 2627 }, { "epoch": 0.74, "grad_norm": 2.565517431965826, "learning_rate": 3.228605967017284e-07, "loss": 0.2866, "step": 2628 }, { "epoch": 0.74, "grad_norm": 3.032018556302911, "learning_rate": 3.221855223867076e-07, "loss": 0.2603, "step": 2629 }, { "epoch": 0.75, "grad_norm": 2.36995929708129, "learning_rate": 3.215110190199518e-07, "loss": 0.275, "step": 2630 }, { "epoch": 0.75, "grad_norm": 2.2582387013579166, "learning_rate": 3.2083708716961986e-07, "loss": 0.2719, "step": 2631 }, { "epoch": 0.75, "grad_norm": 2.6561695353472348, "learning_rate": 3.201637274033899e-07, "loss": 0.2949, "step": 2632 }, { "epoch": 0.75, "grad_norm": 2.1972133812785444, "learning_rate": 3.194909402884576e-07, "loss": 0.262, "step": 2633 }, { "epoch": 0.75, "grad_norm": 2.4304389915393703, "learning_rate": 3.188187263915365e-07, "loss": 0.2677, "step": 2634 }, { "epoch": 0.75, "grad_norm": 2.374947867030814, "learning_rate": 3.181470862788573e-07, "loss": 0.2901, "step": 2635 }, { "epoch": 0.75, "grad_norm": 2.3264780260706974, "learning_rate": 3.174760205161678e-07, "loss": 0.2657, "step": 2636 }, { "epoch": 0.75, "grad_norm": 2.418903400270687, "learning_rate": 3.168055296687305e-07, "loss": 0.282, "step": 2637 }, { "epoch": 0.75, "grad_norm": 2.25516044788765, "learning_rate": 3.161356143013257e-07, "loss": 0.2631, "step": 2638 }, { "epoch": 0.75, "grad_norm": 2.3131577049947527, "learning_rate": 3.154662749782476e-07, "loss": 0.2647, "step": 2639 }, { "epoch": 0.75, "grad_norm": 2.3814425730813724, "learning_rate": 3.1479751226330566e-07, "loss": 0.2769, "step": 2640 }, { "epoch": 0.75, "grad_norm": 2.5373863660884393, "learning_rate": 3.141293267198236e-07, "loss": 0.2953, "step": 2641 }, { "epoch": 0.75, "grad_norm": 2.4174812864003847, "learning_rate": 3.13461718910638e-07, "loss": 0.2943, "step": 2642 }, { "epoch": 0.75, "grad_norm": 2.3206104540840973, "learning_rate": 3.127946893981008e-07, "loss": 0.2671, "step": 2643 }, { "epoch": 0.75, "grad_norm": 3.5344070466388873, "learning_rate": 3.1212823874407513e-07, "loss": 0.3006, "step": 2644 }, { "epoch": 0.75, "grad_norm": 2.344494797925107, "learning_rate": 3.1146236750993757e-07, "loss": 0.2756, "step": 2645 }, { "epoch": 0.75, "grad_norm": 2.250948017574977, "learning_rate": 3.107970762565755e-07, "loss": 0.2626, "step": 2646 }, { "epoch": 0.75, "grad_norm": 2.342448369121599, "learning_rate": 3.1013236554438817e-07, "loss": 0.2823, "step": 2647 }, { "epoch": 0.75, "grad_norm": 2.4801146316539855, "learning_rate": 3.094682359332871e-07, "loss": 0.2701, "step": 2648 }, { "epoch": 0.75, "grad_norm": 2.3519971171584273, "learning_rate": 3.0880468798269286e-07, "loss": 0.245, "step": 2649 }, { "epoch": 0.75, "grad_norm": 2.3023529595295553, "learning_rate": 3.0814172225153623e-07, "loss": 0.2862, "step": 2650 }, { "epoch": 0.75, "grad_norm": 2.479935156267912, "learning_rate": 3.0747933929825786e-07, "loss": 0.2595, "step": 2651 }, { "epoch": 0.75, "grad_norm": 2.2767570811714357, "learning_rate": 3.0681753968080735e-07, "loss": 0.2454, "step": 2652 }, { "epoch": 0.75, "grad_norm": 2.405319864949606, "learning_rate": 3.061563239566439e-07, "loss": 0.2668, "step": 2653 }, { "epoch": 0.75, "grad_norm": 2.3523938041145374, "learning_rate": 3.0549569268273314e-07, "loss": 0.271, "step": 2654 }, { "epoch": 0.75, "grad_norm": 2.2832859994293195, "learning_rate": 3.048356464155495e-07, "loss": 0.2562, "step": 2655 }, { "epoch": 0.75, "grad_norm": 2.1814233292526097, "learning_rate": 3.041761857110744e-07, "loss": 0.2458, "step": 2656 }, { "epoch": 0.75, "grad_norm": 2.3524324132974135, "learning_rate": 3.0351731112479627e-07, "loss": 0.2521, "step": 2657 }, { "epoch": 0.75, "grad_norm": 2.4893308737060544, "learning_rate": 3.0285902321170943e-07, "loss": 0.2848, "step": 2658 }, { "epoch": 0.75, "grad_norm": 2.58810016842196, "learning_rate": 3.0220132252631416e-07, "loss": 0.3069, "step": 2659 }, { "epoch": 0.75, "grad_norm": 2.684462905942372, "learning_rate": 3.015442096226163e-07, "loss": 0.2914, "step": 2660 }, { "epoch": 0.75, "grad_norm": 3.5481577407259155, "learning_rate": 3.008876850541262e-07, "loss": 0.2866, "step": 2661 }, { "epoch": 0.75, "grad_norm": 2.4742375445466247, "learning_rate": 3.00231749373859e-07, "loss": 0.2657, "step": 2662 }, { "epoch": 0.75, "grad_norm": 2.418562491235517, "learning_rate": 2.995764031343336e-07, "loss": 0.269, "step": 2663 }, { "epoch": 0.75, "grad_norm": 2.238527067720981, "learning_rate": 2.989216468875725e-07, "loss": 0.2661, "step": 2664 }, { "epoch": 0.76, "grad_norm": 2.3820690839491427, "learning_rate": 2.9826748118510106e-07, "loss": 0.2402, "step": 2665 }, { "epoch": 0.76, "grad_norm": 2.5059711601939862, "learning_rate": 2.9761390657794727e-07, "loss": 0.2816, "step": 2666 }, { "epoch": 0.76, "grad_norm": 2.336363737866262, "learning_rate": 2.9696092361664125e-07, "loss": 0.2732, "step": 2667 }, { "epoch": 0.76, "grad_norm": 2.2351641737827395, "learning_rate": 2.96308532851215e-07, "loss": 0.2638, "step": 2668 }, { "epoch": 0.76, "grad_norm": 2.455015050921343, "learning_rate": 2.956567348312012e-07, "loss": 0.2741, "step": 2669 }, { "epoch": 0.76, "grad_norm": 2.4782570931707997, "learning_rate": 2.9500553010563356e-07, "loss": 0.2617, "step": 2670 }, { "epoch": 0.76, "grad_norm": 2.412270563323574, "learning_rate": 2.94354919223046e-07, "loss": 0.2446, "step": 2671 }, { "epoch": 0.76, "grad_norm": 2.208344037910064, "learning_rate": 2.9370490273147217e-07, "loss": 0.2516, "step": 2672 }, { "epoch": 0.76, "grad_norm": 2.651043462680491, "learning_rate": 2.9305548117844504e-07, "loss": 0.2722, "step": 2673 }, { "epoch": 0.76, "grad_norm": 2.4735510597437114, "learning_rate": 2.9240665511099636e-07, "loss": 0.2675, "step": 2674 }, { "epoch": 0.76, "grad_norm": 2.258433909290774, "learning_rate": 2.9175842507565695e-07, "loss": 0.2557, "step": 2675 }, { "epoch": 0.76, "grad_norm": 2.408322811827988, "learning_rate": 2.911107916184539e-07, "loss": 0.2982, "step": 2676 }, { "epoch": 0.76, "grad_norm": 2.3516191496170067, "learning_rate": 2.9046375528491376e-07, "loss": 0.2785, "step": 2677 }, { "epoch": 0.76, "grad_norm": 2.2900556855731247, "learning_rate": 2.89817316620059e-07, "loss": 0.2639, "step": 2678 }, { "epoch": 0.76, "grad_norm": 3.0394892227608734, "learning_rate": 2.891714761684093e-07, "loss": 0.272, "step": 2679 }, { "epoch": 0.76, "grad_norm": 2.2794822571116966, "learning_rate": 2.8852623447397915e-07, "loss": 0.248, "step": 2680 }, { "epoch": 0.76, "grad_norm": 2.313592477288886, "learning_rate": 2.8788159208027973e-07, "loss": 0.2649, "step": 2681 }, { "epoch": 0.76, "grad_norm": 2.537623352307677, "learning_rate": 2.8723754953031777e-07, "loss": 0.2874, "step": 2682 }, { "epoch": 0.76, "grad_norm": 2.492465481071438, "learning_rate": 2.8659410736659416e-07, "loss": 0.267, "step": 2683 }, { "epoch": 0.76, "grad_norm": 2.5525190678003082, "learning_rate": 2.8595126613110363e-07, "loss": 0.2814, "step": 2684 }, { "epoch": 0.76, "grad_norm": 2.6229646083995553, "learning_rate": 2.853090263653354e-07, "loss": 0.3226, "step": 2685 }, { "epoch": 0.76, "grad_norm": 2.2844022457053117, "learning_rate": 2.846673886102714e-07, "loss": 0.2898, "step": 2686 }, { "epoch": 0.76, "grad_norm": 2.3647776025257463, "learning_rate": 2.840263534063877e-07, "loss": 0.263, "step": 2687 }, { "epoch": 0.76, "grad_norm": 2.2855963169893183, "learning_rate": 2.833859212936519e-07, "loss": 0.272, "step": 2688 }, { "epoch": 0.76, "grad_norm": 2.765864320378852, "learning_rate": 2.827460928115232e-07, "loss": 0.2951, "step": 2689 }, { "epoch": 0.76, "grad_norm": 2.7069540638044005, "learning_rate": 2.8210686849895307e-07, "loss": 0.3033, "step": 2690 }, { "epoch": 0.76, "grad_norm": 2.353518345754197, "learning_rate": 2.8146824889438356e-07, "loss": 0.3012, "step": 2691 }, { "epoch": 0.76, "grad_norm": 2.2556279108688435, "learning_rate": 2.808302345357486e-07, "loss": 0.2602, "step": 2692 }, { "epoch": 0.76, "grad_norm": 2.515343160953233, "learning_rate": 2.8019282596047046e-07, "loss": 0.2657, "step": 2693 }, { "epoch": 0.76, "grad_norm": 2.5698374203328034, "learning_rate": 2.7955602370546227e-07, "loss": 0.2823, "step": 2694 }, { "epoch": 0.76, "grad_norm": 2.372728140279168, "learning_rate": 2.789198283071261e-07, "loss": 0.2836, "step": 2695 }, { "epoch": 0.76, "grad_norm": 2.528942553797293, "learning_rate": 2.78284240301353e-07, "loss": 0.2802, "step": 2696 }, { "epoch": 0.76, "grad_norm": 2.3999481678317376, "learning_rate": 2.776492602235223e-07, "loss": 0.302, "step": 2697 }, { "epoch": 0.76, "grad_norm": 2.334677905881842, "learning_rate": 2.770148886085013e-07, "loss": 0.259, "step": 2698 }, { "epoch": 0.76, "grad_norm": 2.1839619355909523, "learning_rate": 2.763811259906447e-07, "loss": 0.268, "step": 2699 }, { "epoch": 0.77, "grad_norm": 2.266845888284121, "learning_rate": 2.7574797290379413e-07, "loss": 0.232, "step": 2700 }, { "epoch": 0.77, "grad_norm": 2.480842434672786, "learning_rate": 2.751154298812781e-07, "loss": 0.2678, "step": 2701 }, { "epoch": 0.77, "grad_norm": 2.2677658357528414, "learning_rate": 2.74483497455911e-07, "loss": 0.2665, "step": 2702 }, { "epoch": 0.77, "grad_norm": 2.4011260292496845, "learning_rate": 2.73852176159993e-07, "loss": 0.2534, "step": 2703 }, { "epoch": 0.77, "grad_norm": 2.253785637191656, "learning_rate": 2.732214665253092e-07, "loss": 0.2379, "step": 2704 }, { "epoch": 0.77, "grad_norm": 2.602202388621679, "learning_rate": 2.7259136908312995e-07, "loss": 0.3228, "step": 2705 }, { "epoch": 0.77, "grad_norm": 2.3452687489865895, "learning_rate": 2.719618843642095e-07, "loss": 0.2966, "step": 2706 }, { "epoch": 0.77, "grad_norm": 2.184038611317654, "learning_rate": 2.713330128987864e-07, "loss": 0.2524, "step": 2707 }, { "epoch": 0.77, "grad_norm": 2.3034671990517026, "learning_rate": 2.707047552165822e-07, "loss": 0.2575, "step": 2708 }, { "epoch": 0.77, "grad_norm": 2.405072030453159, "learning_rate": 2.700771118468017e-07, "loss": 0.2868, "step": 2709 }, { "epoch": 0.77, "grad_norm": 2.252834431948804, "learning_rate": 2.6945008331813224e-07, "loss": 0.2549, "step": 2710 }, { "epoch": 0.77, "grad_norm": 2.6519575726519546, "learning_rate": 2.688236701587431e-07, "loss": 0.3061, "step": 2711 }, { "epoch": 0.77, "grad_norm": 2.3694153979510038, "learning_rate": 2.6819787289628526e-07, "loss": 0.2777, "step": 2712 }, { "epoch": 0.77, "grad_norm": 2.5804112331460813, "learning_rate": 2.6757269205789113e-07, "loss": 0.2905, "step": 2713 }, { "epoch": 0.77, "grad_norm": 2.3565782140319422, "learning_rate": 2.6694812817017387e-07, "loss": 0.267, "step": 2714 }, { "epoch": 0.77, "grad_norm": 2.3170272128324116, "learning_rate": 2.663241817592261e-07, "loss": 0.2478, "step": 2715 }, { "epoch": 0.77, "grad_norm": 2.3698915573154906, "learning_rate": 2.6570085335062164e-07, "loss": 0.2489, "step": 2716 }, { "epoch": 0.77, "grad_norm": 2.2915645771544253, "learning_rate": 2.6507814346941293e-07, "loss": 0.2934, "step": 2717 }, { "epoch": 0.77, "grad_norm": 2.268037849281149, "learning_rate": 2.64456052640132e-07, "loss": 0.2947, "step": 2718 }, { "epoch": 0.77, "grad_norm": 2.402245114365464, "learning_rate": 2.6383458138678827e-07, "loss": 0.2441, "step": 2719 }, { "epoch": 0.77, "grad_norm": 2.433258683555696, "learning_rate": 2.6321373023287007e-07, "loss": 0.2361, "step": 2720 }, { "epoch": 0.77, "grad_norm": 2.638457709606131, "learning_rate": 2.6259349970134403e-07, "loss": 0.2723, "step": 2721 }, { "epoch": 0.77, "grad_norm": 2.1506166647148444, "learning_rate": 2.6197389031465324e-07, "loss": 0.2288, "step": 2722 }, { "epoch": 0.77, "grad_norm": 2.2800718518628473, "learning_rate": 2.613549025947169e-07, "loss": 0.2345, "step": 2723 }, { "epoch": 0.77, "grad_norm": 2.2215835594373874, "learning_rate": 2.60736537062932e-07, "loss": 0.245, "step": 2724 }, { "epoch": 0.77, "grad_norm": 2.4457713208187712, "learning_rate": 2.6011879424017005e-07, "loss": 0.3009, "step": 2725 }, { "epoch": 0.77, "grad_norm": 2.3109213740501753, "learning_rate": 2.5950167464677985e-07, "loss": 0.2648, "step": 2726 }, { "epoch": 0.77, "grad_norm": 2.367702983358573, "learning_rate": 2.588851788025832e-07, "loss": 0.2656, "step": 2727 }, { "epoch": 0.77, "grad_norm": 2.267989855733225, "learning_rate": 2.582693072268778e-07, "loss": 0.2742, "step": 2728 }, { "epoch": 0.77, "grad_norm": 2.3799453877005488, "learning_rate": 2.5765406043843483e-07, "loss": 0.2879, "step": 2729 }, { "epoch": 0.77, "grad_norm": 2.4183363741864863, "learning_rate": 2.5703943895549975e-07, "loss": 0.2452, "step": 2730 }, { "epoch": 0.77, "grad_norm": 2.352119842346004, "learning_rate": 2.5642544329579085e-07, "loss": 0.2769, "step": 2731 }, { "epoch": 0.77, "grad_norm": 2.3630801759639217, "learning_rate": 2.558120739764995e-07, "loss": 0.2806, "step": 2732 }, { "epoch": 0.77, "grad_norm": 2.511379597161267, "learning_rate": 2.551993315142894e-07, "loss": 0.2976, "step": 2733 }, { "epoch": 0.77, "grad_norm": 2.1875569102599113, "learning_rate": 2.5458721642529637e-07, "loss": 0.2228, "step": 2734 }, { "epoch": 0.77, "grad_norm": 2.3201124027880544, "learning_rate": 2.5397572922512735e-07, "loss": 0.2644, "step": 2735 }, { "epoch": 0.78, "grad_norm": 2.5622681136534515, "learning_rate": 2.53364870428861e-07, "loss": 0.2763, "step": 2736 }, { "epoch": 0.78, "grad_norm": 2.650004763755733, "learning_rate": 2.527546405510461e-07, "loss": 0.2547, "step": 2737 }, { "epoch": 0.78, "grad_norm": 2.814911905875072, "learning_rate": 2.5214504010570214e-07, "loss": 0.2653, "step": 2738 }, { "epoch": 0.78, "grad_norm": 2.2147385657579566, "learning_rate": 2.515360696063179e-07, "loss": 0.245, "step": 2739 }, { "epoch": 0.78, "grad_norm": 2.306210277214575, "learning_rate": 2.5092772956585205e-07, "loss": 0.269, "step": 2740 }, { "epoch": 0.78, "grad_norm": 2.239930705597454, "learning_rate": 2.503200204967317e-07, "loss": 0.2716, "step": 2741 }, { "epoch": 0.78, "grad_norm": 2.2169610526187618, "learning_rate": 2.497129429108531e-07, "loss": 0.2659, "step": 2742 }, { "epoch": 0.78, "grad_norm": 5.314771526733961, "learning_rate": 2.491064973195798e-07, "loss": 0.2949, "step": 2743 }, { "epoch": 0.78, "grad_norm": 2.464722741180483, "learning_rate": 2.485006842337437e-07, "loss": 0.2622, "step": 2744 }, { "epoch": 0.78, "grad_norm": 2.4414530378845556, "learning_rate": 2.4789550416364347e-07, "loss": 0.309, "step": 2745 }, { "epoch": 0.78, "grad_norm": 2.3966166894823697, "learning_rate": 2.4729095761904483e-07, "loss": 0.2892, "step": 2746 }, { "epoch": 0.78, "grad_norm": 2.293602180410425, "learning_rate": 2.466870451091796e-07, "loss": 0.2568, "step": 2747 }, { "epoch": 0.78, "grad_norm": 2.570710790990384, "learning_rate": 2.4608376714274617e-07, "loss": 0.2488, "step": 2748 }, { "epoch": 0.78, "grad_norm": 2.384964366245312, "learning_rate": 2.454811242279069e-07, "loss": 0.2628, "step": 2749 }, { "epoch": 0.78, "grad_norm": 2.410683574063837, "learning_rate": 2.4487911687229113e-07, "loss": 0.2534, "step": 2750 }, { "epoch": 0.78, "grad_norm": 2.3339434682085076, "learning_rate": 2.4427774558299185e-07, "loss": 0.2967, "step": 2751 }, { "epoch": 0.78, "grad_norm": 2.2911955493857046, "learning_rate": 2.4367701086656624e-07, "loss": 0.2943, "step": 2752 }, { "epoch": 0.78, "grad_norm": 2.3451456650946283, "learning_rate": 2.430769132290357e-07, "loss": 0.2765, "step": 2753 }, { "epoch": 0.78, "grad_norm": 2.7071698183330892, "learning_rate": 2.4247745317588397e-07, "loss": 0.3126, "step": 2754 }, { "epoch": 0.78, "grad_norm": 2.3761776008143247, "learning_rate": 2.418786312120593e-07, "loss": 0.2556, "step": 2755 }, { "epoch": 0.78, "grad_norm": 2.188824650286055, "learning_rate": 2.412804478419712e-07, "loss": 0.2809, "step": 2756 }, { "epoch": 0.78, "grad_norm": 2.3791611004508746, "learning_rate": 2.406829035694923e-07, "loss": 0.2718, "step": 2757 }, { "epoch": 0.78, "grad_norm": 2.3031708128532293, "learning_rate": 2.400859988979554e-07, "loss": 0.2666, "step": 2758 }, { "epoch": 0.78, "grad_norm": 2.507943026823091, "learning_rate": 2.394897343301556e-07, "loss": 0.2832, "step": 2759 }, { "epoch": 0.78, "grad_norm": 2.1908843560087625, "learning_rate": 2.388941103683493e-07, "loss": 0.242, "step": 2760 }, { "epoch": 0.78, "grad_norm": 2.3430749497591057, "learning_rate": 2.382991275142524e-07, "loss": 0.2607, "step": 2761 }, { "epoch": 0.78, "grad_norm": 2.4206029000098783, "learning_rate": 2.3770478626904068e-07, "loss": 0.2676, "step": 2762 }, { "epoch": 0.78, "grad_norm": 2.340638141700092, "learning_rate": 2.3711108713334994e-07, "loss": 0.2575, "step": 2763 }, { "epoch": 0.78, "grad_norm": 2.398508636026643, "learning_rate": 2.3651803060727482e-07, "loss": 0.2855, "step": 2764 }, { "epoch": 0.78, "grad_norm": 2.4459378713673647, "learning_rate": 2.3592561719036952e-07, "loss": 0.2749, "step": 2765 }, { "epoch": 0.78, "grad_norm": 2.423230704899453, "learning_rate": 2.3533384738164508e-07, "loss": 0.2519, "step": 2766 }, { "epoch": 0.78, "grad_norm": 3.0438979344407984, "learning_rate": 2.3474272167957143e-07, "loss": 0.2805, "step": 2767 }, { "epoch": 0.78, "grad_norm": 2.3500512947448704, "learning_rate": 2.341522405820756e-07, "loss": 0.2851, "step": 2768 }, { "epoch": 0.78, "grad_norm": 2.3808000861680303, "learning_rate": 2.3356240458654185e-07, "loss": 0.2639, "step": 2769 }, { "epoch": 0.78, "grad_norm": 2.5585487785990346, "learning_rate": 2.3297321418981075e-07, "loss": 0.2867, "step": 2770 }, { "epoch": 0.79, "grad_norm": 2.0898916437412027, "learning_rate": 2.3238466988817928e-07, "loss": 0.2489, "step": 2771 }, { "epoch": 0.79, "grad_norm": 2.410983630793635, "learning_rate": 2.3179677217740013e-07, "loss": 0.2751, "step": 2772 }, { "epoch": 0.79, "grad_norm": 2.2514253584456005, "learning_rate": 2.3120952155268137e-07, "loss": 0.2576, "step": 2773 }, { "epoch": 0.79, "grad_norm": 2.4578035921751096, "learning_rate": 2.3062291850868588e-07, "loss": 0.2676, "step": 2774 }, { "epoch": 0.79, "grad_norm": 2.185950464556516, "learning_rate": 2.3003696353953117e-07, "loss": 0.2439, "step": 2775 }, { "epoch": 0.79, "grad_norm": 2.202339342263721, "learning_rate": 2.29451657138789e-07, "loss": 0.2578, "step": 2776 }, { "epoch": 0.79, "grad_norm": 2.3231308517243865, "learning_rate": 2.2886699979948444e-07, "loss": 0.2893, "step": 2777 }, { "epoch": 0.79, "grad_norm": 2.235137483563809, "learning_rate": 2.2828299201409617e-07, "loss": 0.2766, "step": 2778 }, { "epoch": 0.79, "grad_norm": 2.194310608489236, "learning_rate": 2.2769963427455552e-07, "loss": 0.2535, "step": 2779 }, { "epoch": 0.79, "grad_norm": 2.372712133821499, "learning_rate": 2.2711692707224639e-07, "loss": 0.2858, "step": 2780 }, { "epoch": 0.79, "grad_norm": 2.214835367155243, "learning_rate": 2.265348708980046e-07, "loss": 0.2615, "step": 2781 }, { "epoch": 0.79, "grad_norm": 2.6208064832788542, "learning_rate": 2.2595346624211786e-07, "loss": 0.2892, "step": 2782 }, { "epoch": 0.79, "grad_norm": 2.4452656131894877, "learning_rate": 2.2537271359432454e-07, "loss": 0.2791, "step": 2783 }, { "epoch": 0.79, "grad_norm": 2.400722142292888, "learning_rate": 2.247926134438144e-07, "loss": 0.2646, "step": 2784 }, { "epoch": 0.79, "grad_norm": 2.3866676906776614, "learning_rate": 2.2421316627922715e-07, "loss": 0.2674, "step": 2785 }, { "epoch": 0.79, "grad_norm": 2.4233247286668496, "learning_rate": 2.236343725886527e-07, "loss": 0.2808, "step": 2786 }, { "epoch": 0.79, "grad_norm": 2.291558351108696, "learning_rate": 2.230562328596306e-07, "loss": 0.2673, "step": 2787 }, { "epoch": 0.79, "grad_norm": 2.343505929479789, "learning_rate": 2.2247874757914864e-07, "loss": 0.267, "step": 2788 }, { "epoch": 0.79, "grad_norm": 2.189849698105635, "learning_rate": 2.2190191723364492e-07, "loss": 0.2542, "step": 2789 }, { "epoch": 0.79, "grad_norm": 2.421695068222722, "learning_rate": 2.2132574230900482e-07, "loss": 0.2883, "step": 2790 }, { "epoch": 0.79, "grad_norm": 2.3300957788556116, "learning_rate": 2.2075022329056192e-07, "loss": 0.2993, "step": 2791 }, { "epoch": 0.79, "grad_norm": 2.383095763086712, "learning_rate": 2.2017536066309684e-07, "loss": 0.2906, "step": 2792 }, { "epoch": 0.79, "grad_norm": 2.1953601942491043, "learning_rate": 2.1960115491083752e-07, "loss": 0.237, "step": 2793 }, { "epoch": 0.79, "grad_norm": 2.307913404713227, "learning_rate": 2.1902760651745954e-07, "loss": 0.2765, "step": 2794 }, { "epoch": 0.79, "grad_norm": 2.4715336117778723, "learning_rate": 2.1845471596608378e-07, "loss": 0.278, "step": 2795 }, { "epoch": 0.79, "grad_norm": 2.2935559162523127, "learning_rate": 2.1788248373927675e-07, "loss": 0.2311, "step": 2796 }, { "epoch": 0.79, "grad_norm": 2.9425745468248645, "learning_rate": 2.1731091031905113e-07, "loss": 0.2453, "step": 2797 }, { "epoch": 0.79, "grad_norm": 2.2150854366589776, "learning_rate": 2.16739996186864e-07, "loss": 0.2515, "step": 2798 }, { "epoch": 0.79, "grad_norm": 2.353730791125616, "learning_rate": 2.1616974182361825e-07, "loss": 0.2687, "step": 2799 }, { "epoch": 0.79, "grad_norm": 2.346278524063355, "learning_rate": 2.1560014770966006e-07, "loss": 0.264, "step": 2800 }, { "epoch": 0.79, "grad_norm": 2.3421743418395375, "learning_rate": 2.1503121432477932e-07, "loss": 0.2769, "step": 2801 }, { "epoch": 0.79, "grad_norm": 2.362493254521407, "learning_rate": 2.1446294214820991e-07, "loss": 0.244, "step": 2802 }, { "epoch": 0.79, "grad_norm": 2.390546907904252, "learning_rate": 2.1389533165862826e-07, "loss": 0.2489, "step": 2803 }, { "epoch": 0.79, "grad_norm": 2.298123026178687, "learning_rate": 2.1332838333415447e-07, "loss": 0.2389, "step": 2804 }, { "epoch": 0.79, "grad_norm": 2.22722586200972, "learning_rate": 2.1276209765234954e-07, "loss": 0.2404, "step": 2805 }, { "epoch": 0.8, "grad_norm": 2.399010284462047, "learning_rate": 2.1219647509021698e-07, "loss": 0.2709, "step": 2806 }, { "epoch": 0.8, "grad_norm": 2.434832098004782, "learning_rate": 2.116315161242015e-07, "loss": 0.2922, "step": 2807 }, { "epoch": 0.8, "grad_norm": 2.227811226214471, "learning_rate": 2.110672212301896e-07, "loss": 0.2842, "step": 2808 }, { "epoch": 0.8, "grad_norm": 2.2952246407389745, "learning_rate": 2.1050359088350723e-07, "loss": 0.2774, "step": 2809 }, { "epoch": 0.8, "grad_norm": 2.50504305057805, "learning_rate": 2.0994062555892123e-07, "loss": 0.2699, "step": 2810 }, { "epoch": 0.8, "grad_norm": 2.342581244971091, "learning_rate": 2.0937832573063818e-07, "loss": 0.2346, "step": 2811 }, { "epoch": 0.8, "grad_norm": 2.2551191727768205, "learning_rate": 2.088166918723041e-07, "loss": 0.2472, "step": 2812 }, { "epoch": 0.8, "grad_norm": 2.2456476952574276, "learning_rate": 2.0825572445700401e-07, "loss": 0.2725, "step": 2813 }, { "epoch": 0.8, "grad_norm": 2.3033253017776203, "learning_rate": 2.076954239572616e-07, "loss": 0.2669, "step": 2814 }, { "epoch": 0.8, "grad_norm": 2.388877308526638, "learning_rate": 2.0713579084503873e-07, "loss": 0.2724, "step": 2815 }, { "epoch": 0.8, "grad_norm": 2.3343989502499274, "learning_rate": 2.0657682559173506e-07, "loss": 0.2987, "step": 2816 }, { "epoch": 0.8, "grad_norm": 2.2413289141575885, "learning_rate": 2.060185286681878e-07, "loss": 0.2364, "step": 2817 }, { "epoch": 0.8, "grad_norm": 2.520433126733271, "learning_rate": 2.0546090054467114e-07, "loss": 0.2692, "step": 2818 }, { "epoch": 0.8, "grad_norm": 2.4097024486795804, "learning_rate": 2.0490394169089597e-07, "loss": 0.2538, "step": 2819 }, { "epoch": 0.8, "grad_norm": 2.243358516588946, "learning_rate": 2.0434765257600928e-07, "loss": 0.2506, "step": 2820 }, { "epoch": 0.8, "grad_norm": 2.5078519127537215, "learning_rate": 2.037920336685941e-07, "loss": 0.2922, "step": 2821 }, { "epoch": 0.8, "grad_norm": 2.4605547304027033, "learning_rate": 2.0323708543666883e-07, "loss": 0.2951, "step": 2822 }, { "epoch": 0.8, "grad_norm": 2.293726838612549, "learning_rate": 2.0268280834768692e-07, "loss": 0.2494, "step": 2823 }, { "epoch": 0.8, "grad_norm": 2.4860982798648648, "learning_rate": 2.021292028685365e-07, "loss": 0.283, "step": 2824 }, { "epoch": 0.8, "grad_norm": 2.2593332441880984, "learning_rate": 2.0157626946553995e-07, "loss": 0.2997, "step": 2825 }, { "epoch": 0.8, "grad_norm": 2.445996744842272, "learning_rate": 2.01024008604454e-07, "loss": 0.2568, "step": 2826 }, { "epoch": 0.8, "grad_norm": 2.1971138861459263, "learning_rate": 2.0047242075046744e-07, "loss": 0.246, "step": 2827 }, { "epoch": 0.8, "grad_norm": 2.592379392720621, "learning_rate": 1.9992150636820415e-07, "loss": 0.2868, "step": 2828 }, { "epoch": 0.8, "grad_norm": 2.4169149111598256, "learning_rate": 1.993712659217194e-07, "loss": 0.2714, "step": 2829 }, { "epoch": 0.8, "grad_norm": 2.4872947332284334, "learning_rate": 1.9882169987450138e-07, "loss": 0.273, "step": 2830 }, { "epoch": 0.8, "grad_norm": 2.483937778500142, "learning_rate": 1.982728086894694e-07, "loss": 0.2737, "step": 2831 }, { "epoch": 0.8, "grad_norm": 2.1706509090641597, "learning_rate": 1.977245928289748e-07, "loss": 0.2529, "step": 2832 }, { "epoch": 0.8, "grad_norm": 4.210854981564041, "learning_rate": 1.971770527548008e-07, "loss": 0.2585, "step": 2833 }, { "epoch": 0.8, "grad_norm": 2.67382194381519, "learning_rate": 1.9663018892816063e-07, "loss": 0.2802, "step": 2834 }, { "epoch": 0.8, "grad_norm": 2.2123984409162762, "learning_rate": 1.9608400180969743e-07, "loss": 0.2357, "step": 2835 }, { "epoch": 0.8, "grad_norm": 2.5011636033003968, "learning_rate": 1.9553849185948512e-07, "loss": 0.2659, "step": 2836 }, { "epoch": 0.8, "grad_norm": 2.3650456015272416, "learning_rate": 1.9499365953702674e-07, "loss": 0.3112, "step": 2837 }, { "epoch": 0.8, "grad_norm": 2.3662359880050072, "learning_rate": 1.9444950530125548e-07, "loss": 0.2701, "step": 2838 }, { "epoch": 0.8, "grad_norm": 2.3835584666707947, "learning_rate": 1.9390602961053194e-07, "loss": 0.2649, "step": 2839 }, { "epoch": 0.8, "grad_norm": 2.596380722618807, "learning_rate": 1.933632329226459e-07, "loss": 0.286, "step": 2840 }, { "epoch": 0.8, "grad_norm": 2.7967913473061157, "learning_rate": 1.9282111569481506e-07, "loss": 0.2663, "step": 2841 }, { "epoch": 0.81, "grad_norm": 2.2110276623027456, "learning_rate": 1.9227967838368564e-07, "loss": 0.2591, "step": 2842 }, { "epoch": 0.81, "grad_norm": 2.249417596052053, "learning_rate": 1.9173892144532956e-07, "loss": 0.2357, "step": 2843 }, { "epoch": 0.81, "grad_norm": 4.700061186446766, "learning_rate": 1.9119884533524665e-07, "loss": 0.2586, "step": 2844 }, { "epoch": 0.81, "grad_norm": 3.0079039657582887, "learning_rate": 1.9065945050836297e-07, "loss": 0.2734, "step": 2845 }, { "epoch": 0.81, "grad_norm": 2.594893242581233, "learning_rate": 1.9012073741903068e-07, "loss": 0.2745, "step": 2846 }, { "epoch": 0.81, "grad_norm": 2.4748954516830683, "learning_rate": 1.8958270652102858e-07, "loss": 0.2767, "step": 2847 }, { "epoch": 0.81, "grad_norm": 2.5129604741794878, "learning_rate": 1.8904535826755908e-07, "loss": 0.2863, "step": 2848 }, { "epoch": 0.81, "grad_norm": 2.5981441940170984, "learning_rate": 1.8850869311125096e-07, "loss": 0.2985, "step": 2849 }, { "epoch": 0.81, "grad_norm": 2.3356548455202435, "learning_rate": 1.8797271150415705e-07, "loss": 0.2584, "step": 2850 }, { "epoch": 0.81, "grad_norm": 3.1820448800506997, "learning_rate": 1.8743741389775469e-07, "loss": 0.2552, "step": 2851 }, { "epoch": 0.81, "grad_norm": 2.23023753403099, "learning_rate": 1.8690280074294473e-07, "loss": 0.2587, "step": 2852 }, { "epoch": 0.81, "grad_norm": 2.5679736393340393, "learning_rate": 1.8636887249005174e-07, "loss": 0.2795, "step": 2853 }, { "epoch": 0.81, "grad_norm": 2.177529079557188, "learning_rate": 1.8583562958882327e-07, "loss": 0.2363, "step": 2854 }, { "epoch": 0.81, "grad_norm": 2.4742523821737907, "learning_rate": 1.853030724884297e-07, "loss": 0.3092, "step": 2855 }, { "epoch": 0.81, "grad_norm": 2.2885878314952874, "learning_rate": 1.847712016374634e-07, "loss": 0.2582, "step": 2856 }, { "epoch": 0.81, "grad_norm": 2.74968714414436, "learning_rate": 1.8424001748393904e-07, "loss": 0.2926, "step": 2857 }, { "epoch": 0.81, "grad_norm": 2.2223393631269373, "learning_rate": 1.8370952047529263e-07, "loss": 0.2478, "step": 2858 }, { "epoch": 0.81, "grad_norm": 2.2864500433093164, "learning_rate": 1.831797110583817e-07, "loss": 0.2364, "step": 2859 }, { "epoch": 0.81, "grad_norm": 2.4266795093411857, "learning_rate": 1.8265058967948433e-07, "loss": 0.2833, "step": 2860 }, { "epoch": 0.81, "grad_norm": 2.609719240431143, "learning_rate": 1.8212215678429854e-07, "loss": 0.2891, "step": 2861 }, { "epoch": 0.81, "grad_norm": 2.4925100660920267, "learning_rate": 1.8159441281794352e-07, "loss": 0.2934, "step": 2862 }, { "epoch": 0.81, "grad_norm": 2.5036731108979406, "learning_rate": 1.8106735822495744e-07, "loss": 0.2703, "step": 2863 }, { "epoch": 0.81, "grad_norm": 2.302888816291254, "learning_rate": 1.805409934492983e-07, "loss": 0.2605, "step": 2864 }, { "epoch": 0.81, "grad_norm": 2.3070765097071737, "learning_rate": 1.8001531893434185e-07, "loss": 0.2479, "step": 2865 }, { "epoch": 0.81, "grad_norm": 2.511833666867204, "learning_rate": 1.7949033512288346e-07, "loss": 0.2459, "step": 2866 }, { "epoch": 0.81, "grad_norm": 3.8799279280843644, "learning_rate": 1.7896604245713686e-07, "loss": 0.2613, "step": 2867 }, { "epoch": 0.81, "grad_norm": 2.144326368427733, "learning_rate": 1.7844244137873298e-07, "loss": 0.2169, "step": 2868 }, { "epoch": 0.81, "grad_norm": 2.3584488923021802, "learning_rate": 1.779195323287208e-07, "loss": 0.2616, "step": 2869 }, { "epoch": 0.81, "grad_norm": 2.705678283702525, "learning_rate": 1.7739731574756522e-07, "loss": 0.2795, "step": 2870 }, { "epoch": 0.81, "grad_norm": 2.2011956923784397, "learning_rate": 1.768757920751489e-07, "loss": 0.2416, "step": 2871 }, { "epoch": 0.81, "grad_norm": 2.2331302113818396, "learning_rate": 1.7635496175077081e-07, "loss": 0.265, "step": 2872 }, { "epoch": 0.81, "grad_norm": 2.3523724278598275, "learning_rate": 1.7583482521314595e-07, "loss": 0.2821, "step": 2873 }, { "epoch": 0.81, "grad_norm": 3.9974888016714254, "learning_rate": 1.7531538290040382e-07, "loss": 0.2884, "step": 2874 }, { "epoch": 0.81, "grad_norm": 2.55439064748466, "learning_rate": 1.7479663525009037e-07, "loss": 0.2989, "step": 2875 }, { "epoch": 0.81, "grad_norm": 2.3600553815923035, "learning_rate": 1.7427858269916563e-07, "loss": 0.259, "step": 2876 }, { "epoch": 0.82, "grad_norm": 2.3988600611863737, "learning_rate": 1.737612256840053e-07, "loss": 0.2789, "step": 2877 }, { "epoch": 0.82, "grad_norm": 2.3773615435211157, "learning_rate": 1.732445646403975e-07, "loss": 0.2787, "step": 2878 }, { "epoch": 0.82, "grad_norm": 2.467114711446906, "learning_rate": 1.7272860000354538e-07, "loss": 0.27, "step": 2879 }, { "epoch": 0.82, "grad_norm": 2.227839666993782, "learning_rate": 1.7221333220806477e-07, "loss": 0.2531, "step": 2880 }, { "epoch": 0.82, "grad_norm": 2.3135070130291506, "learning_rate": 1.7169876168798558e-07, "loss": 0.2465, "step": 2881 }, { "epoch": 0.82, "grad_norm": 2.539791939043467, "learning_rate": 1.7118488887674887e-07, "loss": 0.2816, "step": 2882 }, { "epoch": 0.82, "grad_norm": 2.6893138314421017, "learning_rate": 1.7067171420720904e-07, "loss": 0.3075, "step": 2883 }, { "epoch": 0.82, "grad_norm": 2.2844568548404656, "learning_rate": 1.7015923811163224e-07, "loss": 0.2549, "step": 2884 }, { "epoch": 0.82, "grad_norm": 2.5417677784666646, "learning_rate": 1.696474610216958e-07, "loss": 0.3035, "step": 2885 }, { "epoch": 0.82, "grad_norm": 2.283990586546514, "learning_rate": 1.691363833684889e-07, "loss": 0.2711, "step": 2886 }, { "epoch": 0.82, "grad_norm": 2.4459137162379867, "learning_rate": 1.6862600558251095e-07, "loss": 0.2835, "step": 2887 }, { "epoch": 0.82, "grad_norm": 2.3833279688766233, "learning_rate": 1.6811632809367204e-07, "loss": 0.2697, "step": 2888 }, { "epoch": 0.82, "grad_norm": 2.5303782470283664, "learning_rate": 1.6760735133129267e-07, "loss": 0.268, "step": 2889 }, { "epoch": 0.82, "grad_norm": 2.232816947627661, "learning_rate": 1.6709907572410265e-07, "loss": 0.263, "step": 2890 }, { "epoch": 0.82, "grad_norm": 2.296004011835285, "learning_rate": 1.665915017002414e-07, "loss": 0.246, "step": 2891 }, { "epoch": 0.82, "grad_norm": 2.339328657097091, "learning_rate": 1.6608462968725733e-07, "loss": 0.2813, "step": 2892 }, { "epoch": 0.82, "grad_norm": 2.188178965593008, "learning_rate": 1.6557846011210751e-07, "loss": 0.2206, "step": 2893 }, { "epoch": 0.82, "grad_norm": 2.5645938693793346, "learning_rate": 1.6507299340115744e-07, "loss": 0.2885, "step": 2894 }, { "epoch": 0.82, "grad_norm": 2.339243501741995, "learning_rate": 1.645682299801804e-07, "loss": 0.2825, "step": 2895 }, { "epoch": 0.82, "grad_norm": 2.367728617536032, "learning_rate": 1.6406417027435727e-07, "loss": 0.2447, "step": 2896 }, { "epoch": 0.82, "grad_norm": 2.462794940200196, "learning_rate": 1.6356081470827633e-07, "loss": 0.2586, "step": 2897 }, { "epoch": 0.82, "grad_norm": 2.9373717395321797, "learning_rate": 1.6305816370593262e-07, "loss": 0.2825, "step": 2898 }, { "epoch": 0.82, "grad_norm": 2.2061403889425364, "learning_rate": 1.6255621769072803e-07, "loss": 0.2256, "step": 2899 }, { "epoch": 0.82, "grad_norm": 2.255180702409096, "learning_rate": 1.6205497708546933e-07, "loss": 0.2343, "step": 2900 }, { "epoch": 0.82, "grad_norm": 2.326546351934489, "learning_rate": 1.6155444231237104e-07, "loss": 0.2713, "step": 2901 }, { "epoch": 0.82, "grad_norm": 2.9170467115769405, "learning_rate": 1.6105461379305186e-07, "loss": 0.2874, "step": 2902 }, { "epoch": 0.82, "grad_norm": 2.400555296130694, "learning_rate": 1.60555491948536e-07, "loss": 0.255, "step": 2903 }, { "epoch": 0.82, "grad_norm": 2.0770253089306814, "learning_rate": 1.6005707719925188e-07, "loss": 0.2383, "step": 2904 }, { "epoch": 0.82, "grad_norm": 2.264987088165909, "learning_rate": 1.5955936996503284e-07, "loss": 0.2391, "step": 2905 }, { "epoch": 0.82, "grad_norm": 2.337389217903713, "learning_rate": 1.590623706651164e-07, "loss": 0.2457, "step": 2906 }, { "epoch": 0.82, "grad_norm": 2.2921471576670096, "learning_rate": 1.5856607971814374e-07, "loss": 0.2613, "step": 2907 }, { "epoch": 0.82, "grad_norm": 2.301923660369795, "learning_rate": 1.580704975421584e-07, "loss": 0.2399, "step": 2908 }, { "epoch": 0.82, "grad_norm": 2.4213038071881754, "learning_rate": 1.5757562455460805e-07, "loss": 0.244, "step": 2909 }, { "epoch": 0.82, "grad_norm": 2.5169986268703934, "learning_rate": 1.5708146117234223e-07, "loss": 0.2921, "step": 2910 }, { "epoch": 0.82, "grad_norm": 2.4131508151166474, "learning_rate": 1.5658800781161363e-07, "loss": 0.2583, "step": 2911 }, { "epoch": 0.83, "grad_norm": 2.3094984954712166, "learning_rate": 1.5609526488807611e-07, "loss": 0.2428, "step": 2912 }, { "epoch": 0.83, "grad_norm": 2.297686613691211, "learning_rate": 1.5560323281678512e-07, "loss": 0.2762, "step": 2913 }, { "epoch": 0.83, "grad_norm": 2.401738290881385, "learning_rate": 1.5511191201219732e-07, "loss": 0.2604, "step": 2914 }, { "epoch": 0.83, "grad_norm": 2.2765127696756253, "learning_rate": 1.5462130288817088e-07, "loss": 0.274, "step": 2915 }, { "epoch": 0.83, "grad_norm": 2.2840651600147557, "learning_rate": 1.5413140585796426e-07, "loss": 0.2799, "step": 2916 }, { "epoch": 0.83, "grad_norm": 2.3612164284457617, "learning_rate": 1.536422213342352e-07, "loss": 0.2449, "step": 2917 }, { "epoch": 0.83, "grad_norm": 2.2899490356795273, "learning_rate": 1.5315374972904238e-07, "loss": 0.2416, "step": 2918 }, { "epoch": 0.83, "grad_norm": 2.4698956489551285, "learning_rate": 1.5266599145384318e-07, "loss": 0.2752, "step": 2919 }, { "epoch": 0.83, "grad_norm": 2.240094198960351, "learning_rate": 1.5217894691949518e-07, "loss": 0.242, "step": 2920 }, { "epoch": 0.83, "grad_norm": 2.3749916729226386, "learning_rate": 1.5169261653625343e-07, "loss": 0.2798, "step": 2921 }, { "epoch": 0.83, "grad_norm": 2.7876388271161603, "learning_rate": 1.5120700071377212e-07, "loss": 0.2577, "step": 2922 }, { "epoch": 0.83, "grad_norm": 3.781441804581668, "learning_rate": 1.5072209986110373e-07, "loss": 0.272, "step": 2923 }, { "epoch": 0.83, "grad_norm": 2.4347166397268785, "learning_rate": 1.5023791438669797e-07, "loss": 0.2711, "step": 2924 }, { "epoch": 0.83, "grad_norm": 2.458491054092882, "learning_rate": 1.4975444469840238e-07, "loss": 0.3303, "step": 2925 }, { "epoch": 0.83, "grad_norm": 2.2599999748751727, "learning_rate": 1.492716912034614e-07, "loss": 0.223, "step": 2926 }, { "epoch": 0.83, "grad_norm": 2.3778895116764045, "learning_rate": 1.487896543085161e-07, "loss": 0.2316, "step": 2927 }, { "epoch": 0.83, "grad_norm": 2.2784230957221787, "learning_rate": 1.48308334419604e-07, "loss": 0.278, "step": 2928 }, { "epoch": 0.83, "grad_norm": 2.3700733375661387, "learning_rate": 1.4782773194215882e-07, "loss": 0.2806, "step": 2929 }, { "epoch": 0.83, "grad_norm": 2.2901041783321707, "learning_rate": 1.473478472810097e-07, "loss": 0.263, "step": 2930 }, { "epoch": 0.83, "grad_norm": 2.409600698001674, "learning_rate": 1.468686808403814e-07, "loss": 0.2804, "step": 2931 }, { "epoch": 0.83, "grad_norm": 2.5969716621663568, "learning_rate": 1.4639023302389364e-07, "loss": 0.2831, "step": 2932 }, { "epoch": 0.83, "grad_norm": 2.394297205291279, "learning_rate": 1.4591250423456046e-07, "loss": 0.2396, "step": 2933 }, { "epoch": 0.83, "grad_norm": 2.448689657248942, "learning_rate": 1.454354948747909e-07, "loss": 0.2427, "step": 2934 }, { "epoch": 0.83, "grad_norm": 2.438782418407605, "learning_rate": 1.449592053463874e-07, "loss": 0.2961, "step": 2935 }, { "epoch": 0.83, "grad_norm": 2.354138825480889, "learning_rate": 1.4448363605054636e-07, "loss": 0.2625, "step": 2936 }, { "epoch": 0.83, "grad_norm": 2.2592559986699303, "learning_rate": 1.440087873878574e-07, "loss": 0.2509, "step": 2937 }, { "epoch": 0.83, "grad_norm": 2.330818222709863, "learning_rate": 1.4353465975830336e-07, "loss": 0.2567, "step": 2938 }, { "epoch": 0.83, "grad_norm": 2.4420070625144805, "learning_rate": 1.4306125356125896e-07, "loss": 0.2838, "step": 2939 }, { "epoch": 0.83, "grad_norm": 2.2464288961476915, "learning_rate": 1.4258856919549232e-07, "loss": 0.2555, "step": 2940 }, { "epoch": 0.83, "grad_norm": 2.3803013873706544, "learning_rate": 1.4211660705916285e-07, "loss": 0.2361, "step": 2941 }, { "epoch": 0.83, "grad_norm": 2.410504833130406, "learning_rate": 1.4164536754982203e-07, "loss": 0.2519, "step": 2942 }, { "epoch": 0.83, "grad_norm": 2.7036576648926016, "learning_rate": 1.4117485106441186e-07, "loss": 0.2542, "step": 2943 }, { "epoch": 0.83, "grad_norm": 2.5126251019306363, "learning_rate": 1.407050579992658e-07, "loss": 0.2909, "step": 2944 }, { "epoch": 0.83, "grad_norm": 2.246164260692375, "learning_rate": 1.4023598875010844e-07, "loss": 0.264, "step": 2945 }, { "epoch": 0.83, "grad_norm": 2.268685222902053, "learning_rate": 1.3976764371205418e-07, "loss": 0.2696, "step": 2946 }, { "epoch": 0.84, "grad_norm": 2.504224124261662, "learning_rate": 1.39300023279607e-07, "loss": 0.2633, "step": 2947 }, { "epoch": 0.84, "grad_norm": 2.5660103800190637, "learning_rate": 1.388331278466609e-07, "loss": 0.2827, "step": 2948 }, { "epoch": 0.84, "grad_norm": 2.4725541982526758, "learning_rate": 1.3836695780649976e-07, "loss": 0.3022, "step": 2949 }, { "epoch": 0.84, "grad_norm": 2.446324380195441, "learning_rate": 1.379015135517958e-07, "loss": 0.2907, "step": 2950 }, { "epoch": 0.84, "grad_norm": 2.2514612224451906, "learning_rate": 1.374367954746094e-07, "loss": 0.2706, "step": 2951 }, { "epoch": 0.84, "grad_norm": 2.415819572419356, "learning_rate": 1.3697280396639034e-07, "loss": 0.2523, "step": 2952 }, { "epoch": 0.84, "grad_norm": 2.1459371173307846, "learning_rate": 1.365095394179754e-07, "loss": 0.2396, "step": 2953 }, { "epoch": 0.84, "grad_norm": 2.323396101986738, "learning_rate": 1.360470022195902e-07, "loss": 0.2739, "step": 2954 }, { "epoch": 0.84, "grad_norm": 2.3080472437525437, "learning_rate": 1.3558519276084635e-07, "loss": 0.2503, "step": 2955 }, { "epoch": 0.84, "grad_norm": 2.473968673368382, "learning_rate": 1.3512411143074332e-07, "loss": 0.296, "step": 2956 }, { "epoch": 0.84, "grad_norm": 2.3430591542502466, "learning_rate": 1.3466375861766698e-07, "loss": 0.2451, "step": 2957 }, { "epoch": 0.84, "grad_norm": 2.393077540550777, "learning_rate": 1.3420413470938942e-07, "loss": 0.2659, "step": 2958 }, { "epoch": 0.84, "grad_norm": 2.6086430834408345, "learning_rate": 1.3374524009306942e-07, "loss": 0.2641, "step": 2959 }, { "epoch": 0.84, "grad_norm": 2.621421328007362, "learning_rate": 1.332870751552503e-07, "loss": 0.2779, "step": 2960 }, { "epoch": 0.84, "grad_norm": 2.840293461456054, "learning_rate": 1.3282964028186172e-07, "loss": 0.3053, "step": 2961 }, { "epoch": 0.84, "grad_norm": 2.470822092275965, "learning_rate": 1.3237293585821785e-07, "loss": 0.2622, "step": 2962 }, { "epoch": 0.84, "grad_norm": 2.252474082136468, "learning_rate": 1.3191696226901795e-07, "loss": 0.2718, "step": 2963 }, { "epoch": 0.84, "grad_norm": 2.3074512131763516, "learning_rate": 1.314617198983454e-07, "loss": 0.243, "step": 2964 }, { "epoch": 0.84, "grad_norm": 2.512138084713408, "learning_rate": 1.3100720912966766e-07, "loss": 0.272, "step": 2965 }, { "epoch": 0.84, "grad_norm": 2.3739192198771737, "learning_rate": 1.305534303458361e-07, "loss": 0.2755, "step": 2966 }, { "epoch": 0.84, "grad_norm": 2.2893167627464166, "learning_rate": 1.301003839290853e-07, "loss": 0.2408, "step": 2967 }, { "epoch": 0.84, "grad_norm": 2.270167707949592, "learning_rate": 1.296480702610332e-07, "loss": 0.2501, "step": 2968 }, { "epoch": 0.84, "grad_norm": 2.3914772364832166, "learning_rate": 1.2919648972268027e-07, "loss": 0.2323, "step": 2969 }, { "epoch": 0.84, "grad_norm": 2.472406533385478, "learning_rate": 1.2874564269440958e-07, "loss": 0.3096, "step": 2970 }, { "epoch": 0.84, "grad_norm": 2.5115059991420203, "learning_rate": 1.2829552955598622e-07, "loss": 0.3056, "step": 2971 }, { "epoch": 0.84, "grad_norm": 2.1147586969077694, "learning_rate": 1.2784615068655745e-07, "loss": 0.2611, "step": 2972 }, { "epoch": 0.84, "grad_norm": 2.3814559744375745, "learning_rate": 1.273975064646512e-07, "loss": 0.2569, "step": 2973 }, { "epoch": 0.84, "grad_norm": 2.525852265833298, "learning_rate": 1.2694959726817767e-07, "loss": 0.2583, "step": 2974 }, { "epoch": 0.84, "grad_norm": 2.4224050505105277, "learning_rate": 1.2650242347442707e-07, "loss": 0.2269, "step": 2975 }, { "epoch": 0.84, "grad_norm": 2.3454068114753697, "learning_rate": 1.260559854600709e-07, "loss": 0.2746, "step": 2976 }, { "epoch": 0.84, "grad_norm": 2.279877000511415, "learning_rate": 1.2561028360116e-07, "loss": 0.255, "step": 2977 }, { "epoch": 0.84, "grad_norm": 2.416841526419076, "learning_rate": 1.251653182731254e-07, "loss": 0.2998, "step": 2978 }, { "epoch": 0.84, "grad_norm": 2.4618454435380572, "learning_rate": 1.2472108985077834e-07, "loss": 0.2692, "step": 2979 }, { "epoch": 0.84, "grad_norm": 2.5288543032906126, "learning_rate": 1.242775987083088e-07, "loss": 0.2515, "step": 2980 }, { "epoch": 0.84, "grad_norm": 2.307446477470878, "learning_rate": 1.23834845219286e-07, "loss": 0.2711, "step": 2981 }, { "epoch": 0.84, "grad_norm": 2.3118979472768664, "learning_rate": 1.233928297566571e-07, "loss": 0.2632, "step": 2982 }, { "epoch": 0.85, "grad_norm": 3.0975762766164716, "learning_rate": 1.2295155269274827e-07, "loss": 0.2617, "step": 2983 }, { "epoch": 0.85, "grad_norm": 2.252171146783034, "learning_rate": 1.225110143992638e-07, "loss": 0.296, "step": 2984 }, { "epoch": 0.85, "grad_norm": 2.699367516738382, "learning_rate": 1.220712152472856e-07, "loss": 0.3007, "step": 2985 }, { "epoch": 0.85, "grad_norm": 2.3609242051199897, "learning_rate": 1.2163215560727214e-07, "loss": 0.2853, "step": 2986 }, { "epoch": 0.85, "grad_norm": 2.239716626762453, "learning_rate": 1.2119383584905985e-07, "loss": 0.2527, "step": 2987 }, { "epoch": 0.85, "grad_norm": 2.281800284296882, "learning_rate": 1.2075625634186205e-07, "loss": 0.2509, "step": 2988 }, { "epoch": 0.85, "grad_norm": 2.416589421795546, "learning_rate": 1.203194174542682e-07, "loss": 0.2919, "step": 2989 }, { "epoch": 0.85, "grad_norm": 2.6405939356926527, "learning_rate": 1.1988331955424347e-07, "loss": 0.2755, "step": 2990 }, { "epoch": 0.85, "grad_norm": 4.045979386286837, "learning_rate": 1.194479630091294e-07, "loss": 0.2376, "step": 2991 }, { "epoch": 0.85, "grad_norm": 2.2126916596240935, "learning_rate": 1.190133481856429e-07, "loss": 0.2668, "step": 2992 }, { "epoch": 0.85, "grad_norm": 2.2755646904017857, "learning_rate": 1.1857947544987668e-07, "loss": 0.2777, "step": 2993 }, { "epoch": 0.85, "grad_norm": 2.431657462454105, "learning_rate": 1.1814634516729726e-07, "loss": 0.303, "step": 2994 }, { "epoch": 0.85, "grad_norm": 2.38579299772321, "learning_rate": 1.177139577027465e-07, "loss": 0.2728, "step": 2995 }, { "epoch": 0.85, "grad_norm": 2.1828610624233415, "learning_rate": 1.1728231342044049e-07, "loss": 0.2322, "step": 2996 }, { "epoch": 0.85, "grad_norm": 2.286268924458742, "learning_rate": 1.1685141268396902e-07, "loss": 0.2507, "step": 2997 }, { "epoch": 0.85, "grad_norm": 2.34834559713555, "learning_rate": 1.1642125585629592e-07, "loss": 0.2757, "step": 2998 }, { "epoch": 0.85, "grad_norm": 2.446898983893535, "learning_rate": 1.1599184329975809e-07, "loss": 0.2751, "step": 2999 }, { "epoch": 0.85, "grad_norm": 2.3969050527751103, "learning_rate": 1.1556317537606586e-07, "loss": 0.2653, "step": 3000 }, { "epoch": 0.85, "grad_norm": 2.2928562405437947, "learning_rate": 1.1513525244630196e-07, "loss": 0.2647, "step": 3001 }, { "epoch": 0.85, "grad_norm": 2.343027458930917, "learning_rate": 1.1470807487092171e-07, "loss": 0.2419, "step": 3002 }, { "epoch": 0.85, "grad_norm": 2.4596000456064733, "learning_rate": 1.1428164300975274e-07, "loss": 0.2695, "step": 3003 }, { "epoch": 0.85, "grad_norm": 2.1712507233695026, "learning_rate": 1.1385595722199437e-07, "loss": 0.2453, "step": 3004 }, { "epoch": 0.85, "grad_norm": 2.1791223390148597, "learning_rate": 1.1343101786621745e-07, "loss": 0.2565, "step": 3005 }, { "epoch": 0.85, "grad_norm": 2.2200097978674984, "learning_rate": 1.1300682530036432e-07, "loss": 0.2485, "step": 3006 }, { "epoch": 0.85, "grad_norm": 2.3623281275120664, "learning_rate": 1.1258337988174793e-07, "loss": 0.2473, "step": 3007 }, { "epoch": 0.85, "grad_norm": 2.315292263531697, "learning_rate": 1.1216068196705208e-07, "loss": 0.2621, "step": 3008 }, { "epoch": 0.85, "grad_norm": 2.3133663961978814, "learning_rate": 1.1173873191233096e-07, "loss": 0.2478, "step": 3009 }, { "epoch": 0.85, "grad_norm": 2.430708800052876, "learning_rate": 1.1131753007300881e-07, "loss": 0.2708, "step": 3010 }, { "epoch": 0.85, "grad_norm": 2.5155824409494305, "learning_rate": 1.1089707680387961e-07, "loss": 0.2741, "step": 3011 }, { "epoch": 0.85, "grad_norm": 2.9174229312451216, "learning_rate": 1.1047737245910615e-07, "loss": 0.275, "step": 3012 }, { "epoch": 0.85, "grad_norm": 2.369638242381517, "learning_rate": 1.1005841739222166e-07, "loss": 0.2588, "step": 3013 }, { "epoch": 0.85, "grad_norm": 2.2897156128120524, "learning_rate": 1.0964021195612728e-07, "loss": 0.2717, "step": 3014 }, { "epoch": 0.85, "grad_norm": 2.3830666042149384, "learning_rate": 1.0922275650309321e-07, "loss": 0.2664, "step": 3015 }, { "epoch": 0.85, "grad_norm": 2.3412654270408115, "learning_rate": 1.0880605138475707e-07, "loss": 0.2655, "step": 3016 }, { "epoch": 0.85, "grad_norm": 2.382093627470339, "learning_rate": 1.083900969521252e-07, "loss": 0.2944, "step": 3017 }, { "epoch": 0.86, "grad_norm": 2.492195238746543, "learning_rate": 1.0797489355557188e-07, "loss": 0.2826, "step": 3018 }, { "epoch": 0.86, "grad_norm": 2.328835974383053, "learning_rate": 1.0756044154483812e-07, "loss": 0.2666, "step": 3019 }, { "epoch": 0.86, "grad_norm": 2.2091174854967988, "learning_rate": 1.07146741269032e-07, "loss": 0.2796, "step": 3020 }, { "epoch": 0.86, "grad_norm": 2.2501063439867477, "learning_rate": 1.0673379307662855e-07, "loss": 0.2591, "step": 3021 }, { "epoch": 0.86, "grad_norm": 2.3549354214289524, "learning_rate": 1.0632159731546964e-07, "loss": 0.2486, "step": 3022 }, { "epoch": 0.86, "grad_norm": 2.2242715824156876, "learning_rate": 1.0591015433276306e-07, "loss": 0.2486, "step": 3023 }, { "epoch": 0.86, "grad_norm": 2.3033630244095527, "learning_rate": 1.054994644750824e-07, "loss": 0.2632, "step": 3024 }, { "epoch": 0.86, "grad_norm": 2.343530716837555, "learning_rate": 1.050895280883668e-07, "loss": 0.258, "step": 3025 }, { "epoch": 0.86, "grad_norm": 2.4582728084197667, "learning_rate": 1.0468034551792083e-07, "loss": 0.2733, "step": 3026 }, { "epoch": 0.86, "grad_norm": 5.114240863591207, "learning_rate": 1.0427191710841443e-07, "loss": 0.2787, "step": 3027 }, { "epoch": 0.86, "grad_norm": 2.8725573321328333, "learning_rate": 1.0386424320388209e-07, "loss": 0.2729, "step": 3028 }, { "epoch": 0.86, "grad_norm": 2.2117518738730926, "learning_rate": 1.0345732414772224e-07, "loss": 0.2437, "step": 3029 }, { "epoch": 0.86, "grad_norm": 2.4384125875736773, "learning_rate": 1.0305116028269812e-07, "loss": 0.2661, "step": 3030 }, { "epoch": 0.86, "grad_norm": 2.38059904238801, "learning_rate": 1.0264575195093628e-07, "loss": 0.2384, "step": 3031 }, { "epoch": 0.86, "grad_norm": 2.288709522342751, "learning_rate": 1.022410994939279e-07, "loss": 0.2578, "step": 3032 }, { "epoch": 0.86, "grad_norm": 2.309509735794672, "learning_rate": 1.0183720325252609e-07, "loss": 0.2495, "step": 3033 }, { "epoch": 0.86, "grad_norm": 2.3556405665317097, "learning_rate": 1.0143406356694795e-07, "loss": 0.2415, "step": 3034 }, { "epoch": 0.86, "grad_norm": 2.3462410688998205, "learning_rate": 1.0103168077677283e-07, "loss": 0.2824, "step": 3035 }, { "epoch": 0.86, "grad_norm": 2.1535339888759526, "learning_rate": 1.006300552209427e-07, "loss": 0.2374, "step": 3036 }, { "epoch": 0.86, "grad_norm": 2.3690379200829725, "learning_rate": 1.0022918723776175e-07, "loss": 0.2625, "step": 3037 }, { "epoch": 0.86, "grad_norm": 2.3032429338900373, "learning_rate": 9.982907716489586e-08, "loss": 0.272, "step": 3038 }, { "epoch": 0.86, "grad_norm": 2.608970378774197, "learning_rate": 9.942972533937266e-08, "loss": 0.3033, "step": 3039 }, { "epoch": 0.86, "grad_norm": 2.3119793826541364, "learning_rate": 9.903113209758096e-08, "loss": 0.2726, "step": 3040 }, { "epoch": 0.86, "grad_norm": 2.381000490542755, "learning_rate": 9.863329777527052e-08, "loss": 0.2681, "step": 3041 }, { "epoch": 0.86, "grad_norm": 2.4542599325019796, "learning_rate": 9.823622270755205e-08, "loss": 0.2564, "step": 3042 }, { "epoch": 0.86, "grad_norm": 2.4368086277548415, "learning_rate": 9.783990722889657e-08, "loss": 0.2614, "step": 3043 }, { "epoch": 0.86, "grad_norm": 2.644344757700451, "learning_rate": 9.744435167313536e-08, "loss": 0.2969, "step": 3044 }, { "epoch": 0.86, "grad_norm": 2.506641792481987, "learning_rate": 9.704955637345946e-08, "loss": 0.2606, "step": 3045 }, { "epoch": 0.86, "grad_norm": 2.428419743065912, "learning_rate": 9.665552166241964e-08, "loss": 0.2581, "step": 3046 }, { "epoch": 0.86, "grad_norm": 2.308346279825781, "learning_rate": 9.626224787192594e-08, "loss": 0.2351, "step": 3047 }, { "epoch": 0.86, "grad_norm": 2.4120125079732713, "learning_rate": 9.586973533324738e-08, "loss": 0.274, "step": 3048 }, { "epoch": 0.86, "grad_norm": 2.1444026166759813, "learning_rate": 9.547798437701193e-08, "loss": 0.2458, "step": 3049 }, { "epoch": 0.86, "grad_norm": 2.3280997441596494, "learning_rate": 9.508699533320597e-08, "loss": 0.2785, "step": 3050 }, { "epoch": 0.86, "grad_norm": 2.468914423283346, "learning_rate": 9.46967685311737e-08, "loss": 0.3056, "step": 3051 }, { "epoch": 0.86, "grad_norm": 2.251997965826707, "learning_rate": 9.430730429961808e-08, "loss": 0.2508, "step": 3052 }, { "epoch": 0.87, "grad_norm": 2.3898049144957327, "learning_rate": 9.391860296659915e-08, "loss": 0.2694, "step": 3053 }, { "epoch": 0.87, "grad_norm": 2.3671284513441533, "learning_rate": 9.353066485953454e-08, "loss": 0.2544, "step": 3054 }, { "epoch": 0.87, "grad_norm": 2.4648509691775216, "learning_rate": 9.314349030519842e-08, "loss": 0.2771, "step": 3055 }, { "epoch": 0.87, "grad_norm": 2.694199196787725, "learning_rate": 9.275707962972279e-08, "loss": 0.289, "step": 3056 }, { "epoch": 0.87, "grad_norm": 2.2566611611970706, "learning_rate": 9.237143315859552e-08, "loss": 0.2622, "step": 3057 }, { "epoch": 0.87, "grad_norm": 2.7771475571941817, "learning_rate": 9.19865512166611e-08, "loss": 0.2728, "step": 3058 }, { "epoch": 0.87, "grad_norm": 2.395294527739055, "learning_rate": 9.160243412811952e-08, "loss": 0.2783, "step": 3059 }, { "epoch": 0.87, "grad_norm": 2.342150068379509, "learning_rate": 9.121908221652674e-08, "loss": 0.2592, "step": 3060 }, { "epoch": 0.87, "grad_norm": 3.3201108482311854, "learning_rate": 9.083649580479491e-08, "loss": 0.2906, "step": 3061 }, { "epoch": 0.87, "grad_norm": 2.38692281040695, "learning_rate": 9.045467521519045e-08, "loss": 0.2888, "step": 3062 }, { "epoch": 0.87, "grad_norm": 5.2488795924967935, "learning_rate": 9.00736207693349e-08, "loss": 0.2834, "step": 3063 }, { "epoch": 0.87, "grad_norm": 2.5617539242024745, "learning_rate": 8.969333278820445e-08, "loss": 0.281, "step": 3064 }, { "epoch": 0.87, "grad_norm": 2.2762094504720207, "learning_rate": 8.931381159212981e-08, "loss": 0.272, "step": 3065 }, { "epoch": 0.87, "grad_norm": 2.397954895380229, "learning_rate": 8.893505750079622e-08, "loss": 0.2529, "step": 3066 }, { "epoch": 0.87, "grad_norm": 2.451818318013084, "learning_rate": 8.855707083324181e-08, "loss": 0.2849, "step": 3067 }, { "epoch": 0.87, "grad_norm": 2.330369584868695, "learning_rate": 8.817985190785882e-08, "loss": 0.2503, "step": 3068 }, { "epoch": 0.87, "grad_norm": 2.347047792958819, "learning_rate": 8.780340104239282e-08, "loss": 0.2951, "step": 3069 }, { "epoch": 0.87, "grad_norm": 2.4605820045070477, "learning_rate": 8.742771855394204e-08, "loss": 0.2863, "step": 3070 }, { "epoch": 0.87, "grad_norm": 2.346271559452563, "learning_rate": 8.705280475895848e-08, "loss": 0.2666, "step": 3071 }, { "epoch": 0.87, "grad_norm": 2.3650878311332355, "learning_rate": 8.66786599732453e-08, "loss": 0.2865, "step": 3072 }, { "epoch": 0.87, "grad_norm": 2.4735231589506377, "learning_rate": 8.630528451195873e-08, "loss": 0.2652, "step": 3073 }, { "epoch": 0.87, "grad_norm": 2.1078553801134716, "learning_rate": 8.593267868960674e-08, "loss": 0.2686, "step": 3074 }, { "epoch": 0.87, "grad_norm": 2.367953894551575, "learning_rate": 8.556084282004905e-08, "loss": 0.2442, "step": 3075 }, { "epoch": 0.87, "grad_norm": 2.6606227224227568, "learning_rate": 8.518977721649679e-08, "loss": 0.2446, "step": 3076 }, { "epoch": 0.87, "grad_norm": 6.555949455627507, "learning_rate": 8.481948219151225e-08, "loss": 0.3033, "step": 3077 }, { "epoch": 0.87, "grad_norm": 2.900102774963622, "learning_rate": 8.444995805700872e-08, "loss": 0.2805, "step": 3078 }, { "epoch": 0.87, "grad_norm": 2.2788626417574056, "learning_rate": 8.408120512424999e-08, "loss": 0.2559, "step": 3079 }, { "epoch": 0.87, "grad_norm": 2.240323058843195, "learning_rate": 8.371322370385048e-08, "loss": 0.2779, "step": 3080 }, { "epoch": 0.87, "grad_norm": 2.3898302231497137, "learning_rate": 8.334601410577436e-08, "loss": 0.2853, "step": 3081 }, { "epoch": 0.87, "grad_norm": 2.3482574141862327, "learning_rate": 8.297957663933608e-08, "loss": 0.2752, "step": 3082 }, { "epoch": 0.87, "grad_norm": 2.319169634223261, "learning_rate": 8.261391161319941e-08, "loss": 0.24, "step": 3083 }, { "epoch": 0.87, "grad_norm": 2.3155577880334555, "learning_rate": 8.224901933537776e-08, "loss": 0.2726, "step": 3084 }, { "epoch": 0.87, "grad_norm": 2.2189740851909225, "learning_rate": 8.18849001132329e-08, "loss": 0.2405, "step": 3085 }, { "epoch": 0.87, "grad_norm": 2.4746209392963223, "learning_rate": 8.15215542534765e-08, "loss": 0.268, "step": 3086 }, { "epoch": 0.87, "grad_norm": 2.180319378355328, "learning_rate": 8.115898206216798e-08, "loss": 0.2591, "step": 3087 }, { "epoch": 0.87, "grad_norm": 2.343287819286448, "learning_rate": 8.079718384471557e-08, "loss": 0.2634, "step": 3088 }, { "epoch": 0.88, "grad_norm": 2.357102540519274, "learning_rate": 8.043615990587494e-08, "loss": 0.2686, "step": 3089 }, { "epoch": 0.88, "grad_norm": 2.3110864751314604, "learning_rate": 8.007591054975016e-08, "loss": 0.2883, "step": 3090 }, { "epoch": 0.88, "grad_norm": 3.5614262632707647, "learning_rate": 7.971643607979273e-08, "loss": 0.2842, "step": 3091 }, { "epoch": 0.88, "grad_norm": 2.335392757844915, "learning_rate": 7.93577367988012e-08, "loss": 0.2757, "step": 3092 }, { "epoch": 0.88, "grad_norm": 2.3085814088282475, "learning_rate": 7.899981300892144e-08, "loss": 0.258, "step": 3093 }, { "epoch": 0.88, "grad_norm": 2.1934363925819382, "learning_rate": 7.86426650116454e-08, "loss": 0.2362, "step": 3094 }, { "epoch": 0.88, "grad_norm": 2.1467726540158205, "learning_rate": 7.828629310781265e-08, "loss": 0.2312, "step": 3095 }, { "epoch": 0.88, "grad_norm": 2.36570585706135, "learning_rate": 7.793069759760829e-08, "loss": 0.2702, "step": 3096 }, { "epoch": 0.88, "grad_norm": 2.3436174242214376, "learning_rate": 7.75758787805637e-08, "loss": 0.2409, "step": 3097 }, { "epoch": 0.88, "grad_norm": 2.430656309119998, "learning_rate": 7.722183695555562e-08, "loss": 0.2931, "step": 3098 }, { "epoch": 0.88, "grad_norm": 2.4954309256933866, "learning_rate": 7.686857242080669e-08, "loss": 0.2587, "step": 3099 }, { "epoch": 0.88, "grad_norm": 2.2823343280136217, "learning_rate": 7.651608547388489e-08, "loss": 0.2446, "step": 3100 }, { "epoch": 0.88, "grad_norm": 2.4514656994841872, "learning_rate": 7.616437641170315e-08, "loss": 0.2562, "step": 3101 }, { "epoch": 0.88, "grad_norm": 2.3284698907360495, "learning_rate": 7.581344553051871e-08, "loss": 0.2606, "step": 3102 }, { "epoch": 0.88, "grad_norm": 2.6800251316823456, "learning_rate": 7.54632931259338e-08, "loss": 0.2645, "step": 3103 }, { "epoch": 0.88, "grad_norm": 2.5123704990491103, "learning_rate": 7.51139194928947e-08, "loss": 0.2784, "step": 3104 }, { "epoch": 0.88, "grad_norm": 2.3995486614027737, "learning_rate": 7.47653249256922e-08, "loss": 0.2573, "step": 3105 }, { "epoch": 0.88, "grad_norm": 2.2445829265036874, "learning_rate": 7.44175097179599e-08, "loss": 0.2652, "step": 3106 }, { "epoch": 0.88, "grad_norm": 2.516997912739564, "learning_rate": 7.407047416267564e-08, "loss": 0.2722, "step": 3107 }, { "epoch": 0.88, "grad_norm": 2.197115715153637, "learning_rate": 7.372421855216037e-08, "loss": 0.251, "step": 3108 }, { "epoch": 0.88, "grad_norm": 2.3742140188304464, "learning_rate": 7.337874317807802e-08, "loss": 0.2825, "step": 3109 }, { "epoch": 0.88, "grad_norm": 2.1502066887265965, "learning_rate": 7.303404833143522e-08, "loss": 0.242, "step": 3110 }, { "epoch": 0.88, "grad_norm": 2.580142884634076, "learning_rate": 7.269013430258131e-08, "loss": 0.3137, "step": 3111 }, { "epoch": 0.88, "grad_norm": 2.3035826394343055, "learning_rate": 7.234700138120776e-08, "loss": 0.2367, "step": 3112 }, { "epoch": 0.88, "grad_norm": 2.3843530141893963, "learning_rate": 7.200464985634824e-08, "loss": 0.2524, "step": 3113 }, { "epoch": 0.88, "grad_norm": 2.3209743886788656, "learning_rate": 7.166308001637811e-08, "loss": 0.2899, "step": 3114 }, { "epoch": 0.88, "grad_norm": 2.3303005873390066, "learning_rate": 7.13222921490142e-08, "loss": 0.2637, "step": 3115 }, { "epoch": 0.88, "grad_norm": 2.426996076177478, "learning_rate": 7.098228654131488e-08, "loss": 0.2905, "step": 3116 }, { "epoch": 0.88, "grad_norm": 2.2879093080690045, "learning_rate": 7.064306347967952e-08, "loss": 0.292, "step": 3117 }, { "epoch": 0.88, "grad_norm": 2.337757585829966, "learning_rate": 7.03046232498482e-08, "loss": 0.248, "step": 3118 }, { "epoch": 0.88, "grad_norm": 2.603799211351658, "learning_rate": 6.996696613690156e-08, "loss": 0.2754, "step": 3119 }, { "epoch": 0.88, "grad_norm": 2.432035830910913, "learning_rate": 6.963009242526096e-08, "loss": 0.2708, "step": 3120 }, { "epoch": 0.88, "grad_norm": 2.4988923447017273, "learning_rate": 6.929400239868743e-08, "loss": 0.2578, "step": 3121 }, { "epoch": 0.88, "grad_norm": 2.1903315111152, "learning_rate": 6.895869634028217e-08, "loss": 0.2433, "step": 3122 }, { "epoch": 0.88, "grad_norm": 2.182167491069264, "learning_rate": 6.862417453248593e-08, "loss": 0.224, "step": 3123 }, { "epoch": 0.89, "grad_norm": 2.23234379480758, "learning_rate": 6.82904372570785e-08, "loss": 0.2661, "step": 3124 }, { "epoch": 0.89, "grad_norm": 2.1799081969004988, "learning_rate": 6.79574847951796e-08, "loss": 0.2723, "step": 3125 }, { "epoch": 0.89, "grad_norm": 2.6825595419677004, "learning_rate": 6.76253174272472e-08, "loss": 0.2784, "step": 3126 }, { "epoch": 0.89, "grad_norm": 2.5886007659552006, "learning_rate": 6.729393543307837e-08, "loss": 0.3001, "step": 3127 }, { "epoch": 0.89, "grad_norm": 2.4011926903124863, "learning_rate": 6.696333909180796e-08, "loss": 0.254, "step": 3128 }, { "epoch": 0.89, "grad_norm": 2.379893316621335, "learning_rate": 6.663352868191008e-08, "loss": 0.2617, "step": 3129 }, { "epoch": 0.89, "grad_norm": 2.6057157508985167, "learning_rate": 6.630450448119617e-08, "loss": 0.2767, "step": 3130 }, { "epoch": 0.89, "grad_norm": 2.3009825684020466, "learning_rate": 6.597626676681545e-08, "loss": 0.2459, "step": 3131 }, { "epoch": 0.89, "grad_norm": 2.3740096091255003, "learning_rate": 6.564881581525449e-08, "loss": 0.2746, "step": 3132 }, { "epoch": 0.89, "grad_norm": 2.665005965150247, "learning_rate": 6.532215190233747e-08, "loss": 0.3207, "step": 3133 }, { "epoch": 0.89, "grad_norm": 2.319434599678059, "learning_rate": 6.499627530322582e-08, "loss": 0.2632, "step": 3134 }, { "epoch": 0.89, "grad_norm": 2.684205728247248, "learning_rate": 6.467118629241718e-08, "loss": 0.2644, "step": 3135 }, { "epoch": 0.89, "grad_norm": 2.44892949559697, "learning_rate": 6.434688514374632e-08, "loss": 0.262, "step": 3136 }, { "epoch": 0.89, "grad_norm": 2.3548233221374733, "learning_rate": 6.402337213038378e-08, "loss": 0.2764, "step": 3137 }, { "epoch": 0.89, "grad_norm": 2.480068951853458, "learning_rate": 6.370064752483661e-08, "loss": 0.2784, "step": 3138 }, { "epoch": 0.89, "grad_norm": 2.4592994564515664, "learning_rate": 6.337871159894803e-08, "loss": 0.2834, "step": 3139 }, { "epoch": 0.89, "grad_norm": 2.2792956091900307, "learning_rate": 6.305756462389644e-08, "loss": 0.2547, "step": 3140 }, { "epoch": 0.89, "grad_norm": 2.4306357067447553, "learning_rate": 6.273720687019579e-08, "loss": 0.2767, "step": 3141 }, { "epoch": 0.89, "grad_norm": 2.379110567627786, "learning_rate": 6.241763860769534e-08, "loss": 0.2725, "step": 3142 }, { "epoch": 0.89, "grad_norm": 2.2710952274288494, "learning_rate": 6.209886010557907e-08, "loss": 0.2485, "step": 3143 }, { "epoch": 0.89, "grad_norm": 2.5652561077781746, "learning_rate": 6.178087163236645e-08, "loss": 0.252, "step": 3144 }, { "epoch": 0.89, "grad_norm": 2.2341453207739805, "learning_rate": 6.146367345591053e-08, "loss": 0.253, "step": 3145 }, { "epoch": 0.89, "grad_norm": 2.3158832187436107, "learning_rate": 6.114726584339913e-08, "loss": 0.2655, "step": 3146 }, { "epoch": 0.89, "grad_norm": 2.4445635266013626, "learning_rate": 6.08316490613543e-08, "loss": 0.2669, "step": 3147 }, { "epoch": 0.89, "grad_norm": 2.304129154768925, "learning_rate": 6.051682337563158e-08, "loss": 0.2705, "step": 3148 }, { "epoch": 0.89, "grad_norm": 2.5757049619729018, "learning_rate": 6.02027890514204e-08, "loss": 0.2662, "step": 3149 }, { "epoch": 0.89, "grad_norm": 2.565737017154383, "learning_rate": 5.988954635324351e-08, "loss": 0.3128, "step": 3150 }, { "epoch": 0.89, "grad_norm": 2.3260226124352856, "learning_rate": 5.957709554495682e-08, "loss": 0.2633, "step": 3151 }, { "epoch": 0.89, "grad_norm": 2.3858294402669435, "learning_rate": 5.926543688974928e-08, "loss": 0.264, "step": 3152 }, { "epoch": 0.89, "grad_norm": 5.961869575469567, "learning_rate": 5.8954570650142424e-08, "loss": 0.2629, "step": 3153 }, { "epoch": 0.89, "grad_norm": 2.474005827500162, "learning_rate": 5.864449708799057e-08, "loss": 0.2641, "step": 3154 }, { "epoch": 0.89, "grad_norm": 2.4705398326799717, "learning_rate": 5.833521646448003e-08, "loss": 0.2926, "step": 3155 }, { "epoch": 0.89, "grad_norm": 2.319310223557441, "learning_rate": 5.8026729040129506e-08, "loss": 0.2458, "step": 3156 }, { "epoch": 0.89, "grad_norm": 2.411383748029571, "learning_rate": 5.771903507478915e-08, "loss": 0.3025, "step": 3157 }, { "epoch": 0.89, "grad_norm": 6.037524189709264, "learning_rate": 5.741213482764118e-08, "loss": 0.2661, "step": 3158 }, { "epoch": 0.9, "grad_norm": 2.534264378285265, "learning_rate": 5.7106028557199036e-08, "loss": 0.2815, "step": 3159 }, { "epoch": 0.9, "grad_norm": 2.4799284033923366, "learning_rate": 5.6800716521307356e-08, "loss": 0.2857, "step": 3160 }, { "epoch": 0.9, "grad_norm": 2.4307602896415217, "learning_rate": 5.649619897714186e-08, "loss": 0.2595, "step": 3161 }, { "epoch": 0.9, "grad_norm": 2.309911730440573, "learning_rate": 5.61924761812087e-08, "loss": 0.2572, "step": 3162 }, { "epoch": 0.9, "grad_norm": 2.2708443975298596, "learning_rate": 5.588954838934523e-08, "loss": 0.2263, "step": 3163 }, { "epoch": 0.9, "grad_norm": 2.3928996224840224, "learning_rate": 5.558741585671845e-08, "loss": 0.262, "step": 3164 }, { "epoch": 0.9, "grad_norm": 2.400883941359423, "learning_rate": 5.528607883782599e-08, "loss": 0.2699, "step": 3165 }, { "epoch": 0.9, "grad_norm": 2.3632036616640164, "learning_rate": 5.4985537586495157e-08, "loss": 0.2488, "step": 3166 }, { "epoch": 0.9, "grad_norm": 2.4002704113750326, "learning_rate": 5.4685792355882664e-08, "loss": 0.2626, "step": 3167 }, { "epoch": 0.9, "grad_norm": 2.4063425935418663, "learning_rate": 5.438684339847555e-08, "loss": 0.2591, "step": 3168 }, { "epoch": 0.9, "grad_norm": 2.430614312790665, "learning_rate": 5.4088690966089254e-08, "loss": 0.2611, "step": 3169 }, { "epoch": 0.9, "grad_norm": 2.419754243863106, "learning_rate": 5.379133530986901e-08, "loss": 0.2727, "step": 3170 }, { "epoch": 0.9, "grad_norm": 2.4662220396431747, "learning_rate": 5.349477668028801e-08, "loss": 0.283, "step": 3171 }, { "epoch": 0.9, "grad_norm": 2.3426309066637545, "learning_rate": 5.319901532714877e-08, "loss": 0.2701, "step": 3172 }, { "epoch": 0.9, "grad_norm": 2.4592232084554415, "learning_rate": 5.2904051499582105e-08, "loss": 0.2341, "step": 3173 }, { "epoch": 0.9, "grad_norm": 2.46097236692037, "learning_rate": 5.2609885446047165e-08, "loss": 0.2851, "step": 3174 }, { "epoch": 0.9, "grad_norm": 2.320945269240666, "learning_rate": 5.231651741433063e-08, "loss": 0.2515, "step": 3175 }, { "epoch": 0.9, "grad_norm": 2.6406931382496315, "learning_rate": 5.2023947651547275e-08, "loss": 0.2882, "step": 3176 }, { "epoch": 0.9, "grad_norm": 2.3858195376057347, "learning_rate": 5.17321764041394e-08, "loss": 0.249, "step": 3177 }, { "epoch": 0.9, "grad_norm": 2.2130979645960913, "learning_rate": 5.144120391787732e-08, "loss": 0.2428, "step": 3178 }, { "epoch": 0.9, "grad_norm": 2.4254984080961957, "learning_rate": 5.115103043785718e-08, "loss": 0.2485, "step": 3179 }, { "epoch": 0.9, "grad_norm": 2.3045501628031704, "learning_rate": 5.086165620850336e-08, "loss": 0.2768, "step": 3180 }, { "epoch": 0.9, "grad_norm": 2.2144689483497935, "learning_rate": 5.0573081473566315e-08, "loss": 0.2846, "step": 3181 }, { "epoch": 0.9, "grad_norm": 2.3229900518667357, "learning_rate": 5.028530647612306e-08, "loss": 0.2937, "step": 3182 }, { "epoch": 0.9, "grad_norm": 2.480173992557888, "learning_rate": 4.999833145857768e-08, "loss": 0.2651, "step": 3183 }, { "epoch": 0.9, "grad_norm": 2.2829017879265145, "learning_rate": 4.971215666265938e-08, "loss": 0.2393, "step": 3184 }, { "epoch": 0.9, "grad_norm": 2.698922135891398, "learning_rate": 4.942678232942399e-08, "loss": 0.2879, "step": 3185 }, { "epoch": 0.9, "grad_norm": 2.409469865808042, "learning_rate": 4.9142208699252893e-08, "loss": 0.2864, "step": 3186 }, { "epoch": 0.9, "grad_norm": 2.405615517945202, "learning_rate": 4.885843601185291e-08, "loss": 0.2805, "step": 3187 }, { "epoch": 0.9, "grad_norm": 2.347723176787636, "learning_rate": 4.857546450625649e-08, "loss": 0.2615, "step": 3188 }, { "epoch": 0.9, "grad_norm": 2.2570397287056916, "learning_rate": 4.8293294420820754e-08, "loss": 0.255, "step": 3189 }, { "epoch": 0.9, "grad_norm": 2.386877051850636, "learning_rate": 4.801192599322834e-08, "loss": 0.2543, "step": 3190 }, { "epoch": 0.9, "grad_norm": 2.4355257643620947, "learning_rate": 4.773135946048601e-08, "loss": 0.2548, "step": 3191 }, { "epoch": 0.9, "grad_norm": 2.383789499942686, "learning_rate": 4.7451595058925594e-08, "loss": 0.2642, "step": 3192 }, { "epoch": 0.9, "grad_norm": 2.22476355531068, "learning_rate": 4.717263302420282e-08, "loss": 0.2319, "step": 3193 }, { "epoch": 0.91, "grad_norm": 2.3890981659570376, "learning_rate": 4.689447359129794e-08, "loss": 0.2672, "step": 3194 }, { "epoch": 0.91, "grad_norm": 2.4735293733403685, "learning_rate": 4.661711699451476e-08, "loss": 0.2716, "step": 3195 }, { "epoch": 0.91, "grad_norm": 2.225349534764869, "learning_rate": 4.6340563467481164e-08, "loss": 0.2744, "step": 3196 }, { "epoch": 0.91, "grad_norm": 2.450129758872958, "learning_rate": 4.606481324314848e-08, "loss": 0.2513, "step": 3197 }, { "epoch": 0.91, "grad_norm": 2.2568166974492883, "learning_rate": 4.5789866553791245e-08, "loss": 0.2402, "step": 3198 }, { "epoch": 0.91, "grad_norm": 2.633689105888111, "learning_rate": 4.551572363100731e-08, "loss": 0.2336, "step": 3199 }, { "epoch": 0.91, "grad_norm": 2.3558166985162363, "learning_rate": 4.52423847057174e-08, "loss": 0.2382, "step": 3200 }, { "epoch": 0.91, "grad_norm": 2.4980777479857506, "learning_rate": 4.496985000816489e-08, "loss": 0.3011, "step": 3201 }, { "epoch": 0.91, "grad_norm": 2.4025661382784196, "learning_rate": 4.469811976791604e-08, "loss": 0.3135, "step": 3202 }, { "epoch": 0.91, "grad_norm": 2.288906258196443, "learning_rate": 4.442719421385921e-08, "loss": 0.2783, "step": 3203 }, { "epoch": 0.91, "grad_norm": 2.3401562728012664, "learning_rate": 4.415707357420517e-08, "loss": 0.2381, "step": 3204 }, { "epoch": 0.91, "grad_norm": 2.2560055476064536, "learning_rate": 4.388775807648659e-08, "loss": 0.2676, "step": 3205 }, { "epoch": 0.91, "grad_norm": 2.307661722100612, "learning_rate": 4.3619247947557445e-08, "loss": 0.2451, "step": 3206 }, { "epoch": 0.91, "grad_norm": 2.748113485649648, "learning_rate": 4.3351543413594263e-08, "loss": 0.2583, "step": 3207 }, { "epoch": 0.91, "grad_norm": 2.2513931431439382, "learning_rate": 4.308464470009432e-08, "loss": 0.2774, "step": 3208 }, { "epoch": 0.91, "grad_norm": 2.502034598201456, "learning_rate": 4.2818552031876454e-08, "loss": 0.2926, "step": 3209 }, { "epoch": 0.91, "grad_norm": 2.291150374878783, "learning_rate": 4.2553265633080146e-08, "loss": 0.2662, "step": 3210 }, { "epoch": 0.91, "grad_norm": 2.5215967746639354, "learning_rate": 4.228878572716588e-08, "loss": 0.266, "step": 3211 }, { "epoch": 0.91, "grad_norm": 2.349662419335486, "learning_rate": 4.202511253691521e-08, "loss": 0.2708, "step": 3212 }, { "epoch": 0.91, "grad_norm": 2.3748199771039564, "learning_rate": 4.176224628442981e-08, "loss": 0.2805, "step": 3213 }, { "epoch": 0.91, "grad_norm": 2.3378089840973058, "learning_rate": 4.150018719113147e-08, "loss": 0.2765, "step": 3214 }, { "epoch": 0.91, "grad_norm": 2.4869406885817775, "learning_rate": 4.123893547776236e-08, "loss": 0.2708, "step": 3215 }, { "epoch": 0.91, "grad_norm": 2.3636653162321783, "learning_rate": 4.097849136438436e-08, "loss": 0.2616, "step": 3216 }, { "epoch": 0.91, "grad_norm": 2.322882191550021, "learning_rate": 4.071885507037953e-08, "loss": 0.2479, "step": 3217 }, { "epoch": 0.91, "grad_norm": 2.6038193510966265, "learning_rate": 4.0460026814448934e-08, "loss": 0.3161, "step": 3218 }, { "epoch": 0.91, "grad_norm": 2.9175710529853762, "learning_rate": 4.0202006814613165e-08, "loss": 0.3271, "step": 3219 }, { "epoch": 0.91, "grad_norm": 2.2335773040988847, "learning_rate": 3.994479528821204e-08, "loss": 0.2589, "step": 3220 }, { "epoch": 0.91, "grad_norm": 2.29518040874271, "learning_rate": 3.9688392451904475e-08, "loss": 0.2662, "step": 3221 }, { "epoch": 0.91, "grad_norm": 2.4345206626996085, "learning_rate": 3.943279852166803e-08, "loss": 0.2724, "step": 3222 }, { "epoch": 0.91, "grad_norm": 2.619129736310827, "learning_rate": 3.917801371279894e-08, "loss": 0.2775, "step": 3223 }, { "epoch": 0.91, "grad_norm": 2.2320266622210374, "learning_rate": 3.8924038239911975e-08, "loss": 0.2542, "step": 3224 }, { "epoch": 0.91, "grad_norm": 2.4111141473121123, "learning_rate": 3.8670872316939885e-08, "loss": 0.2827, "step": 3225 }, { "epoch": 0.91, "grad_norm": 2.453538199319764, "learning_rate": 3.841851615713398e-08, "loss": 0.3011, "step": 3226 }, { "epoch": 0.91, "grad_norm": 2.3560675605285444, "learning_rate": 3.816696997306301e-08, "loss": 0.2999, "step": 3227 }, { "epoch": 0.91, "grad_norm": 2.500501438703021, "learning_rate": 3.79162339766137e-08, "loss": 0.2847, "step": 3228 }, { "epoch": 0.91, "grad_norm": 2.4567324123602363, "learning_rate": 3.766630837899032e-08, "loss": 0.2924, "step": 3229 }, { "epoch": 0.92, "grad_norm": 2.369206056840079, "learning_rate": 3.7417193390714476e-08, "loss": 0.2833, "step": 3230 }, { "epoch": 0.92, "grad_norm": 2.3537524798078473, "learning_rate": 3.716888922162487e-08, "loss": 0.2884, "step": 3231 }, { "epoch": 0.92, "grad_norm": 2.2654121630396, "learning_rate": 3.692139608087741e-08, "loss": 0.2761, "step": 3232 }, { "epoch": 0.92, "grad_norm": 2.367419390871166, "learning_rate": 3.667471417694468e-08, "loss": 0.2457, "step": 3233 }, { "epoch": 0.92, "grad_norm": 2.2877453429907635, "learning_rate": 3.642884371761601e-08, "loss": 0.254, "step": 3234 }, { "epoch": 0.92, "grad_norm": 2.573930551823851, "learning_rate": 3.6183784909997187e-08, "loss": 0.2634, "step": 3235 }, { "epoch": 0.92, "grad_norm": 2.366805500727652, "learning_rate": 3.593953796051041e-08, "loss": 0.257, "step": 3236 }, { "epoch": 0.92, "grad_norm": 5.863808786508515, "learning_rate": 3.5696103074893793e-08, "loss": 0.2966, "step": 3237 }, { "epoch": 0.92, "grad_norm": 2.566493498708618, "learning_rate": 3.545348045820173e-08, "loss": 0.2675, "step": 3238 }, { "epoch": 0.92, "grad_norm": 2.4598395736526837, "learning_rate": 3.521167031480432e-08, "loss": 0.2886, "step": 3239 }, { "epoch": 0.92, "grad_norm": 2.2523848463832543, "learning_rate": 3.497067284838673e-08, "loss": 0.2659, "step": 3240 }, { "epoch": 0.92, "grad_norm": 2.4175830736828057, "learning_rate": 3.4730488261950574e-08, "loss": 0.2866, "step": 3241 }, { "epoch": 0.92, "grad_norm": 2.307963198877353, "learning_rate": 3.449111675781202e-08, "loss": 0.2474, "step": 3242 }, { "epoch": 0.92, "grad_norm": 2.220977889209224, "learning_rate": 3.4252558537602786e-08, "loss": 0.2501, "step": 3243 }, { "epoch": 0.92, "grad_norm": 2.3093402757426995, "learning_rate": 3.401481380226889e-08, "loss": 0.2674, "step": 3244 }, { "epoch": 0.92, "grad_norm": 2.1868144442873, "learning_rate": 3.3777882752071715e-08, "loss": 0.2605, "step": 3245 }, { "epoch": 0.92, "grad_norm": 2.219309920824285, "learning_rate": 3.354176558658728e-08, "loss": 0.2621, "step": 3246 }, { "epoch": 0.92, "grad_norm": 2.5444203511053654, "learning_rate": 3.33064625047057e-08, "loss": 0.2821, "step": 3247 }, { "epoch": 0.92, "grad_norm": 2.4616798789610947, "learning_rate": 3.307197370463133e-08, "loss": 0.2793, "step": 3248 }, { "epoch": 0.92, "grad_norm": 2.300724708712612, "learning_rate": 3.283829938388294e-08, "loss": 0.2561, "step": 3249 }, { "epoch": 0.92, "grad_norm": 3.3329227624540954, "learning_rate": 3.260543973929286e-08, "loss": 0.2779, "step": 3250 }, { "epoch": 0.92, "grad_norm": 2.3194303907345537, "learning_rate": 3.237339496700775e-08, "loss": 0.2392, "step": 3251 }, { "epoch": 0.92, "grad_norm": 2.602542644339946, "learning_rate": 3.2142165262487365e-08, "loss": 0.3197, "step": 3252 }, { "epoch": 0.92, "grad_norm": 2.38983381123609, "learning_rate": 3.1911750820505015e-08, "loss": 0.2636, "step": 3253 }, { "epoch": 0.92, "grad_norm": 2.1980480700182974, "learning_rate": 3.168215183514733e-08, "loss": 0.2387, "step": 3254 }, { "epoch": 0.92, "grad_norm": 2.278995719811263, "learning_rate": 3.145336849981395e-08, "loss": 0.2432, "step": 3255 }, { "epoch": 0.92, "grad_norm": 2.3941204185182294, "learning_rate": 3.1225401007217934e-08, "loss": 0.2821, "step": 3256 }, { "epoch": 0.92, "grad_norm": 2.36472481649528, "learning_rate": 3.0998249549384346e-08, "loss": 0.2432, "step": 3257 }, { "epoch": 0.92, "grad_norm": 2.261121367712428, "learning_rate": 3.077191431765147e-08, "loss": 0.266, "step": 3258 }, { "epoch": 0.92, "grad_norm": 2.440840562496626, "learning_rate": 3.0546395502669795e-08, "loss": 0.2726, "step": 3259 }, { "epoch": 0.92, "grad_norm": 2.416700603257129, "learning_rate": 3.032169329440226e-08, "loss": 0.278, "step": 3260 }, { "epoch": 0.92, "grad_norm": 2.9480803404548483, "learning_rate": 3.009780788212379e-08, "loss": 0.2961, "step": 3261 }, { "epoch": 0.92, "grad_norm": 2.3246552045964135, "learning_rate": 2.9874739454421424e-08, "loss": 0.2645, "step": 3262 }, { "epoch": 0.92, "grad_norm": 2.2128528098505083, "learning_rate": 2.965248819919397e-08, "loss": 0.2527, "step": 3263 }, { "epoch": 0.92, "grad_norm": 2.2228799005968125, "learning_rate": 2.943105430365178e-08, "loss": 0.2694, "step": 3264 }, { "epoch": 0.93, "grad_norm": 2.3867533631022613, "learning_rate": 2.921043795431699e-08, "loss": 0.2819, "step": 3265 }, { "epoch": 0.93, "grad_norm": 2.3560004236511376, "learning_rate": 2.8990639337022838e-08, "loss": 0.2677, "step": 3266 }, { "epoch": 0.93, "grad_norm": 2.484908011066342, "learning_rate": 2.8771658636913886e-08, "loss": 0.2858, "step": 3267 }, { "epoch": 0.93, "grad_norm": 2.414734426146304, "learning_rate": 2.85534960384457e-08, "loss": 0.2608, "step": 3268 }, { "epoch": 0.93, "grad_norm": 2.384125145372157, "learning_rate": 2.8336151725384727e-08, "loss": 0.238, "step": 3269 }, { "epoch": 0.93, "grad_norm": 2.539193357795301, "learning_rate": 2.8119625880808183e-08, "loss": 0.2764, "step": 3270 }, { "epoch": 0.93, "grad_norm": 2.383186119315387, "learning_rate": 2.7903918687103733e-08, "loss": 0.2722, "step": 3271 }, { "epoch": 0.93, "grad_norm": 2.1790139505551247, "learning_rate": 2.7689030325969476e-08, "loss": 0.2391, "step": 3272 }, { "epoch": 0.93, "grad_norm": 2.7517357675515552, "learning_rate": 2.7474960978414064e-08, "loss": 0.2607, "step": 3273 }, { "epoch": 0.93, "grad_norm": 2.4227957748137516, "learning_rate": 2.7261710824755812e-08, "loss": 0.2766, "step": 3274 }, { "epoch": 0.93, "grad_norm": 2.236432074590604, "learning_rate": 2.704928004462337e-08, "loss": 0.2739, "step": 3275 }, { "epoch": 0.93, "grad_norm": 4.958590582391775, "learning_rate": 2.683766881695504e-08, "loss": 0.2751, "step": 3276 }, { "epoch": 0.93, "grad_norm": 2.3924049229226285, "learning_rate": 2.6626877319998798e-08, "loss": 0.2794, "step": 3277 }, { "epoch": 0.93, "grad_norm": 2.3948885814981016, "learning_rate": 2.641690573131228e-08, "loss": 0.2757, "step": 3278 }, { "epoch": 0.93, "grad_norm": 2.2237442319076095, "learning_rate": 2.6207754227761892e-08, "loss": 0.26, "step": 3279 }, { "epoch": 0.93, "grad_norm": 2.2420191639484632, "learning_rate": 2.5999422985524157e-08, "loss": 0.2707, "step": 3280 }, { "epoch": 0.93, "grad_norm": 2.3290621853224764, "learning_rate": 2.579191218008403e-08, "loss": 0.2739, "step": 3281 }, { "epoch": 0.93, "grad_norm": 2.347622446698043, "learning_rate": 2.5585221986235693e-08, "loss": 0.2533, "step": 3282 }, { "epoch": 0.93, "grad_norm": 2.323131153292077, "learning_rate": 2.537935257808177e-08, "loss": 0.272, "step": 3283 }, { "epoch": 0.93, "grad_norm": 2.318533312951456, "learning_rate": 2.5174304129033653e-08, "loss": 0.2675, "step": 3284 }, { "epoch": 0.93, "grad_norm": 2.481506357351729, "learning_rate": 2.4970076811811513e-08, "loss": 0.2553, "step": 3285 }, { "epoch": 0.93, "grad_norm": 3.351142696730123, "learning_rate": 2.4766670798443412e-08, "loss": 0.2712, "step": 3286 }, { "epoch": 0.93, "grad_norm": 2.319674755862298, "learning_rate": 2.4564086260265847e-08, "loss": 0.2722, "step": 3287 }, { "epoch": 0.93, "grad_norm": 2.387010165950708, "learning_rate": 2.436232336792321e-08, "loss": 0.2832, "step": 3288 }, { "epoch": 0.93, "grad_norm": 2.3906373099226284, "learning_rate": 2.416138229136777e-08, "loss": 0.2621, "step": 3289 }, { "epoch": 0.93, "grad_norm": 2.347750085520224, "learning_rate": 2.3961263199859915e-08, "loss": 0.2609, "step": 3290 }, { "epoch": 0.93, "grad_norm": 2.291141282114224, "learning_rate": 2.3761966261967247e-08, "loss": 0.2441, "step": 3291 }, { "epoch": 0.93, "grad_norm": 2.3569274367279274, "learning_rate": 2.3563491645564925e-08, "loss": 0.2671, "step": 3292 }, { "epoch": 0.93, "grad_norm": 2.3740086526710327, "learning_rate": 2.336583951783555e-08, "loss": 0.2842, "step": 3293 }, { "epoch": 0.93, "grad_norm": 2.245727364756435, "learning_rate": 2.3169010045268723e-08, "loss": 0.2671, "step": 3294 }, { "epoch": 0.93, "grad_norm": 2.3848931241980336, "learning_rate": 2.2973003393661372e-08, "loss": 0.2561, "step": 3295 }, { "epoch": 0.93, "grad_norm": 2.368790604055993, "learning_rate": 2.2777819728116988e-08, "loss": 0.2614, "step": 3296 }, { "epoch": 0.93, "grad_norm": 2.5764287797220984, "learning_rate": 2.2583459213046162e-08, "loss": 0.288, "step": 3297 }, { "epoch": 0.93, "grad_norm": 2.8656696962228563, "learning_rate": 2.238992201216594e-08, "loss": 0.2705, "step": 3298 }, { "epoch": 0.93, "grad_norm": 2.344589554713833, "learning_rate": 2.219720828849969e-08, "loss": 0.2329, "step": 3299 }, { "epoch": 0.94, "grad_norm": 2.388564658275402, "learning_rate": 2.2005318204377565e-08, "loss": 0.289, "step": 3300 }, { "epoch": 0.94, "grad_norm": 2.5719941834626807, "learning_rate": 2.18142519214356e-08, "loss": 0.3029, "step": 3301 }, { "epoch": 0.94, "grad_norm": 2.4035422211888378, "learning_rate": 2.1624009600616056e-08, "loss": 0.2601, "step": 3302 }, { "epoch": 0.94, "grad_norm": 2.5636385948099423, "learning_rate": 2.1434591402166967e-08, "loss": 0.286, "step": 3303 }, { "epoch": 0.94, "grad_norm": 2.3912117460001006, "learning_rate": 2.1245997485642485e-08, "loss": 0.2394, "step": 3304 }, { "epoch": 0.94, "grad_norm": 2.5857138602537892, "learning_rate": 2.1058228009902092e-08, "loss": 0.2813, "step": 3305 }, { "epoch": 0.94, "grad_norm": 2.2418195013942563, "learning_rate": 2.087128313311115e-08, "loss": 0.2754, "step": 3306 }, { "epoch": 0.94, "grad_norm": 2.411005422972666, "learning_rate": 2.0685163012740036e-08, "loss": 0.2702, "step": 3307 }, { "epoch": 0.94, "grad_norm": 2.3891746886806846, "learning_rate": 2.0499867805564784e-08, "loss": 0.2797, "step": 3308 }, { "epoch": 0.94, "grad_norm": 2.2603837327608254, "learning_rate": 2.0315397667666433e-08, "loss": 0.2452, "step": 3309 }, { "epoch": 0.94, "grad_norm": 2.4929320500854577, "learning_rate": 2.013175275443102e-08, "loss": 0.2558, "step": 3310 }, { "epoch": 0.94, "grad_norm": 2.569810976957224, "learning_rate": 1.9948933220549248e-08, "loss": 0.2974, "step": 3311 }, { "epoch": 0.94, "grad_norm": 2.35226642441117, "learning_rate": 1.9766939220017153e-08, "loss": 0.2618, "step": 3312 }, { "epoch": 0.94, "grad_norm": 2.3079105807378486, "learning_rate": 1.9585770906134668e-08, "loss": 0.2746, "step": 3313 }, { "epoch": 0.94, "grad_norm": 2.3644399960317277, "learning_rate": 1.940542843150683e-08, "loss": 0.2819, "step": 3314 }, { "epoch": 0.94, "grad_norm": 2.396111835884552, "learning_rate": 1.9225911948042683e-08, "loss": 0.2613, "step": 3315 }, { "epoch": 0.94, "grad_norm": 2.3463888293024384, "learning_rate": 1.9047221606955712e-08, "loss": 0.2504, "step": 3316 }, { "epoch": 0.94, "grad_norm": 2.3679034793186093, "learning_rate": 1.886935755876329e-08, "loss": 0.2733, "step": 3317 }, { "epoch": 0.94, "grad_norm": 2.259794421485142, "learning_rate": 1.8692319953286906e-08, "loss": 0.2703, "step": 3318 }, { "epoch": 0.94, "grad_norm": 2.3388390277874422, "learning_rate": 1.8516108939651943e-08, "loss": 0.2855, "step": 3319 }, { "epoch": 0.94, "grad_norm": 2.543297868344573, "learning_rate": 1.8340724666287555e-08, "loss": 0.2757, "step": 3320 }, { "epoch": 0.94, "grad_norm": 2.254819797018417, "learning_rate": 1.816616728092646e-08, "loss": 0.2511, "step": 3321 }, { "epoch": 0.94, "grad_norm": 2.3420892520736505, "learning_rate": 1.7992436930604483e-08, "loss": 0.2922, "step": 3322 }, { "epoch": 0.94, "grad_norm": 2.446367425978593, "learning_rate": 1.7819533761661344e-08, "loss": 0.2647, "step": 3323 }, { "epoch": 0.94, "grad_norm": 2.4262242704514434, "learning_rate": 1.7647457919739872e-08, "loss": 0.2687, "step": 3324 }, { "epoch": 0.94, "grad_norm": 2.3242793523904606, "learning_rate": 1.7476209549785903e-08, "loss": 0.2397, "step": 3325 }, { "epoch": 0.94, "grad_norm": 2.426182963897848, "learning_rate": 1.7305788796048272e-08, "loss": 0.2973, "step": 3326 }, { "epoch": 0.94, "grad_norm": 2.703243585335758, "learning_rate": 1.7136195802078478e-08, "loss": 0.2707, "step": 3327 }, { "epoch": 0.94, "grad_norm": 2.543425586558075, "learning_rate": 1.6967430710731258e-08, "loss": 0.2497, "step": 3328 }, { "epoch": 0.94, "grad_norm": 2.214936036687701, "learning_rate": 1.6799493664163668e-08, "loss": 0.2467, "step": 3329 }, { "epoch": 0.94, "grad_norm": 2.394052290637097, "learning_rate": 1.6632384803835332e-08, "loss": 0.257, "step": 3330 }, { "epoch": 0.94, "grad_norm": 2.4616826296848093, "learning_rate": 1.6466104270508098e-08, "loss": 0.235, "step": 3331 }, { "epoch": 0.94, "grad_norm": 2.2261144395575623, "learning_rate": 1.6300652204246255e-08, "loss": 0.2369, "step": 3332 }, { "epoch": 0.94, "grad_norm": 2.3220778215462743, "learning_rate": 1.6136028744416218e-08, "loss": 0.2586, "step": 3333 }, { "epoch": 0.94, "grad_norm": 2.332569555366564, "learning_rate": 1.5972234029686616e-08, "loss": 0.2727, "step": 3334 }, { "epoch": 0.94, "grad_norm": 2.43062826390679, "learning_rate": 1.5809268198027524e-08, "loss": 0.2972, "step": 3335 }, { "epoch": 0.95, "grad_norm": 2.4845491954069296, "learning_rate": 1.5647131386711367e-08, "loss": 0.2835, "step": 3336 }, { "epoch": 0.95, "grad_norm": 2.593439289006337, "learning_rate": 1.5485823732311775e-08, "loss": 0.2638, "step": 3337 }, { "epoch": 0.95, "grad_norm": 2.473215011505199, "learning_rate": 1.532534537070429e-08, "loss": 0.2811, "step": 3338 }, { "epoch": 0.95, "grad_norm": 2.256085966417221, "learning_rate": 1.516569643706578e-08, "loss": 0.2546, "step": 3339 }, { "epoch": 0.95, "grad_norm": 2.4258033040572258, "learning_rate": 1.5006877065874335e-08, "loss": 0.2915, "step": 3340 }, { "epoch": 0.95, "grad_norm": 2.268360402644574, "learning_rate": 1.4848887390909614e-08, "loss": 0.2953, "step": 3341 }, { "epoch": 0.95, "grad_norm": 2.4874133228590356, "learning_rate": 1.4691727545251942e-08, "loss": 0.2913, "step": 3342 }, { "epoch": 0.95, "grad_norm": 2.202250492491265, "learning_rate": 1.4535397661283089e-08, "loss": 0.2529, "step": 3343 }, { "epoch": 0.95, "grad_norm": 2.343475727802179, "learning_rate": 1.4379897870685498e-08, "loss": 0.2372, "step": 3344 }, { "epoch": 0.95, "grad_norm": 2.4606844418448905, "learning_rate": 1.4225228304442172e-08, "loss": 0.2593, "step": 3345 }, { "epoch": 0.95, "grad_norm": 2.3663621030257858, "learning_rate": 1.4071389092837338e-08, "loss": 0.2785, "step": 3346 }, { "epoch": 0.95, "grad_norm": 2.2640297556060873, "learning_rate": 1.3918380365455228e-08, "loss": 0.2446, "step": 3347 }, { "epoch": 0.95, "grad_norm": 2.20404868372011, "learning_rate": 1.3766202251180858e-08, "loss": 0.2756, "step": 3348 }, { "epoch": 0.95, "grad_norm": 2.1297206085618754, "learning_rate": 1.3614854878199577e-08, "loss": 0.2259, "step": 3349 }, { "epoch": 0.95, "grad_norm": 2.4107497897973005, "learning_rate": 1.3464338373996741e-08, "loss": 0.2662, "step": 3350 }, { "epoch": 0.95, "grad_norm": 2.581593802277168, "learning_rate": 1.3314652865358156e-08, "loss": 0.3059, "step": 3351 }, { "epoch": 0.95, "grad_norm": 2.308818850381563, "learning_rate": 1.3165798478369183e-08, "loss": 0.2747, "step": 3352 }, { "epoch": 0.95, "grad_norm": 2.2932563254591054, "learning_rate": 1.3017775338415638e-08, "loss": 0.2804, "step": 3353 }, { "epoch": 0.95, "grad_norm": 2.525942716214574, "learning_rate": 1.287058357018278e-08, "loss": 0.2755, "step": 3354 }, { "epoch": 0.95, "grad_norm": 2.4985923891428476, "learning_rate": 1.2724223297655878e-08, "loss": 0.3062, "step": 3355 }, { "epoch": 0.95, "grad_norm": 2.356662166585638, "learning_rate": 1.2578694644119425e-08, "loss": 0.2611, "step": 3356 }, { "epoch": 0.95, "grad_norm": 2.3775670454411615, "learning_rate": 1.2433997732157586e-08, "loss": 0.2657, "step": 3357 }, { "epoch": 0.95, "grad_norm": 2.050051946828104, "learning_rate": 1.2290132683654086e-08, "loss": 0.2086, "step": 3358 }, { "epoch": 0.95, "grad_norm": 2.5458546624396257, "learning_rate": 1.2147099619791767e-08, "loss": 0.2822, "step": 3359 }, { "epoch": 0.95, "grad_norm": 2.415412671153464, "learning_rate": 1.2004898661052588e-08, "loss": 0.2783, "step": 3360 }, { "epoch": 0.95, "grad_norm": 2.2751172201041, "learning_rate": 1.186352992721773e-08, "loss": 0.2502, "step": 3361 }, { "epoch": 0.95, "grad_norm": 2.3132166812169714, "learning_rate": 1.1722993537367277e-08, "loss": 0.2829, "step": 3362 }, { "epoch": 0.95, "grad_norm": 2.5768434717911473, "learning_rate": 1.1583289609880308e-08, "loss": 0.2876, "step": 3363 }, { "epoch": 0.95, "grad_norm": 2.3452615889347994, "learning_rate": 1.1444418262434586e-08, "loss": 0.2531, "step": 3364 }, { "epoch": 0.95, "grad_norm": 2.4335383089249945, "learning_rate": 1.1306379612006645e-08, "loss": 0.2877, "step": 3365 }, { "epoch": 0.95, "grad_norm": 2.2737448478494238, "learning_rate": 1.1169173774871477e-08, "loss": 0.2422, "step": 3366 }, { "epoch": 0.95, "grad_norm": 2.3643371162101996, "learning_rate": 1.1032800866602632e-08, "loss": 0.2475, "step": 3367 }, { "epoch": 0.95, "grad_norm": 2.308016964987461, "learning_rate": 1.0897261002072222e-08, "loss": 0.2385, "step": 3368 }, { "epoch": 0.95, "grad_norm": 2.396351551587896, "learning_rate": 1.0762554295450366e-08, "loss": 0.2635, "step": 3369 }, { "epoch": 0.95, "grad_norm": 2.4150045202475745, "learning_rate": 1.0628680860205518e-08, "loss": 0.2565, "step": 3370 }, { "epoch": 0.96, "grad_norm": 2.2957529859487034, "learning_rate": 1.0495640809104256e-08, "loss": 0.2808, "step": 3371 }, { "epoch": 0.96, "grad_norm": 2.441843889242134, "learning_rate": 1.0363434254211268e-08, "loss": 0.2541, "step": 3372 }, { "epoch": 0.96, "grad_norm": 2.4618524049071153, "learning_rate": 1.0232061306888917e-08, "loss": 0.2706, "step": 3373 }, { "epoch": 0.96, "grad_norm": 2.5357395909750866, "learning_rate": 1.0101522077797352e-08, "loss": 0.2733, "step": 3374 }, { "epoch": 0.96, "grad_norm": 2.619183803188383, "learning_rate": 9.97181667689495e-09, "loss": 0.2894, "step": 3375 }, { "epoch": 0.96, "grad_norm": 2.534613440279317, "learning_rate": 9.842945213437092e-09, "loss": 0.2671, "step": 3376 }, { "epoch": 0.96, "grad_norm": 2.230444791490229, "learning_rate": 9.714907795977168e-09, "loss": 0.2841, "step": 3377 }, { "epoch": 0.96, "grad_norm": 2.7079530309001902, "learning_rate": 9.587704532365681e-09, "loss": 0.2649, "step": 3378 }, { "epoch": 0.96, "grad_norm": 2.4746543060966806, "learning_rate": 9.461335529750814e-09, "loss": 0.2536, "step": 3379 }, { "epoch": 0.96, "grad_norm": 2.474784362397472, "learning_rate": 9.33580089457786e-09, "loss": 0.2703, "step": 3380 }, { "epoch": 0.96, "grad_norm": 2.449321332065921, "learning_rate": 9.211100732589127e-09, "loss": 0.278, "step": 3381 }, { "epoch": 0.96, "grad_norm": 2.3510232296452678, "learning_rate": 9.087235148824368e-09, "loss": 0.2727, "step": 3382 }, { "epoch": 0.96, "grad_norm": 2.2950727777946143, "learning_rate": 8.964204247620011e-09, "loss": 0.2902, "step": 3383 }, { "epoch": 0.96, "grad_norm": 2.4002683065846058, "learning_rate": 8.842008132609602e-09, "loss": 0.2745, "step": 3384 }, { "epoch": 0.96, "grad_norm": 2.462639292571834, "learning_rate": 8.720646906723583e-09, "loss": 0.2739, "step": 3385 }, { "epoch": 0.96, "grad_norm": 2.419294993393598, "learning_rate": 8.600120672188738e-09, "loss": 0.2354, "step": 3386 }, { "epoch": 0.96, "grad_norm": 2.2659319854149955, "learning_rate": 8.480429530529076e-09, "loss": 0.2479, "step": 3387 }, { "epoch": 0.96, "grad_norm": 4.2481201180611, "learning_rate": 8.361573582564729e-09, "loss": 0.2959, "step": 3388 }, { "epoch": 0.96, "grad_norm": 2.3327587692717473, "learning_rate": 8.2435529284125e-09, "loss": 0.2548, "step": 3389 }, { "epoch": 0.96, "grad_norm": 3.024764248268901, "learning_rate": 8.126367667485534e-09, "loss": 0.291, "step": 3390 }, { "epoch": 0.96, "grad_norm": 2.133213647977136, "learning_rate": 8.010017898493315e-09, "loss": 0.229, "step": 3391 }, { "epoch": 0.96, "grad_norm": 2.2073354750539815, "learning_rate": 7.89450371944167e-09, "loss": 0.2341, "step": 3392 }, { "epoch": 0.96, "grad_norm": 2.3459203903671693, "learning_rate": 7.779825227632319e-09, "loss": 0.2735, "step": 3393 }, { "epoch": 0.96, "grad_norm": 2.2982394761984124, "learning_rate": 7.665982519663327e-09, "loss": 0.2733, "step": 3394 }, { "epoch": 0.96, "grad_norm": 2.4779060772165153, "learning_rate": 7.552975691428654e-09, "loss": 0.2992, "step": 3395 }, { "epoch": 0.96, "grad_norm": 2.3653518625493897, "learning_rate": 7.440804838117931e-09, "loss": 0.2544, "step": 3396 }, { "epoch": 0.96, "grad_norm": 2.3191375340363085, "learning_rate": 7.329470054217024e-09, "loss": 0.2596, "step": 3397 }, { "epoch": 0.96, "grad_norm": 2.233643860181891, "learning_rate": 7.21897143350747e-09, "loss": 0.2291, "step": 3398 }, { "epoch": 0.96, "grad_norm": 3.645260025143113, "learning_rate": 7.109309069065928e-09, "loss": 0.2604, "step": 3399 }, { "epoch": 0.96, "grad_norm": 2.2409004059704185, "learning_rate": 7.000483053265505e-09, "loss": 0.264, "step": 3400 }, { "epoch": 0.96, "grad_norm": 2.0328585106470216, "learning_rate": 6.892493477774097e-09, "loss": 0.245, "step": 3401 }, { "epoch": 0.96, "grad_norm": 2.4964028149676083, "learning_rate": 6.7853404335554974e-09, "loss": 0.2678, "step": 3402 }, { "epoch": 0.96, "grad_norm": 2.558672079681962, "learning_rate": 6.679024010868617e-09, "loss": 0.3041, "step": 3403 }, { "epoch": 0.96, "grad_norm": 2.40141182727304, "learning_rate": 6.573544299267708e-09, "loss": 0.2448, "step": 3404 }, { "epoch": 0.96, "grad_norm": 2.516652123088787, "learning_rate": 6.468901387602366e-09, "loss": 0.3198, "step": 3405 }, { "epoch": 0.97, "grad_norm": 2.3091424744946814, "learning_rate": 6.36509536401697e-09, "loss": 0.2587, "step": 3406 }, { "epoch": 0.97, "grad_norm": 2.3951209136151093, "learning_rate": 6.262126315951355e-09, "loss": 0.2814, "step": 3407 }, { "epoch": 0.97, "grad_norm": 2.313803926109103, "learning_rate": 6.159994330140139e-09, "loss": 0.2694, "step": 3408 }, { "epoch": 0.97, "grad_norm": 2.5797502248297772, "learning_rate": 6.0586994926128396e-09, "loss": 0.2871, "step": 3409 }, { "epoch": 0.97, "grad_norm": 2.359256899842382, "learning_rate": 5.958241888693871e-09, "loss": 0.2953, "step": 3410 }, { "epoch": 0.97, "grad_norm": 2.35053897730414, "learning_rate": 5.858621603002434e-09, "loss": 0.2837, "step": 3411 }, { "epoch": 0.97, "grad_norm": 2.329705678442523, "learning_rate": 5.7598387194524035e-09, "loss": 0.2842, "step": 3412 }, { "epoch": 0.97, "grad_norm": 2.2477466985107877, "learning_rate": 5.66189332125222e-09, "loss": 0.2917, "step": 3413 }, { "epoch": 0.97, "grad_norm": 2.255582150057296, "learning_rate": 5.564785490904778e-09, "loss": 0.2669, "step": 3414 }, { "epoch": 0.97, "grad_norm": 2.3365220893647405, "learning_rate": 5.468515310207866e-09, "loss": 0.2792, "step": 3415 }, { "epoch": 0.97, "grad_norm": 2.489205245133401, "learning_rate": 5.373082860253286e-09, "loss": 0.2997, "step": 3416 }, { "epoch": 0.97, "grad_norm": 2.290026567655139, "learning_rate": 5.278488221427402e-09, "loss": 0.2529, "step": 3417 }, { "epoch": 0.97, "grad_norm": 2.2195804476343555, "learning_rate": 5.184731473410697e-09, "loss": 0.2438, "step": 3418 }, { "epoch": 0.97, "grad_norm": 2.440156596286958, "learning_rate": 5.0918126951779995e-09, "loss": 0.2575, "step": 3419 }, { "epoch": 0.97, "grad_norm": 2.277947551672406, "learning_rate": 4.999731964998255e-09, "loss": 0.2365, "step": 3420 }, { "epoch": 0.97, "grad_norm": 2.1878020075418747, "learning_rate": 4.90848936043442e-09, "loss": 0.2642, "step": 3421 }, { "epoch": 0.97, "grad_norm": 2.3846931366612654, "learning_rate": 4.818084958343571e-09, "loss": 0.2614, "step": 3422 }, { "epoch": 0.97, "grad_norm": 2.3490608640052697, "learning_rate": 4.728518834876683e-09, "loss": 0.2779, "step": 3423 }, { "epoch": 0.97, "grad_norm": 2.631539271095303, "learning_rate": 4.639791065478737e-09, "loss": 0.2739, "step": 3424 }, { "epoch": 0.97, "grad_norm": 2.2169767571560963, "learning_rate": 4.551901724888063e-09, "loss": 0.2424, "step": 3425 }, { "epoch": 0.97, "grad_norm": 2.5267588387109967, "learning_rate": 4.46485088713755e-09, "loss": 0.2745, "step": 3426 }, { "epoch": 0.97, "grad_norm": 2.516514525166744, "learning_rate": 4.378638625553099e-09, "loss": 0.2924, "step": 3427 }, { "epoch": 0.97, "grad_norm": 2.8289199823796185, "learning_rate": 4.29326501275451e-09, "loss": 0.2984, "step": 3428 }, { "epoch": 0.97, "grad_norm": 2.37941989685982, "learning_rate": 4.208730120655257e-09, "loss": 0.272, "step": 3429 }, { "epoch": 0.97, "grad_norm": 2.3135649095456374, "learning_rate": 4.125034020461937e-09, "loss": 0.2414, "step": 3430 }, { "epoch": 0.97, "grad_norm": 2.343939860797385, "learning_rate": 4.042176782675266e-09, "loss": 0.2724, "step": 3431 }, { "epoch": 0.97, "grad_norm": 2.89142975164576, "learning_rate": 3.9601584770887485e-09, "loss": 0.2475, "step": 3432 }, { "epoch": 0.97, "grad_norm": 2.4270415644781242, "learning_rate": 3.878979172789454e-09, "loss": 0.2627, "step": 3433 }, { "epoch": 0.97, "grad_norm": 2.2244448517637934, "learning_rate": 3.798638938157683e-09, "loss": 0.2423, "step": 3434 }, { "epoch": 0.97, "grad_norm": 2.5572297378768036, "learning_rate": 3.7191378408670817e-09, "loss": 0.2761, "step": 3435 }, { "epoch": 0.97, "grad_norm": 2.4173689424138356, "learning_rate": 3.640475947884303e-09, "loss": 0.2652, "step": 3436 }, { "epoch": 0.97, "grad_norm": 2.3181700082518804, "learning_rate": 3.562653325469345e-09, "loss": 0.2339, "step": 3437 }, { "epoch": 0.97, "grad_norm": 2.358757031542546, "learning_rate": 3.4856700391748817e-09, "loss": 0.2793, "step": 3438 }, { "epoch": 0.97, "grad_norm": 2.4888349477689635, "learning_rate": 3.40952615384682e-09, "loss": 0.2668, "step": 3439 }, { "epoch": 0.97, "grad_norm": 2.4841252669275717, "learning_rate": 3.3342217336239653e-09, "loss": 0.2661, "step": 3440 }, { "epoch": 0.97, "grad_norm": 2.535579975411954, "learning_rate": 3.2597568419382437e-09, "loss": 0.2823, "step": 3441 }, { "epoch": 0.98, "grad_norm": 2.446986754997376, "learning_rate": 3.1861315415139257e-09, "loss": 0.2666, "step": 3442 }, { "epoch": 0.98, "grad_norm": 2.304931837380515, "learning_rate": 3.113345894368402e-09, "loss": 0.2594, "step": 3443 }, { "epoch": 0.98, "grad_norm": 2.359112779339876, "learning_rate": 3.0413999618117415e-09, "loss": 0.2738, "step": 3444 }, { "epoch": 0.98, "grad_norm": 2.3442231737629338, "learning_rate": 2.9702938044467994e-09, "loss": 0.2618, "step": 3445 }, { "epoch": 0.98, "grad_norm": 2.872008136124653, "learning_rate": 2.9000274821687765e-09, "loss": 0.2378, "step": 3446 }, { "epoch": 0.98, "grad_norm": 2.393992867182129, "learning_rate": 2.830601054165549e-09, "loss": 0.2688, "step": 3447 }, { "epoch": 0.98, "grad_norm": 2.4955852552862514, "learning_rate": 2.7620145789177816e-09, "loss": 0.3084, "step": 3448 }, { "epoch": 0.98, "grad_norm": 2.394092561435307, "learning_rate": 2.6942681141981506e-09, "loss": 0.2509, "step": 3449 }, { "epoch": 0.98, "grad_norm": 2.2975660397285913, "learning_rate": 2.6273617170722295e-09, "loss": 0.2669, "step": 3450 }, { "epoch": 0.98, "grad_norm": 2.2922318671409196, "learning_rate": 2.5612954438977154e-09, "loss": 0.2503, "step": 3451 }, { "epoch": 0.98, "grad_norm": 2.271404899827809, "learning_rate": 2.4960693503245367e-09, "loss": 0.2689, "step": 3452 }, { "epoch": 0.98, "grad_norm": 2.173109962410258, "learning_rate": 2.4316834912951887e-09, "loss": 0.2373, "step": 3453 }, { "epoch": 0.98, "grad_norm": 3.277192362094739, "learning_rate": 2.3681379210442885e-09, "loss": 0.255, "step": 3454 }, { "epoch": 0.98, "grad_norm": 2.419018625423423, "learning_rate": 2.3054326930984636e-09, "loss": 0.2867, "step": 3455 }, { "epoch": 0.98, "grad_norm": 2.704302423417181, "learning_rate": 2.243567860276796e-09, "loss": 0.296, "step": 3456 }, { "epoch": 0.98, "grad_norm": 2.397799847013115, "learning_rate": 2.1825434746903793e-09, "loss": 0.2527, "step": 3457 }, { "epoch": 0.98, "grad_norm": 2.35412657731211, "learning_rate": 2.1223595877420953e-09, "loss": 0.2718, "step": 3458 }, { "epoch": 0.98, "grad_norm": 2.6890571841517277, "learning_rate": 2.0630162501272806e-09, "loss": 0.2542, "step": 3459 }, { "epoch": 0.98, "grad_norm": 2.438948016197178, "learning_rate": 2.0045135118328394e-09, "loss": 0.2801, "step": 3460 }, { "epoch": 0.98, "grad_norm": 2.267259101241706, "learning_rate": 1.946851422138018e-09, "loss": 0.2311, "step": 3461 }, { "epoch": 0.98, "grad_norm": 2.551728847298024, "learning_rate": 1.890030029613521e-09, "loss": 0.2799, "step": 3462 }, { "epoch": 0.98, "grad_norm": 2.479703785478036, "learning_rate": 1.8340493821222824e-09, "loss": 0.2788, "step": 3463 }, { "epoch": 0.98, "grad_norm": 2.6460667172496315, "learning_rate": 1.7789095268188058e-09, "loss": 0.2908, "step": 3464 }, { "epoch": 0.98, "grad_norm": 2.5203858422375642, "learning_rate": 1.7246105101493825e-09, "loss": 0.3041, "step": 3465 }, { "epoch": 0.98, "grad_norm": 2.2142928954216696, "learning_rate": 1.671152377852092e-09, "loss": 0.2428, "step": 3466 }, { "epoch": 0.98, "grad_norm": 2.265712772154652, "learning_rate": 1.6185351749569142e-09, "loss": 0.259, "step": 3467 }, { "epoch": 0.98, "grad_norm": 2.2995218830476962, "learning_rate": 1.5667589457849516e-09, "loss": 0.2413, "step": 3468 }, { "epoch": 0.98, "grad_norm": 3.1040150036059075, "learning_rate": 1.5158237339494283e-09, "loss": 0.234, "step": 3469 }, { "epoch": 0.98, "grad_norm": 2.3875856781933744, "learning_rate": 1.4657295823549132e-09, "loss": 0.2668, "step": 3470 }, { "epoch": 0.98, "grad_norm": 2.462460581054891, "learning_rate": 1.4164765331976525e-09, "loss": 0.2631, "step": 3471 }, { "epoch": 0.98, "grad_norm": 2.4861816892269157, "learning_rate": 1.3680646279651265e-09, "loss": 0.2766, "step": 3472 }, { "epoch": 0.98, "grad_norm": 2.455987767118866, "learning_rate": 1.320493907436604e-09, "loss": 0.2293, "step": 3473 }, { "epoch": 0.98, "grad_norm": 2.7477765197169726, "learning_rate": 1.2737644116826985e-09, "loss": 0.305, "step": 3474 }, { "epoch": 0.98, "grad_norm": 2.2249571672813317, "learning_rate": 1.227876180065368e-09, "loss": 0.258, "step": 3475 }, { "epoch": 0.98, "grad_norm": 2.2543225771684656, "learning_rate": 1.1828292512380267e-09, "loss": 0.2526, "step": 3476 }, { "epoch": 0.99, "grad_norm": 2.3257233204429535, "learning_rate": 1.1386236631452107e-09, "loss": 0.2569, "step": 3477 }, { "epoch": 0.99, "grad_norm": 2.5350058521661056, "learning_rate": 1.095259453023023e-09, "loss": 0.2867, "step": 3478 }, { "epoch": 0.99, "grad_norm": 2.4593700491079966, "learning_rate": 1.0527366573986895e-09, "loss": 0.2921, "step": 3479 }, { "epoch": 0.99, "grad_norm": 2.2846641331816384, "learning_rate": 1.0110553120908915e-09, "loss": 0.2572, "step": 3480 }, { "epoch": 0.99, "grad_norm": 2.3702436683714185, "learning_rate": 9.70215452209211e-10, "loss": 0.2596, "step": 3481 }, { "epoch": 0.99, "grad_norm": 2.392817209094412, "learning_rate": 9.302171121546853e-10, "loss": 0.2476, "step": 3482 }, { "epoch": 0.99, "grad_norm": 2.616727121844519, "learning_rate": 8.910603256192529e-10, "loss": 0.2712, "step": 3483 }, { "epoch": 0.99, "grad_norm": 2.5477779958428575, "learning_rate": 8.527451255863071e-10, "loss": 0.2786, "step": 3484 }, { "epoch": 0.99, "grad_norm": 2.42618092930159, "learning_rate": 8.152715443300318e-10, "loss": 0.2721, "step": 3485 }, { "epoch": 0.99, "grad_norm": 2.5225716230160917, "learning_rate": 7.786396134158435e-10, "loss": 0.2834, "step": 3486 }, { "epoch": 0.99, "grad_norm": 2.614699044791809, "learning_rate": 7.42849363700282e-10, "loss": 0.2844, "step": 3487 }, { "epoch": 0.99, "grad_norm": 2.5011157795190915, "learning_rate": 7.079008253306762e-10, "loss": 0.2605, "step": 3488 }, { "epoch": 0.99, "grad_norm": 2.493778563490711, "learning_rate": 6.737940277454778e-10, "loss": 0.2774, "step": 3489 }, { "epoch": 0.99, "grad_norm": 2.257688702699588, "learning_rate": 6.405289996741503e-10, "loss": 0.2579, "step": 3490 }, { "epoch": 0.99, "grad_norm": 2.427775485003037, "learning_rate": 6.081057691370572e-10, "loss": 0.2593, "step": 3491 }, { "epoch": 0.99, "grad_norm": 2.2505185636951026, "learning_rate": 5.76524363445463e-10, "loss": 0.2815, "step": 3492 }, { "epoch": 0.99, "grad_norm": 2.4718683751201636, "learning_rate": 5.457848092015327e-10, "loss": 0.2612, "step": 3493 }, { "epoch": 0.99, "grad_norm": 2.3198523130298656, "learning_rate": 5.158871322984426e-10, "loss": 0.2698, "step": 3494 }, { "epoch": 0.99, "grad_norm": 2.232091467187299, "learning_rate": 4.868313579200479e-10, "loss": 0.2533, "step": 3495 }, { "epoch": 0.99, "grad_norm": 2.4747398852089417, "learning_rate": 4.5861751054110385e-10, "loss": 0.2753, "step": 3496 }, { "epoch": 0.99, "grad_norm": 2.5764286483014556, "learning_rate": 4.3124561392715584e-10, "loss": 0.2567, "step": 3497 }, { "epoch": 0.99, "grad_norm": 2.367029999731487, "learning_rate": 4.047156911345384e-10, "loss": 0.2753, "step": 3498 }, { "epoch": 0.99, "grad_norm": 2.440774884458288, "learning_rate": 3.7902776451048667e-10, "loss": 0.2955, "step": 3499 }, { "epoch": 0.99, "grad_norm": 2.2906056439515177, "learning_rate": 3.5418185569280337e-10, "loss": 0.2582, "step": 3500 }, { "epoch": 0.99, "grad_norm": 2.2754598342103325, "learning_rate": 3.3017798561030266e-10, "loss": 0.2427, "step": 3501 }, { "epoch": 0.99, "grad_norm": 2.401165101202187, "learning_rate": 3.070161744820332e-10, "loss": 0.2716, "step": 3502 }, { "epoch": 0.99, "grad_norm": 2.1729922241238833, "learning_rate": 2.846964418182773e-10, "loss": 0.2476, "step": 3503 }, { "epoch": 0.99, "grad_norm": 2.2677043074356003, "learning_rate": 2.632188064196628e-10, "loss": 0.2563, "step": 3504 }, { "epoch": 0.99, "grad_norm": 2.449392876606644, "learning_rate": 2.4258328637771776e-10, "loss": 0.2601, "step": 3505 }, { "epoch": 0.99, "grad_norm": 2.3055918692627793, "learning_rate": 2.22789899074427e-10, "loss": 0.2693, "step": 3506 }, { "epoch": 0.99, "grad_norm": 2.4329321525802556, "learning_rate": 2.0383866118245385e-10, "loss": 0.2406, "step": 3507 }, { "epoch": 0.99, "grad_norm": 2.2791612559079786, "learning_rate": 1.8572958866514e-10, "loss": 0.2761, "step": 3508 }, { "epoch": 0.99, "grad_norm": 2.560549908458607, "learning_rate": 1.684626967765057e-10, "loss": 0.235, "step": 3509 }, { "epoch": 0.99, "grad_norm": 2.5078118323749727, "learning_rate": 1.5203800006102774e-10, "loss": 0.3003, "step": 3510 }, { "epoch": 0.99, "grad_norm": 2.493354960056434, "learning_rate": 1.3645551235386133e-10, "loss": 0.2504, "step": 3511 }, { "epoch": 1.0, "grad_norm": 2.762856738714792, "learning_rate": 1.2171524678061818e-10, "loss": 0.2951, "step": 3512 }, { "epoch": 1.0, "grad_norm": 2.4196022508390054, "learning_rate": 1.0781721575781056e-10, "loss": 0.2877, "step": 3513 }, { "epoch": 1.0, "grad_norm": 2.1964914531229427, "learning_rate": 9.476143099207412e-11, "loss": 0.2594, "step": 3514 }, { "epoch": 1.0, "grad_norm": 2.3678613204157104, "learning_rate": 8.254790348072304e-11, "loss": 0.2815, "step": 3515 }, { "epoch": 1.0, "grad_norm": 2.3026631969295814, "learning_rate": 7.117664351186103e-11, "loss": 0.2821, "step": 3516 }, { "epoch": 1.0, "grad_norm": 2.560841032353236, "learning_rate": 6.06476606638262e-11, "loss": 0.2817, "step": 3517 }, { "epoch": 1.0, "grad_norm": 2.358566543205405, "learning_rate": 5.096096380552417e-11, "loss": 0.2727, "step": 3518 }, { "epoch": 1.0, "grad_norm": 2.47453393582181, "learning_rate": 4.211656109642803e-11, "loss": 0.2822, "step": 3519 }, { "epoch": 1.0, "grad_norm": 2.1677384205058012, "learning_rate": 3.411445998668938e-11, "loss": 0.2386, "step": 3520 }, { "epoch": 1.0, "grad_norm": 2.405772186235948, "learning_rate": 2.6954667216472217e-11, "loss": 0.2798, "step": 3521 }, { "epoch": 1.0, "grad_norm": 2.4163648834715747, "learning_rate": 2.063718881695209e-11, "loss": 0.2753, "step": 3522 }, { "epoch": 1.0, "grad_norm": 2.1951499438686906, "learning_rate": 1.516203010953898e-11, "loss": 0.2416, "step": 3523 }, { "epoch": 1.0, "grad_norm": 2.5347777261578788, "learning_rate": 1.0529195706099337e-11, "loss": 0.3174, "step": 3524 }, { "epoch": 1.0, "grad_norm": 2.303176321896944, "learning_rate": 6.738689509067086e-12, "loss": 0.2859, "step": 3525 }, { "epoch": 1.0, "grad_norm": 2.253837882658441, "learning_rate": 3.790514711332626e-12, "loss": 0.2613, "step": 3526 }, { "epoch": 1.0, "grad_norm": 2.5952689897785053, "learning_rate": 1.6846737963538415e-12, "loss": 0.2775, "step": 3527 }, { "epoch": 1.0, "grad_norm": 2.359674481777684, "learning_rate": 4.211685378230356e-13, "loss": 0.2754, "step": 3528 }, { "epoch": 1.0, "grad_norm": 2.4963844359266916, "learning_rate": 0.0, "loss": 0.2803, "step": 3529 }, { "epoch": 1.0, "step": 3529, "total_flos": 696559979986944.0, "train_loss": 0.3025049172588839, "train_runtime": 35270.0898, "train_samples_per_second": 3.202, "train_steps_per_second": 0.1 } ], "logging_steps": 1.0, "max_steps": 3529, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25000, "total_flos": 696559979986944.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }