|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.5845389449072045, |
|
"eval_steps": 500, |
|
"global_step": 1000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.001169077889814409, |
|
"grad_norm": 6.78125, |
|
"learning_rate": 3.898635477582846e-07, |
|
"loss": 5.0851, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.002338155779628818, |
|
"grad_norm": 7.0625, |
|
"learning_rate": 7.797270955165692e-07, |
|
"loss": 5.1921, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0035072336694432268, |
|
"grad_norm": 7.34375, |
|
"learning_rate": 1.1695906432748538e-06, |
|
"loss": 5.0756, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.004676311559257636, |
|
"grad_norm": 7.03125, |
|
"learning_rate": 1.5594541910331385e-06, |
|
"loss": 5.1097, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.005845389449072045, |
|
"grad_norm": 6.40625, |
|
"learning_rate": 1.949317738791423e-06, |
|
"loss": 5.0591, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0070144673388864535, |
|
"grad_norm": 5.8125, |
|
"learning_rate": 2.3391812865497075e-06, |
|
"loss": 5.096, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.008183545228700862, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 2.729044834307992e-06, |
|
"loss": 5.0234, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.009352623118515271, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 3.118908382066277e-06, |
|
"loss": 5.0266, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.01052170100832968, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 3.5087719298245615e-06, |
|
"loss": 5.031, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.01169077889814409, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 3.898635477582846e-06, |
|
"loss": 4.9845, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.012859856787958498, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 4.2884990253411305e-06, |
|
"loss": 4.9933, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.014028934677772907, |
|
"grad_norm": 4.625, |
|
"learning_rate": 4.678362573099415e-06, |
|
"loss": 5.0822, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.015198012567587316, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 5.0682261208576995e-06, |
|
"loss": 4.9392, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.016367090457401725, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 5.458089668615984e-06, |
|
"loss": 5.0441, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.017536168347216132, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 5.8479532163742686e-06, |
|
"loss": 4.8865, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.018705246237030543, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 6.237816764132554e-06, |
|
"loss": 4.9227, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.01987432412684495, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 6.6276803118908384e-06, |
|
"loss": 4.9035, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.02104340201665936, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 7.017543859649123e-06, |
|
"loss": 4.863, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.022212479906473768, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 7.4074074074074075e-06, |
|
"loss": 4.7735, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.02338155779628818, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 7.797270955165692e-06, |
|
"loss": 4.7932, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.024550635686102586, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 8.187134502923977e-06, |
|
"loss": 4.6933, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.025719713575916996, |
|
"grad_norm": 7.15625, |
|
"learning_rate": 8.576998050682261e-06, |
|
"loss": 4.5906, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.026888791465731404, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 8.966861598440546e-06, |
|
"loss": 4.4291, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.028057869355545814, |
|
"grad_norm": 9.6875, |
|
"learning_rate": 9.35672514619883e-06, |
|
"loss": 4.221, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.02922694724536022, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 9.746588693957115e-06, |
|
"loss": 4.032, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.030396025135174632, |
|
"grad_norm": 9.0, |
|
"learning_rate": 1.0136452241715399e-05, |
|
"loss": 3.8603, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.03156510302498904, |
|
"grad_norm": 7.46875, |
|
"learning_rate": 1.0526315789473684e-05, |
|
"loss": 3.6522, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.03273418091480345, |
|
"grad_norm": 8.375, |
|
"learning_rate": 1.0916179337231968e-05, |
|
"loss": 3.4394, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.03390325880461786, |
|
"grad_norm": 6.4375, |
|
"learning_rate": 1.1306042884990253e-05, |
|
"loss": 3.2265, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.035072336694432264, |
|
"grad_norm": 5.75, |
|
"learning_rate": 1.1695906432748537e-05, |
|
"loss": 3.091, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.03624141458424668, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 1.2085769980506823e-05, |
|
"loss": 2.8856, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.037410492474061086, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 1.2475633528265108e-05, |
|
"loss": 2.6299, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.03857957036387549, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 1.2865497076023392e-05, |
|
"loss": 2.5917, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.0397486482536899, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 1.3255360623781677e-05, |
|
"loss": 2.3757, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.040917726143504314, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.364522417153996e-05, |
|
"loss": 2.308, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.04208680403331872, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.4035087719298246e-05, |
|
"loss": 2.3042, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.04325588192313313, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.442495126705653e-05, |
|
"loss": 2.2149, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.044424959812947536, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.4814814814814815e-05, |
|
"loss": 2.0738, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.04559403770276195, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.5204678362573099e-05, |
|
"loss": 1.9663, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.04676311559257636, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.5594541910331384e-05, |
|
"loss": 1.9384, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.047932193482390764, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.5984405458089668e-05, |
|
"loss": 1.894, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.04910127137220517, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.6374269005847955e-05, |
|
"loss": 1.8204, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.050270349262019585, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1.676413255360624e-05, |
|
"loss": 1.8051, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.05143942715183399, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.7153996101364522e-05, |
|
"loss": 1.5832, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.0526085050416484, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.7543859649122806e-05, |
|
"loss": 1.5298, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.05377758293146281, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.7933723196881093e-05, |
|
"loss": 1.4964, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.054946660821277214, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 1.8323586744639376e-05, |
|
"loss": 1.3987, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.05611573871109163, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.871345029239766e-05, |
|
"loss": 1.3959, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.057284816600906036, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 1.9103313840155944e-05, |
|
"loss": 1.3673, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.05845389449072044, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 1.949317738791423e-05, |
|
"loss": 1.2864, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05962297238053485, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 1.9883040935672515e-05, |
|
"loss": 1.2535, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.060792050270349264, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 2.0272904483430798e-05, |
|
"loss": 1.2242, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.06196112816016367, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 2.0662768031189085e-05, |
|
"loss": 1.1936, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.06313020604997809, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 2.105263157894737e-05, |
|
"loss": 1.1858, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.06429928393979249, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 2.1442495126705653e-05, |
|
"loss": 1.1438, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.0654683618296069, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 2.1832358674463936e-05, |
|
"loss": 1.1217, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.06663743971942131, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 2.2222222222222223e-05, |
|
"loss": 1.1508, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.06780651760923571, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 2.2612085769980507e-05, |
|
"loss": 1.0976, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.06897559549905012, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 2.300194931773879e-05, |
|
"loss": 1.0716, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.07014467338886453, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 2.3391812865497074e-05, |
|
"loss": 1.0152, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.07131375127867894, |
|
"grad_norm": 3.125, |
|
"learning_rate": 2.378167641325536e-05, |
|
"loss": 1.0265, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.07248282916849336, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 2.4171539961013645e-05, |
|
"loss": 1.2355, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.07365190705830776, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 2.456140350877193e-05, |
|
"loss": 1.0111, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.07482098494812217, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 2.4951267056530216e-05, |
|
"loss": 1.2317, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.07599006283793658, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 2.53411306042885e-05, |
|
"loss": 0.9988, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.07715914072775099, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 2.5730994152046783e-05, |
|
"loss": 1.0076, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.07832821861756539, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 2.6120857699805067e-05, |
|
"loss": 1.0155, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.0794972965073798, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 2.6510721247563354e-05, |
|
"loss": 1.1216, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.08066637439719421, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 2.6900584795321637e-05, |
|
"loss": 0.9788, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.08183545228700863, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 2.729044834307992e-05, |
|
"loss": 0.9372, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.08300453017682304, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 2.7680311890838205e-05, |
|
"loss": 0.897, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.08417360806663744, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 2.8070175438596492e-05, |
|
"loss": 0.9766, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.08534268595645185, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 2.8460038986354776e-05, |
|
"loss": 0.9315, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.08651176384626626, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 2.884990253411306e-05, |
|
"loss": 0.9425, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.08768084173608066, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 2.9239766081871346e-05, |
|
"loss": 0.9242, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.08884991962589507, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 2.962962962962963e-05, |
|
"loss": 0.9396, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.09001899751570948, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 3.0019493177387914e-05, |
|
"loss": 0.8932, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.0911880754055239, |
|
"grad_norm": 72.5, |
|
"learning_rate": 3.0409356725146197e-05, |
|
"loss": 0.9062, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.0923571532953383, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 3.0799220272904484e-05, |
|
"loss": 0.9534, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.09352623118515271, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 3.118908382066277e-05, |
|
"loss": 0.8715, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.09469530907496712, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 3.157894736842105e-05, |
|
"loss": 0.9029, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.09586438696478153, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 3.1968810916179335e-05, |
|
"loss": 0.8968, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.09703346485459594, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 3.235867446393762e-05, |
|
"loss": 0.9784, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.09820254274441034, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 3.274853801169591e-05, |
|
"loss": 0.9012, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.09937162063422475, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 3.313840155945419e-05, |
|
"loss": 0.8895, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.10054069852403917, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 3.352826510721248e-05, |
|
"loss": 0.8858, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.10170977641385358, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 3.391812865497076e-05, |
|
"loss": 0.8408, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.10287885430366799, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 3.4307992202729044e-05, |
|
"loss": 0.8896, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.10404793219348239, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 3.469785575048733e-05, |
|
"loss": 0.8941, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.1052170100832968, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 3.508771929824561e-05, |
|
"loss": 0.9013, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.1063860879731112, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 3.5477582846003895e-05, |
|
"loss": 0.8585, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.10755516586292561, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 3.5867446393762186e-05, |
|
"loss": 1.0231, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.10872424375274002, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 3.625730994152047e-05, |
|
"loss": 0.8835, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.10989332164255443, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 3.664717348927875e-05, |
|
"loss": 0.8465, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.11106239953236885, |
|
"grad_norm": 2.75, |
|
"learning_rate": 3.7037037037037037e-05, |
|
"loss": 1.1014, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.11223147742218326, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 3.742690058479532e-05, |
|
"loss": 0.8691, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.11340055531199766, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 3.7816764132553604e-05, |
|
"loss": 0.8547, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.11456963320181207, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 3.820662768031189e-05, |
|
"loss": 0.8725, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.11573871109162648, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 3.859649122807018e-05, |
|
"loss": 0.8393, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.11690778898144089, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 3.898635477582846e-05, |
|
"loss": 0.8345, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.11807686687125529, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 3.9376218323586745e-05, |
|
"loss": 0.8966, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.1192459447610697, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 3.976608187134503e-05, |
|
"loss": 0.8291, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.12041502265088412, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 4.015594541910331e-05, |
|
"loss": 0.8175, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.12158410054069853, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 4.0545808966861596e-05, |
|
"loss": 0.8353, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.12275317843051294, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 4.093567251461988e-05, |
|
"loss": 1.0084, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.12392225632032734, |
|
"grad_norm": 59.75, |
|
"learning_rate": 4.132553606237817e-05, |
|
"loss": 0.8478, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.12509133421014176, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 4.1715399610136454e-05, |
|
"loss": 0.8267, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.12626041209995617, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 4.210526315789474e-05, |
|
"loss": 0.836, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.12742948998977058, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 4.249512670565302e-05, |
|
"loss": 0.857, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.12859856787958499, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 4.2884990253411305e-05, |
|
"loss": 0.8478, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.1297676457693994, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 4.327485380116959e-05, |
|
"loss": 0.844, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.1309367236592138, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 4.366471734892787e-05, |
|
"loss": 0.8128, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.1321058015490282, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 4.4054580896686156e-05, |
|
"loss": 0.8276, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.13327487943884261, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 4.4444444444444447e-05, |
|
"loss": 0.8478, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.13444395732865702, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 4.483430799220273e-05, |
|
"loss": 0.814, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.13561303521847143, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 4.5224171539961014e-05, |
|
"loss": 0.765, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.13678211310828584, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 4.56140350877193e-05, |
|
"loss": 0.8277, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.13795119099810024, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 4.600389863547758e-05, |
|
"loss": 0.8103, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.13912026888791465, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 4.6393762183235865e-05, |
|
"loss": 0.7897, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.14028934677772906, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 4.678362573099415e-05, |
|
"loss": 0.7798, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.14145842466754346, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 4.717348927875244e-05, |
|
"loss": 0.8442, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.14262750255735787, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 4.756335282651072e-05, |
|
"loss": 0.7936, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.1437965804471723, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 4.7953216374269006e-05, |
|
"loss": 0.7726, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.1449656583369867, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 4.834307992202729e-05, |
|
"loss": 0.9626, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.14613473622680112, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 4.8732943469785574e-05, |
|
"loss": 0.7929, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.14730381411661553, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 4.912280701754386e-05, |
|
"loss": 0.8589, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.14847289200642994, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 4.951267056530214e-05, |
|
"loss": 0.7383, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.14964196989624434, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 4.990253411306043e-05, |
|
"loss": 0.8011, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.15081104778605875, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 5.0292397660818715e-05, |
|
"loss": 0.8404, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.15198012567587316, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 5.0682261208577e-05, |
|
"loss": 0.8301, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.15314920356568756, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 5.107212475633528e-05, |
|
"loss": 0.8157, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.15431828145550197, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 5.1461988304093566e-05, |
|
"loss": 0.7724, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.15548735934531638, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 5.185185185185185e-05, |
|
"loss": 0.7586, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.15665643723513079, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 5.2241715399610133e-05, |
|
"loss": 0.7812, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.1578255151249452, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 5.2631578947368424e-05, |
|
"loss": 0.7486, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.1589945930147596, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 5.302144249512671e-05, |
|
"loss": 0.7339, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.160163670904574, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 5.341130604288499e-05, |
|
"loss": 0.8381, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.16133274879438841, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 5.3801169590643275e-05, |
|
"loss": 0.774, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.16250182668420282, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 5.419103313840156e-05, |
|
"loss": 0.8114, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.16367090457401726, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 5.458089668615984e-05, |
|
"loss": 0.7549, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.16483998246383166, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 5.4970760233918126e-05, |
|
"loss": 0.7393, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.16600906035364607, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 5.536062378167641e-05, |
|
"loss": 0.7607, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.16717813824346048, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 5.57504873294347e-05, |
|
"loss": 0.8158, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.16834721613327489, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 5.6140350877192984e-05, |
|
"loss": 0.7403, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.1695162940230893, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 5.653021442495127e-05, |
|
"loss": 0.7278, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.1706853719129037, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 5.692007797270955e-05, |
|
"loss": 0.821, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.1718544498027181, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 5.7309941520467835e-05, |
|
"loss": 0.8276, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.1730235276925325, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 5.769980506822612e-05, |
|
"loss": 0.8505, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.17419260558234692, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 5.80896686159844e-05, |
|
"loss": 0.7941, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.17536168347216133, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 5.847953216374269e-05, |
|
"loss": 0.8621, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.17653076136197574, |
|
"grad_norm": 2.25, |
|
"learning_rate": 5.8869395711500976e-05, |
|
"loss": 0.8071, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.17769983925179014, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 5.925925925925926e-05, |
|
"loss": 0.803, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.17886891714160455, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 5.9649122807017544e-05, |
|
"loss": 0.7569, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.18003799503141896, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 6.003898635477583e-05, |
|
"loss": 0.7722, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.18120707292123336, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 6.042884990253411e-05, |
|
"loss": 0.7746, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.1823761508110478, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 6.0818713450292395e-05, |
|
"loss": 0.7292, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.1835452287008622, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 6.120857699805068e-05, |
|
"loss": 0.7751, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.1847143065906766, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 6.159844054580897e-05, |
|
"loss": 0.7473, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.18588338448049102, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 6.198830409356725e-05, |
|
"loss": 0.7718, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.18705246237030543, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 6.237816764132554e-05, |
|
"loss": 0.7812, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.18822154026011984, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 6.276803118908382e-05, |
|
"loss": 0.7552, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.18939061814993424, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 6.31578947368421e-05, |
|
"loss": 0.7244, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.19055969603974865, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 6.354775828460039e-05, |
|
"loss": 0.8182, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.19172877392956306, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 6.393762183235867e-05, |
|
"loss": 0.7578, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.19289785181937746, |
|
"grad_norm": 0.875, |
|
"learning_rate": 6.432748538011695e-05, |
|
"loss": 0.8755, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.19406692970919187, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 6.471734892787524e-05, |
|
"loss": 0.7436, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.19523600759900628, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 6.510721247563352e-05, |
|
"loss": 0.7415, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.19640508548882069, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 6.549707602339182e-05, |
|
"loss": 0.8082, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.1975741633786351, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 6.58869395711501e-05, |
|
"loss": 0.7226, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.1987432412684495, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 6.627680311890839e-05, |
|
"loss": 0.7408, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.1999123191582639, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 0.7752, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.20108139704807834, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 6.705653021442495e-05, |
|
"loss": 0.7313, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.20225047493789275, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 6.744639376218324e-05, |
|
"loss": 0.7164, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.20341955282770716, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 6.783625730994152e-05, |
|
"loss": 0.7836, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.20458863071752156, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 6.82261208576998e-05, |
|
"loss": 0.8936, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.20575770860733597, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 6.861598440545809e-05, |
|
"loss": 0.7386, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.20692678649715038, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 6.900584795321637e-05, |
|
"loss": 0.8893, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.20809586438696479, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 6.939571150097466e-05, |
|
"loss": 0.715, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.2092649422767792, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 6.978557504873294e-05, |
|
"loss": 0.7087, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.2104340201665936, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 7.017543859649122e-05, |
|
"loss": 0.7067, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.211603098056408, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 7.05653021442495e-05, |
|
"loss": 0.6979, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.2127721759462224, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 7.095516569200779e-05, |
|
"loss": 0.8533, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.21394125383603682, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 7.134502923976609e-05, |
|
"loss": 0.6971, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.21511033172585123, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 7.173489278752437e-05, |
|
"loss": 0.7152, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.21627940961566564, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 7.212475633528265e-05, |
|
"loss": 0.7176, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.21744848750548004, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 7.251461988304094e-05, |
|
"loss": 0.7054, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.21861756539529445, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 7.290448343079922e-05, |
|
"loss": 0.7157, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.21978664328510886, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 7.32943469785575e-05, |
|
"loss": 0.742, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.2209557211749233, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 7.368421052631579e-05, |
|
"loss": 0.7016, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.2221247990647377, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 7.407407407407407e-05, |
|
"loss": 0.7263, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.2232938769545521, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 7.446393762183236e-05, |
|
"loss": 0.7386, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.2244629548443665, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 7.485380116959064e-05, |
|
"loss": 0.7488, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.22563203273418092, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 7.524366471734892e-05, |
|
"loss": 0.7304, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.22680111062399533, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 7.563352826510721e-05, |
|
"loss": 0.6926, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.22797018851380974, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 7.602339181286549e-05, |
|
"loss": 0.7258, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.22913926640362414, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 7.641325536062378e-05, |
|
"loss": 0.7169, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.23030834429343855, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 7.680311890838207e-05, |
|
"loss": 0.6962, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.23147742218325296, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 7.719298245614036e-05, |
|
"loss": 0.718, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.23264650007306736, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 7.758284600389864e-05, |
|
"loss": 0.7106, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.23381557796288177, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 7.797270955165692e-05, |
|
"loss": 0.739, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.23498465585269618, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 7.836257309941521e-05, |
|
"loss": 0.7037, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.23615373374251059, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 7.875243664717349e-05, |
|
"loss": 0.717, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.237322811632325, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 7.914230019493177e-05, |
|
"loss": 0.7041, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.2384918895221394, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 7.953216374269006e-05, |
|
"loss": 0.7016, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.23966096741195383, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 7.992202729044834e-05, |
|
"loss": 0.7423, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.24083004530176824, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 8.031189083820663e-05, |
|
"loss": 0.7128, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.24199912319158265, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 8.070175438596491e-05, |
|
"loss": 0.7144, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.24316820108139706, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 8.109161793372319e-05, |
|
"loss": 0.6987, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.24433727897121146, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 8.148148148148148e-05, |
|
"loss": 0.6946, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.24550635686102587, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 8.187134502923976e-05, |
|
"loss": 0.6954, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.24667543475084028, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 8.226120857699804e-05, |
|
"loss": 0.7507, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.24784451264065469, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 8.265107212475634e-05, |
|
"loss": 0.9267, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.2490135905304691, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 8.304093567251462e-05, |
|
"loss": 0.7222, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.2501826684202835, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 8.343079922027291e-05, |
|
"loss": 0.8919, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.25135174631009793, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 8.382066276803119e-05, |
|
"loss": 0.7451, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.25252082419991234, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 8.421052631578948e-05, |
|
"loss": 0.7027, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.25368990208972675, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 8.460038986354776e-05, |
|
"loss": 0.712, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.25485897997954116, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 8.499025341130604e-05, |
|
"loss": 0.7122, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.25602805786935556, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 8.538011695906433e-05, |
|
"loss": 0.6871, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.25719713575916997, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 8.576998050682261e-05, |
|
"loss": 0.7278, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.2583662136489844, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 8.61598440545809e-05, |
|
"loss": 0.7023, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.2595352915387988, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 8.654970760233918e-05, |
|
"loss": 0.7167, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.2607043694286132, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 8.693957115009746e-05, |
|
"loss": 0.7346, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.2618734473184276, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 8.732943469785574e-05, |
|
"loss": 0.7223, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.263042525208242, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 8.771929824561403e-05, |
|
"loss": 0.7032, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.2642116030980564, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 8.810916179337231e-05, |
|
"loss": 0.6804, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.2653806809878708, |
|
"grad_norm": 0.875, |
|
"learning_rate": 8.849902534113061e-05, |
|
"loss": 0.7224, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.26654975887768523, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 8.888888888888889e-05, |
|
"loss": 0.7138, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.26771883676749963, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 8.927875243664718e-05, |
|
"loss": 0.7065, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.26888791465731404, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 8.966861598440546e-05, |
|
"loss": 0.7289, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.27005699254712845, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 9.005847953216374e-05, |
|
"loss": 0.6917, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.27122607043694286, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 9.044834307992203e-05, |
|
"loss": 0.7493, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.27239514832675726, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 9.083820662768031e-05, |
|
"loss": 0.7075, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.27356422621657167, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 9.12280701754386e-05, |
|
"loss": 0.6926, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.2747333041063861, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 9.161793372319688e-05, |
|
"loss": 0.6714, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.2759023819962005, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 9.200779727095516e-05, |
|
"loss": 0.6973, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.2770714598860149, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 9.239766081871345e-05, |
|
"loss": 0.7289, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.2782405377758293, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 9.278752436647173e-05, |
|
"loss": 0.7296, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.2794096156656437, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 9.317738791423001e-05, |
|
"loss": 0.7229, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.2805786935554581, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 9.35672514619883e-05, |
|
"loss": 0.7158, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.2817477714452725, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 9.39571150097466e-05, |
|
"loss": 0.6862, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.28291684933508693, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 9.434697855750488e-05, |
|
"loss": 0.6535, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.28408592722490134, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 9.473684210526316e-05, |
|
"loss": 0.703, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.28525500511471574, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 9.512670565302145e-05, |
|
"loss": 0.7296, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.28642408300453015, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 9.551656920077973e-05, |
|
"loss": 0.7061, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.2875931608943446, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 9.590643274853801e-05, |
|
"loss": 0.6731, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.288762238784159, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 9.62962962962963e-05, |
|
"loss": 0.7141, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.2899313166739734, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 9.668615984405458e-05, |
|
"loss": 0.7874, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.29110039456378783, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 9.707602339181286e-05, |
|
"loss": 0.7179, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.29226947245360224, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 9.746588693957115e-05, |
|
"loss": 0.6819, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.29343855034341665, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 9.785575048732943e-05, |
|
"loss": 0.688, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.29460762823323106, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 9.824561403508771e-05, |
|
"loss": 0.8356, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.29577670612304546, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 9.8635477582846e-05, |
|
"loss": 0.7733, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.29694578401285987, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 9.902534113060428e-05, |
|
"loss": 0.7087, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.2981148619026743, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 9.941520467836257e-05, |
|
"loss": 0.7023, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.2992839397924887, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 9.980506822612086e-05, |
|
"loss": 0.7087, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.3004530176823031, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.00010019493177387915, |
|
"loss": 0.7002, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.3016220955721175, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.00010058479532163743, |
|
"loss": 0.7355, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.3027911734619319, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.00010097465886939573, |
|
"loss": 0.6983, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.3039602513517463, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 0.000101364522417154, |
|
"loss": 0.7016, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.3051293292415607, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.0001017543859649123, |
|
"loss": 0.7096, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.3062984071313751, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.00010214424951267056, |
|
"loss": 0.7099, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.30746748502118953, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 0.00010253411306042886, |
|
"loss": 0.6931, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.30863656291100394, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 0.00010292397660818713, |
|
"loss": 0.7427, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.30980564080081835, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.00010331384015594543, |
|
"loss": 0.6609, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.31097471869063276, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.0001037037037037037, |
|
"loss": 0.7195, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.31214379658044716, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.000104093567251462, |
|
"loss": 0.6735, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.31331287447026157, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.00010448343079922027, |
|
"loss": 0.7455, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.314481952360076, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 0.00010487329434697856, |
|
"loss": 0.6683, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.3156510302498904, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.00010526315789473685, |
|
"loss": 0.6981, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.3168201081397048, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.00010565302144249513, |
|
"loss": 0.65, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.3179891860295192, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 0.00010604288499025342, |
|
"loss": 0.7997, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.3191582639193336, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.00010643274853801171, |
|
"loss": 0.7143, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.320327341809148, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 0.00010682261208576998, |
|
"loss": 0.6619, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.3214964196989624, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 0.00010721247563352828, |
|
"loss": 0.6632, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.32266549758877683, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00010760233918128655, |
|
"loss": 0.6844, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.32383457547859124, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.00010799220272904485, |
|
"loss": 0.698, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.32500365336840564, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.00010838206627680312, |
|
"loss": 0.6132, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.3261727312582201, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00010877192982456141, |
|
"loss": 0.6507, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.3273418091480345, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.00010916179337231968, |
|
"loss": 0.696, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.3285108870378489, |
|
"grad_norm": 3.0, |
|
"learning_rate": 0.00010955165692007798, |
|
"loss": 0.926, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.3296799649276633, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 0.00010994152046783625, |
|
"loss": 0.7797, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.33084904281747773, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 0.00011033138401559455, |
|
"loss": 0.7379, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.33201812070729214, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 0.00011072124756335282, |
|
"loss": 0.679, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.33318719859710655, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.00011111111111111112, |
|
"loss": 0.6668, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.33435627648692096, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.0001115009746588694, |
|
"loss": 0.7378, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.33552535437673536, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.0001118908382066277, |
|
"loss": 0.7341, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.33669443226654977, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.00011228070175438597, |
|
"loss": 0.6793, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.3378635101563642, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.00011267056530214426, |
|
"loss": 0.6926, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.3390325880461786, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.00011306042884990253, |
|
"loss": 0.6393, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.340201665935993, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 0.00011345029239766083, |
|
"loss": 0.7207, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.3413707438258074, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.0001138401559454191, |
|
"loss": 0.6168, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.3425398217156218, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.0001142300194931774, |
|
"loss": 0.712, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.3437088996054362, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.00011461988304093567, |
|
"loss": 0.6532, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.3448779774952506, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.00011500974658869397, |
|
"loss": 0.6766, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.346047055385065, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.00011539961013645224, |
|
"loss": 0.6986, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.34721613327487943, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.00011578947368421053, |
|
"loss": 0.6867, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.34838521116469384, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 0.0001161793372319688, |
|
"loss": 0.8175, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.34955428905450825, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.0001165692007797271, |
|
"loss": 0.8122, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.35072336694432266, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 0.00011695906432748539, |
|
"loss": 0.7642, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.35189244483413706, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 0.00011734892787524367, |
|
"loss": 0.703, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.35306152272395147, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.00011773879142300195, |
|
"loss": 0.7031, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.3542306006137659, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 0.00011812865497076025, |
|
"loss": 0.686, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.3553996785035803, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.00011851851851851852, |
|
"loss": 0.6708, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.3565687563933947, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.00011890838206627682, |
|
"loss": 0.8084, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.3577378342832091, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.00011929824561403509, |
|
"loss": 0.6705, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.3589069121730235, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.00011968810916179338, |
|
"loss": 0.6409, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.3600759900628379, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 0.00012007797270955165, |
|
"loss": 0.6805, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.3612450679526523, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 0.00012046783625730995, |
|
"loss": 0.7129, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.36241414584246673, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 0.00012085769980506822, |
|
"loss": 0.6819, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.36358322373228114, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.00012124756335282652, |
|
"loss": 0.6424, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.3647523016220956, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.00012163742690058479, |
|
"loss": 0.704, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.36592137951191, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.00012202729044834309, |
|
"loss": 0.8172, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 0.3670904574017244, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.00012241715399610137, |
|
"loss": 0.6486, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.3682595352915388, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.00012280701754385965, |
|
"loss": 0.6772, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.3694286131813532, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.00012319688109161794, |
|
"loss": 0.6997, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.37059769107116763, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 0.00012358674463937622, |
|
"loss": 0.6984, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 0.37176676896098204, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0001239766081871345, |
|
"loss": 0.6796, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.37293584685079645, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.0001243664717348928, |
|
"loss": 0.8194, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 0.37410492474061086, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.00012475633528265107, |
|
"loss": 0.7189, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.37527400263042526, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 0.00012514619883040936, |
|
"loss": 0.6934, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 0.37644308052023967, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 0.00012553606237816764, |
|
"loss": 0.6885, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.3776121584100541, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 0.00012592592592592592, |
|
"loss": 0.671, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 0.3787812362998685, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.0001263157894736842, |
|
"loss": 0.6515, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.3799503141896829, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00012670565302144252, |
|
"loss": 0.6833, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.3811193920794973, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.00012709551656920077, |
|
"loss": 0.6363, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.3822884699693117, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.00012748538011695908, |
|
"loss": 0.6848, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 0.3834575478591261, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 0.00012787524366471734, |
|
"loss": 0.7383, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.3846266257489405, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.00012826510721247565, |
|
"loss": 0.7057, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 0.3857957036387549, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0001286549707602339, |
|
"loss": 0.7039, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.38696478152856933, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.00012904483430799222, |
|
"loss": 0.727, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 0.38813385941838374, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.00012943469785575048, |
|
"loss": 0.7128, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.38930293730819815, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0001298245614035088, |
|
"loss": 0.6446, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 0.39047201519801256, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.00013021442495126704, |
|
"loss": 0.6434, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 0.39164109308782696, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00013060428849902535, |
|
"loss": 0.642, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.39281017097764137, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 0.00013099415204678364, |
|
"loss": 0.6805, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.3939792488674558, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.00013138401559454192, |
|
"loss": 0.6923, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 0.3951483267572702, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 0.0001317738791423002, |
|
"loss": 0.7944, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 0.3963174046470846, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 0.0001321637426900585, |
|
"loss": 0.6876, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 0.397486482536899, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00013255360623781677, |
|
"loss": 0.6857, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.3986555604267134, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.00013294346978557506, |
|
"loss": 0.6796, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 0.3998246383165278, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 0.6604, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 0.4009937162063422, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.00013372319688109162, |
|
"loss": 0.6999, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 0.4021627940961567, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.0001341130604288499, |
|
"loss": 0.663, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 0.4033318719859711, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 0.0001345029239766082, |
|
"loss": 0.6553, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.4045009498757855, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.00013489278752436647, |
|
"loss": 0.6892, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 0.4056700277655999, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.00013528265107212476, |
|
"loss": 0.6529, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 0.4068391056554143, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.00013567251461988304, |
|
"loss": 0.6728, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 0.4080081835452287, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00013606237816764133, |
|
"loss": 0.6847, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 0.4091772614350431, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.0001364522417153996, |
|
"loss": 0.7475, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.41034633932485753, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.0001368421052631579, |
|
"loss": 0.6805, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 0.41151541721467194, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.00013723196881091618, |
|
"loss": 0.8327, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 0.41268449510448635, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 0.0001376218323586745, |
|
"loss": 0.663, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 0.41385357299430076, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.00013801169590643274, |
|
"loss": 0.6675, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 0.41502265088411516, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 0.00013840155945419105, |
|
"loss": 0.7458, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.41619172877392957, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0001387914230019493, |
|
"loss": 0.6385, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 0.417360806663744, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 0.00013918128654970762, |
|
"loss": 0.7979, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 0.4185298845535584, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.00013957115009746588, |
|
"loss": 0.6949, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 0.4196989624433728, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0001399610136452242, |
|
"loss": 0.6713, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 0.4208680403331872, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.00014035087719298245, |
|
"loss": 0.6503, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.4220371182230016, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.00014074074074074076, |
|
"loss": 0.6641, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 0.423206196112816, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.000141130604288499, |
|
"loss": 0.7159, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 0.4243752740026304, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.00014152046783625732, |
|
"loss": 0.786, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 0.4255443518924448, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.00014191033138401558, |
|
"loss": 0.6511, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.42671342978225923, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.0001423001949317739, |
|
"loss": 0.8375, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.42788250767207364, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 0.00014269005847953217, |
|
"loss": 0.7475, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 0.42905158556188805, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.00014307992202729046, |
|
"loss": 0.6989, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 0.43022066345170246, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 0.00014346978557504874, |
|
"loss": 0.6792, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 0.43138974134151686, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.00014385964912280703, |
|
"loss": 0.7084, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 0.43255881923133127, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 0.0001442495126705653, |
|
"loss": 0.7263, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.4337278971211457, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.0001446393762183236, |
|
"loss": 0.706, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.4348969750109601, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00014502923976608188, |
|
"loss": 0.6989, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.4360660529007745, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.00014541910331384016, |
|
"loss": 0.7097, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 0.4372351307905889, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.00014580896686159844, |
|
"loss": 0.6767, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 0.4384042086804033, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00014619883040935673, |
|
"loss": 0.7045, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.4395732865702177, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.000146588693957115, |
|
"loss": 0.6492, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.4407423644600322, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.0001469785575048733, |
|
"loss": 0.6776, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 0.4419114423498466, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.00014736842105263158, |
|
"loss": 0.8008, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.443080520239661, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.00014775828460038986, |
|
"loss": 0.6677, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 0.4442495981294754, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00014814814814814815, |
|
"loss": 0.6799, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.4454186760192898, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.00014853801169590643, |
|
"loss": 0.6626, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.4465877539091042, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.0001489278752436647, |
|
"loss": 0.6814, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.4477568317989186, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00014931773879142302, |
|
"loss": 0.7181, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 0.448925909688733, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.00014970760233918128, |
|
"loss": 0.681, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.45009498757854743, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0001500974658869396, |
|
"loss": 0.6625, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.45126406546836184, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.00015048732943469785, |
|
"loss": 0.6558, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.45243314335817625, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.00015087719298245616, |
|
"loss": 0.6968, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 0.45360222124799066, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.00015126705653021442, |
|
"loss": 0.7073, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.45477129913780506, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.00015165692007797273, |
|
"loss": 0.7151, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 0.45594037702761947, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.00015204678362573098, |
|
"loss": 0.7192, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.4571094549174339, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0001524366471734893, |
|
"loss": 0.6937, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 0.4582785328072483, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.00015282651072124755, |
|
"loss": 0.6859, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.4594476106970627, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.00015321637426900586, |
|
"loss": 0.6491, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 0.4606166885868771, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.00015360623781676414, |
|
"loss": 0.719, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 0.4617857664766915, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.00015399610136452243, |
|
"loss": 0.6906, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.4629548443665059, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0001543859649122807, |
|
"loss": 0.6957, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.4641239222563203, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.000154775828460039, |
|
"loss": 0.6863, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 0.4652930001461347, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.00015516569200779728, |
|
"loss": 0.6845, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 0.46646207803594913, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.00015555555555555556, |
|
"loss": 0.6393, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.46763115592576354, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.00015594541910331385, |
|
"loss": 0.6739, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.46880023381557795, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.00015633528265107213, |
|
"loss": 0.6919, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 0.46996931170539236, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.00015672514619883041, |
|
"loss": 0.6885, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 0.47113838959520676, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.0001571150097465887, |
|
"loss": 0.7075, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 0.47230746748502117, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.00015750487329434698, |
|
"loss": 0.6716, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 0.4734765453748356, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.00015789473684210527, |
|
"loss": 0.6595, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.47464562326465, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.00015828460038986355, |
|
"loss": 0.6272, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 0.4758147011544644, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.00015867446393762183, |
|
"loss": 0.6849, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 0.4769837790442788, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.00015906432748538012, |
|
"loss": 0.6523, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 0.4781528569340932, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 0.0001594541910331384, |
|
"loss": 0.7112, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 0.47932193482390767, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.00015984405458089668, |
|
"loss": 0.7073, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.4804910127137221, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.000160233918128655, |
|
"loss": 0.6436, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 0.4816600906035365, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00016062378167641325, |
|
"loss": 0.669, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 0.4828291684933509, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 0.00016101364522417156, |
|
"loss": 0.8224, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 0.4839982463831653, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.00016140350877192982, |
|
"loss": 0.7526, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 0.4851673242729797, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.00016179337231968813, |
|
"loss": 0.6986, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.4863364021627941, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.00016218323586744639, |
|
"loss": 0.6496, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 0.4875054800526085, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0001625730994152047, |
|
"loss": 0.7165, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 0.4886745579424229, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00016296296296296295, |
|
"loss": 0.6453, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 0.48984363583223733, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.00016335282651072126, |
|
"loss": 0.617, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 0.49101271372205174, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.00016374269005847952, |
|
"loss": 0.6841, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.49218179161186615, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00016413255360623783, |
|
"loss": 0.6668, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 0.49335086950168056, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0001645224171539961, |
|
"loss": 0.7092, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 0.49451994739149496, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0001649122807017544, |
|
"loss": 0.6479, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 0.49568902528130937, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 0.00016530214424951268, |
|
"loss": 0.6687, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 0.4968581031711238, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.00016569200779727097, |
|
"loss": 0.6704, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.4980271810609382, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.00016608187134502925, |
|
"loss": 0.6678, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 0.4991962589507526, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.00016647173489278753, |
|
"loss": 0.6744, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 0.500365336840567, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.00016686159844054582, |
|
"loss": 0.6565, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 0.5015344147303814, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.0001672514619883041, |
|
"loss": 0.6499, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 0.5027034926201959, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.00016764132553606238, |
|
"loss": 0.6619, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.5038725705100102, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 0.00016803118908382067, |
|
"loss": 0.8945, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 0.5050416483998247, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 0.00016842105263157895, |
|
"loss": 0.7597, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 0.506210726289639, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.00016881091617933723, |
|
"loss": 0.6698, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 0.5073798041794535, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.00016920077972709552, |
|
"loss": 0.7135, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 0.5085488820692678, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.0001695906432748538, |
|
"loss": 0.8298, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.5097179599590823, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.00016998050682261209, |
|
"loss": 0.6749, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 0.5108870378488967, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.00017037037037037037, |
|
"loss": 0.8318, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 0.5120561157387111, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.00017076023391812865, |
|
"loss": 0.7329, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 0.5132251936285255, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.00017115009746588694, |
|
"loss": 0.7127, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 0.5143942715183399, |
|
"grad_norm": 9.375, |
|
"learning_rate": 0.00017153996101364522, |
|
"loss": 0.7969, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.5155633494081543, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 0.00017192982456140353, |
|
"loss": 0.8313, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 0.5167324272979688, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0001723196881091618, |
|
"loss": 0.7693, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 0.5179015051877831, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.0001727095516569201, |
|
"loss": 0.7127, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 0.5190705830775976, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.00017309941520467836, |
|
"loss": 0.7283, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 0.5202396609674119, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.00017348927875243667, |
|
"loss": 0.6591, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.5214087388572264, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.00017387914230019492, |
|
"loss": 0.6794, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 0.5225778167470407, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.00017426900584795323, |
|
"loss": 0.6299, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 0.5237468946368552, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.0001746588693957115, |
|
"loss": 0.73, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 0.5249159725266695, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0001750487329434698, |
|
"loss": 0.6538, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 0.526085050416484, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.00017543859649122806, |
|
"loss": 0.7013, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.5272541283062984, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.00017582846003898637, |
|
"loss": 0.6633, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 0.5284232061961128, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.00017621832358674462, |
|
"loss": 0.6638, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 0.5295922840859272, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.00017660818713450294, |
|
"loss": 0.6616, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 0.5307613619757416, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00017699805068226122, |
|
"loss": 0.6935, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 0.531930439865556, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0001773879142300195, |
|
"loss": 0.6795, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.5330995177553705, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.00017777777777777779, |
|
"loss": 0.6797, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 0.5342685956451848, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.00017816764132553607, |
|
"loss": 0.6583, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 0.5354376735349993, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.00017855750487329435, |
|
"loss": 0.6592, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 0.5366067514248137, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.00017894736842105264, |
|
"loss": 0.7534, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 0.5377758293146281, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.00017933723196881092, |
|
"loss": 0.699, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.5389449072044425, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0001797270955165692, |
|
"loss": 0.6759, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 0.5401139850942569, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.0001801169590643275, |
|
"loss": 0.6676, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 0.5412830629840714, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.00018050682261208577, |
|
"loss": 0.6734, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 0.5424521408738857, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.00018089668615984406, |
|
"loss": 0.6545, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 0.5436212187637002, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.00018128654970760234, |
|
"loss": 0.6863, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.5447902966535145, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.00018167641325536062, |
|
"loss": 0.6536, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 0.545959374543329, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.0001820662768031189, |
|
"loss": 0.6825, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 0.5471284524331433, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0001824561403508772, |
|
"loss": 0.7075, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 0.5482975303229578, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.00018284600389863547, |
|
"loss": 0.8103, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 0.5494666082127722, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 0.00018323586744639376, |
|
"loss": 0.7081, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.5506356861025866, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.00018362573099415207, |
|
"loss": 0.6647, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 0.551804763992401, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 0.00018401559454191032, |
|
"loss": 0.7202, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 0.5529738418822154, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.00018440545808966864, |
|
"loss": 0.7734, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 0.5541429197720298, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.0001847953216374269, |
|
"loss": 0.6381, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 0.5553119976618442, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0001851851851851852, |
|
"loss": 0.7235, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.5564810755516586, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.00018557504873294346, |
|
"loss": 0.8296, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 0.5576501534414731, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.00018596491228070177, |
|
"loss": 0.6901, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 0.5588192313312874, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00018635477582846003, |
|
"loss": 0.6628, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 0.5599883092211019, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00018674463937621834, |
|
"loss": 0.6689, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 0.5611573871109162, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0001871345029239766, |
|
"loss": 0.7377, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.5623264650007307, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.0001875243664717349, |
|
"loss": 0.7217, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 0.563495542890545, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0001879142300194932, |
|
"loss": 0.6776, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 0.5646646207803595, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.00018830409356725147, |
|
"loss": 0.694, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 0.5658336986701739, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.00018869395711500976, |
|
"loss": 0.7305, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 0.5670027765599883, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00018908382066276804, |
|
"loss": 0.6659, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.5681718544498027, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 0.00018947368421052632, |
|
"loss": 0.705, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 0.5693409323396171, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.0001898635477582846, |
|
"loss": 0.8348, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 0.5705100102294315, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0001902534113060429, |
|
"loss": 0.7383, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 0.571679088119246, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.00019064327485380117, |
|
"loss": 0.7039, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 0.5728481660090603, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.00019103313840155946, |
|
"loss": 0.7163, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.5740172438988748, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.00019142300194931774, |
|
"loss": 0.6648, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 0.5751863217886892, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.00019181286549707603, |
|
"loss": 0.6896, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 0.5763553996785036, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0001922027290448343, |
|
"loss": 0.7127, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 0.577524477568318, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.0001925925925925926, |
|
"loss": 0.6549, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 0.5786935554581324, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.00019298245614035088, |
|
"loss": 0.6581, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.5798626333479469, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.00019337231968810916, |
|
"loss": 0.6802, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 0.5810317112377612, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.00019376218323586744, |
|
"loss": 0.6906, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 0.5822007891275757, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.00019415204678362573, |
|
"loss": 0.7383, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 0.58336986701739, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.00019454191033138404, |
|
"loss": 0.7141, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 0.5845389449072045, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.0001949317738791423, |
|
"loss": 0.6729, |
|
"step": 1000 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 5130, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.57623446257664e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|