|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 20.0, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.559026837348938, |
|
"learning_rate": 4.998766400914329e-05, |
|
"loss": 4.2591, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 2.0623748302459717, |
|
"learning_rate": 4.995066821070679e-05, |
|
"loss": 4.084, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.7944402694702148, |
|
"learning_rate": 4.9889049115077005e-05, |
|
"loss": 4.0724, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 2.1178815364837646, |
|
"learning_rate": 4.980286753286195e-05, |
|
"loss": 3.9116, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 2.5003485679626465, |
|
"learning_rate": 4.9692208514878444e-05, |
|
"loss": 3.7709, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 1.663772463798523, |
|
"learning_rate": 4.9557181268217227e-05, |
|
"loss": 3.5777, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 1.7687804698944092, |
|
"learning_rate": 4.939791904846869e-05, |
|
"loss": 3.2731, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 1.3904050588607788, |
|
"learning_rate": 4.9214579028215776e-05, |
|
"loss": 3.3322, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 1.6052061319351196, |
|
"learning_rate": 4.900734214192358e-05, |
|
"loss": 3.1392, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.6881974935531616, |
|
"learning_rate": 4.877641290737884e-05, |
|
"loss": 3.0443, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 1.8101818561553955, |
|
"learning_rate": 4.852201922385564e-05, |
|
"loss": 3.1381, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 1.9677544832229614, |
|
"learning_rate": 4.8244412147206284e-05, |
|
"loss": 2.907, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 1.909866213798523, |
|
"learning_rate": 4.794386564209953e-05, |
|
"loss": 2.9507, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 1.7826056480407715, |
|
"learning_rate": 4.762067631165049e-05, |
|
"loss": 2.9334, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 2.4618849754333496, |
|
"learning_rate": 4.72751631047092e-05, |
|
"loss": 2.7628, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 1.7693885564804077, |
|
"learning_rate": 4.690766700109659e-05, |
|
"loss": 2.7438, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 2.086184024810791, |
|
"learning_rate": 4.65185506750986e-05, |
|
"loss": 2.7949, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 2.5929203033447266, |
|
"learning_rate": 4.610819813755038e-05, |
|
"loss": 2.7056, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 2.221397638320923, |
|
"learning_rate": 4.567701435686404e-05, |
|
"loss": 2.5453, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 2.4999215602874756, |
|
"learning_rate": 4.522542485937369e-05, |
|
"loss": 2.7453, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 2.4940710067749023, |
|
"learning_rate": 4.4753875309392266e-05, |
|
"loss": 2.5124, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 3.088253974914551, |
|
"learning_rate": 4.426283106939474e-05, |
|
"loss": 2.5935, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 2.6688263416290283, |
|
"learning_rate": 4.375277674076149e-05, |
|
"loss": 2.5463, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"grad_norm": 2.6959311962127686, |
|
"learning_rate": 4.3224215685535294e-05, |
|
"loss": 2.6062, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 3.0782675743103027, |
|
"learning_rate": 4.267766952966369e-05, |
|
"loss": 2.4869, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 5.2, |
|
"grad_norm": 3.190086603164673, |
|
"learning_rate": 4.211367764821722e-05, |
|
"loss": 2.457, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 5.4, |
|
"grad_norm": 3.3880529403686523, |
|
"learning_rate": 4.1532796633091296e-05, |
|
"loss": 2.4223, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 5.6, |
|
"grad_norm": 2.6318459510803223, |
|
"learning_rate": 4.093559974371725e-05, |
|
"loss": 2.385, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 5.8, |
|
"grad_norm": 2.92958402633667, |
|
"learning_rate": 4.0322676341324415e-05, |
|
"loss": 2.2885, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 3.336378574371338, |
|
"learning_rate": 3.969463130731183e-05, |
|
"loss": 2.4326, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 6.2, |
|
"grad_norm": 3.508898973464966, |
|
"learning_rate": 3.905208444630327e-05, |
|
"loss": 2.1526, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"grad_norm": 4.225854396820068, |
|
"learning_rate": 3.8395669874474915e-05, |
|
"loss": 2.4194, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 6.6, |
|
"grad_norm": 4.112431526184082, |
|
"learning_rate": 3.7726035393759285e-05, |
|
"loss": 2.2839, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 6.8, |
|
"grad_norm": 3.599271774291992, |
|
"learning_rate": 3.704384185254288e-05, |
|
"loss": 2.378, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 4.479362964630127, |
|
"learning_rate": 3.634976249348867e-05, |
|
"loss": 2.0319, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 7.2, |
|
"grad_norm": 4.234251499176025, |
|
"learning_rate": 3.564448228912682e-05, |
|
"loss": 2.021, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 7.4, |
|
"grad_norm": 4.8591461181640625, |
|
"learning_rate": 3.4928697265869515e-05, |
|
"loss": 1.9893, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 7.6, |
|
"grad_norm": 6.527431488037109, |
|
"learning_rate": 3.4203113817116957e-05, |
|
"loss": 2.1713, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 7.8, |
|
"grad_norm": 5.432151794433594, |
|
"learning_rate": 3.346844800613229e-05, |
|
"loss": 2.2043, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 4.323482513427734, |
|
"learning_rate": 3.272542485937369e-05, |
|
"loss": 2.2359, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 8.2, |
|
"grad_norm": 3.9754505157470703, |
|
"learning_rate": 3.1974777650980735e-05, |
|
"loss": 2.0855, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 8.4, |
|
"grad_norm": 4.956503391265869, |
|
"learning_rate": 3.121724717912138e-05, |
|
"loss": 1.9709, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"grad_norm": 5.2445969581604, |
|
"learning_rate": 3.045358103491357e-05, |
|
"loss": 2.062, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 8.8, |
|
"grad_norm": 4.901844024658203, |
|
"learning_rate": 2.9684532864643122e-05, |
|
"loss": 2.0368, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 4.711243629455566, |
|
"learning_rate": 2.8910861626005776e-05, |
|
"loss": 1.8624, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 9.2, |
|
"grad_norm": 4.800491809844971, |
|
"learning_rate": 2.8133330839107608e-05, |
|
"loss": 2.0423, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 9.4, |
|
"grad_norm": 5.560423851013184, |
|
"learning_rate": 2.7352707832962865e-05, |
|
"loss": 1.905, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 9.6, |
|
"grad_norm": 7.123173713684082, |
|
"learning_rate": 2.656976298823284e-05, |
|
"loss": 1.6245, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 9.8, |
|
"grad_norm": 6.563107013702393, |
|
"learning_rate": 2.578526897695321e-05, |
|
"loss": 1.8757, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 5.408273220062256, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.9541, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 10.2, |
|
"grad_norm": 5.59449577331543, |
|
"learning_rate": 2.4214731023046793e-05, |
|
"loss": 1.822, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 10.4, |
|
"grad_norm": 7.534347057342529, |
|
"learning_rate": 2.3430237011767167e-05, |
|
"loss": 1.8813, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 10.6, |
|
"grad_norm": 7.906167984008789, |
|
"learning_rate": 2.2647292167037144e-05, |
|
"loss": 1.8534, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 10.8, |
|
"grad_norm": 5.6607184410095215, |
|
"learning_rate": 2.186666916089239e-05, |
|
"loss": 1.7331, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"grad_norm": 6.6628217697143555, |
|
"learning_rate": 2.1089138373994223e-05, |
|
"loss": 1.6813, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 11.2, |
|
"grad_norm": 6.011382579803467, |
|
"learning_rate": 2.031546713535688e-05, |
|
"loss": 1.5243, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 11.4, |
|
"grad_norm": 6.615753173828125, |
|
"learning_rate": 1.9546418965086442e-05, |
|
"loss": 1.7315, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 11.6, |
|
"grad_norm": 6.512957572937012, |
|
"learning_rate": 1.8782752820878634e-05, |
|
"loss": 1.797, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 11.8, |
|
"grad_norm": 6.8899664878845215, |
|
"learning_rate": 1.802522234901927e-05, |
|
"loss": 1.7544, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 6.717526435852051, |
|
"learning_rate": 1.7274575140626318e-05, |
|
"loss": 1.7081, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 12.2, |
|
"grad_norm": 5.812675952911377, |
|
"learning_rate": 1.6531551993867717e-05, |
|
"loss": 1.6276, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 12.4, |
|
"grad_norm": 6.911491870880127, |
|
"learning_rate": 1.5796886182883053e-05, |
|
"loss": 1.6319, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 12.6, |
|
"grad_norm": 8.3098726272583, |
|
"learning_rate": 1.5071302734130489e-05, |
|
"loss": 1.5262, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 12.8, |
|
"grad_norm": 6.8229756355285645, |
|
"learning_rate": 1.4355517710873184e-05, |
|
"loss": 1.6186, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"grad_norm": 6.8094258308410645, |
|
"learning_rate": 1.3650237506511331e-05, |
|
"loss": 1.6026, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 13.2, |
|
"grad_norm": 6.559398651123047, |
|
"learning_rate": 1.2956158147457115e-05, |
|
"loss": 1.4957, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 13.4, |
|
"grad_norm": 7.712869167327881, |
|
"learning_rate": 1.2273964606240718e-05, |
|
"loss": 1.5775, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 13.6, |
|
"grad_norm": 8.607151985168457, |
|
"learning_rate": 1.1604330125525079e-05, |
|
"loss": 1.6038, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 13.8, |
|
"grad_norm": 7.38192081451416, |
|
"learning_rate": 1.0947915553696742e-05, |
|
"loss": 1.4489, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"grad_norm": 9.552181243896484, |
|
"learning_rate": 1.0305368692688174e-05, |
|
"loss": 1.6264, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 14.2, |
|
"grad_norm": 6.163412094116211, |
|
"learning_rate": 9.677323658675594e-06, |
|
"loss": 1.6046, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 14.4, |
|
"grad_norm": 8.483848571777344, |
|
"learning_rate": 9.064400256282757e-06, |
|
"loss": 1.4736, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 14.6, |
|
"grad_norm": 7.673094749450684, |
|
"learning_rate": 8.467203366908707e-06, |
|
"loss": 1.493, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 14.8, |
|
"grad_norm": 7.977968692779541, |
|
"learning_rate": 7.886322351782783e-06, |
|
"loss": 1.5078, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"grad_norm": 7.63812255859375, |
|
"learning_rate": 7.3223304703363135e-06, |
|
"loss": 1.4198, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 15.2, |
|
"grad_norm": 6.485208034515381, |
|
"learning_rate": 6.775784314464717e-06, |
|
"loss": 1.5416, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 15.4, |
|
"grad_norm": 8.849021911621094, |
|
"learning_rate": 6.247223259238513e-06, |
|
"loss": 1.3864, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 15.6, |
|
"grad_norm": 7.422196388244629, |
|
"learning_rate": 5.737168930605272e-06, |
|
"loss": 1.4536, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 15.8, |
|
"grad_norm": 7.654792785644531, |
|
"learning_rate": 5.24612469060774e-06, |
|
"loss": 1.4819, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 7.572329044342041, |
|
"learning_rate": 4.7745751406263165e-06, |
|
"loss": 1.4569, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 16.2, |
|
"grad_norm": 7.488677024841309, |
|
"learning_rate": 4.322985643135952e-06, |
|
"loss": 1.4596, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 16.4, |
|
"grad_norm": 7.173449993133545, |
|
"learning_rate": 3.891801862449629e-06, |
|
"loss": 1.2999, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 16.6, |
|
"grad_norm": 7.496935844421387, |
|
"learning_rate": 3.4814493249014116e-06, |
|
"loss": 1.4009, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 16.8, |
|
"grad_norm": 7.768974304199219, |
|
"learning_rate": 3.0923329989034132e-06, |
|
"loss": 1.6062, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"grad_norm": 7.606301784515381, |
|
"learning_rate": 2.7248368952908053e-06, |
|
"loss": 1.3563, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 17.2, |
|
"grad_norm": 9.161933898925781, |
|
"learning_rate": 2.379323688349516e-06, |
|
"loss": 1.2701, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 17.4, |
|
"grad_norm": 9.012870788574219, |
|
"learning_rate": 2.0561343579004715e-06, |
|
"loss": 1.539, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 17.6, |
|
"grad_norm": 7.092794418334961, |
|
"learning_rate": 1.7555878527937164e-06, |
|
"loss": 1.4352, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 17.8, |
|
"grad_norm": 6.655755996704102, |
|
"learning_rate": 1.4779807761443636e-06, |
|
"loss": 1.2718, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"grad_norm": 6.042492866516113, |
|
"learning_rate": 1.2235870926211619e-06, |
|
"loss": 1.5636, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 18.2, |
|
"grad_norm": 7.48867654800415, |
|
"learning_rate": 9.926578580764234e-07, |
|
"loss": 1.6322, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 18.4, |
|
"grad_norm": 7.198723793029785, |
|
"learning_rate": 7.854209717842231e-07, |
|
"loss": 1.3335, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 18.6, |
|
"grad_norm": 8.145171165466309, |
|
"learning_rate": 6.020809515313142e-07, |
|
"loss": 1.2486, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 18.8, |
|
"grad_norm": 8.835026741027832, |
|
"learning_rate": 4.4281873178278475e-07, |
|
"loss": 1.41, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"grad_norm": 8.490466117858887, |
|
"learning_rate": 3.077914851215585e-07, |
|
"loss": 1.3855, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 19.2, |
|
"grad_norm": 6.034891605377197, |
|
"learning_rate": 1.9713246713805588e-07, |
|
"loss": 1.4357, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 19.4, |
|
"grad_norm": 7.8184919357299805, |
|
"learning_rate": 1.109508849230001e-07, |
|
"loss": 1.224, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 19.6, |
|
"grad_norm": 8.390954971313477, |
|
"learning_rate": 4.9331789293211026e-08, |
|
"loss": 1.4121, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 19.8, |
|
"grad_norm": 8.34197998046875, |
|
"learning_rate": 1.233599085671e-08, |
|
"loss": 1.4843, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 7.5700273513793945, |
|
"learning_rate": 0.0, |
|
"loss": 1.52, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"step": 500, |
|
"total_flos": 1.3524244538720256e+16, |
|
"train_loss": 2.0708214292526246, |
|
"train_runtime": 754.3437, |
|
"train_samples_per_second": 10.605, |
|
"train_steps_per_second": 0.663 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 100, |
|
"total_flos": 1.3524244538720256e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|