{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.2, "grad_norm": 1.559026837348938, "learning_rate": 4.998766400914329e-05, "loss": 4.2591, "step": 5 }, { "epoch": 0.4, "grad_norm": 2.0623748302459717, "learning_rate": 4.995066821070679e-05, "loss": 4.084, "step": 10 }, { "epoch": 0.6, "grad_norm": 1.7944402694702148, "learning_rate": 4.9889049115077005e-05, "loss": 4.0724, "step": 15 }, { "epoch": 0.8, "grad_norm": 2.1178815364837646, "learning_rate": 4.980286753286195e-05, "loss": 3.9116, "step": 20 }, { "epoch": 1.0, "grad_norm": 2.5003485679626465, "learning_rate": 4.9692208514878444e-05, "loss": 3.7709, "step": 25 }, { "epoch": 1.2, "grad_norm": 1.663772463798523, "learning_rate": 4.9557181268217227e-05, "loss": 3.5777, "step": 30 }, { "epoch": 1.4, "grad_norm": 1.7687804698944092, "learning_rate": 4.939791904846869e-05, "loss": 3.2731, "step": 35 }, { "epoch": 1.6, "grad_norm": 1.3904050588607788, "learning_rate": 4.9214579028215776e-05, "loss": 3.3322, "step": 40 }, { "epoch": 1.8, "grad_norm": 1.6052061319351196, "learning_rate": 4.900734214192358e-05, "loss": 3.1392, "step": 45 }, { "epoch": 2.0, "grad_norm": 1.6881974935531616, "learning_rate": 4.877641290737884e-05, "loss": 3.0443, "step": 50 }, { "epoch": 2.2, "grad_norm": 1.8101818561553955, "learning_rate": 4.852201922385564e-05, "loss": 3.1381, "step": 55 }, { "epoch": 2.4, "grad_norm": 1.9677544832229614, "learning_rate": 4.8244412147206284e-05, "loss": 2.907, "step": 60 }, { "epoch": 2.6, "grad_norm": 1.909866213798523, "learning_rate": 4.794386564209953e-05, "loss": 2.9507, "step": 65 }, { "epoch": 2.8, "grad_norm": 1.7826056480407715, "learning_rate": 4.762067631165049e-05, "loss": 2.9334, "step": 70 }, { "epoch": 3.0, "grad_norm": 2.4618849754333496, "learning_rate": 4.72751631047092e-05, "loss": 2.7628, "step": 75 }, { "epoch": 3.2, "grad_norm": 1.7693885564804077, "learning_rate": 4.690766700109659e-05, "loss": 2.7438, "step": 80 }, { "epoch": 3.4, "grad_norm": 2.086184024810791, "learning_rate": 4.65185506750986e-05, "loss": 2.7949, "step": 85 }, { "epoch": 3.6, "grad_norm": 2.5929203033447266, "learning_rate": 4.610819813755038e-05, "loss": 2.7056, "step": 90 }, { "epoch": 3.8, "grad_norm": 2.221397638320923, "learning_rate": 4.567701435686404e-05, "loss": 2.5453, "step": 95 }, { "epoch": 4.0, "grad_norm": 2.4999215602874756, "learning_rate": 4.522542485937369e-05, "loss": 2.7453, "step": 100 }, { "epoch": 4.2, "grad_norm": 2.4940710067749023, "learning_rate": 4.4753875309392266e-05, "loss": 2.5124, "step": 105 }, { "epoch": 4.4, "grad_norm": 3.088253974914551, "learning_rate": 4.426283106939474e-05, "loss": 2.5935, "step": 110 }, { "epoch": 4.6, "grad_norm": 2.6688263416290283, "learning_rate": 4.375277674076149e-05, "loss": 2.5463, "step": 115 }, { "epoch": 4.8, "grad_norm": 2.6959311962127686, "learning_rate": 4.3224215685535294e-05, "loss": 2.6062, "step": 120 }, { "epoch": 5.0, "grad_norm": 3.0782675743103027, "learning_rate": 4.267766952966369e-05, "loss": 2.4869, "step": 125 }, { "epoch": 5.2, "grad_norm": 3.190086603164673, "learning_rate": 4.211367764821722e-05, "loss": 2.457, "step": 130 }, { "epoch": 5.4, "grad_norm": 3.3880529403686523, "learning_rate": 4.1532796633091296e-05, "loss": 2.4223, "step": 135 }, { "epoch": 5.6, "grad_norm": 2.6318459510803223, "learning_rate": 4.093559974371725e-05, "loss": 2.385, "step": 140 }, 
{ "epoch": 5.8, "grad_norm": 2.92958402633667, "learning_rate": 4.0322676341324415e-05, "loss": 2.2885, "step": 145 }, { "epoch": 6.0, "grad_norm": 3.336378574371338, "learning_rate": 3.969463130731183e-05, "loss": 2.4326, "step": 150 }, { "epoch": 6.2, "grad_norm": 3.508898973464966, "learning_rate": 3.905208444630327e-05, "loss": 2.1526, "step": 155 }, { "epoch": 6.4, "grad_norm": 4.225854396820068, "learning_rate": 3.8395669874474915e-05, "loss": 2.4194, "step": 160 }, { "epoch": 6.6, "grad_norm": 4.112431526184082, "learning_rate": 3.7726035393759285e-05, "loss": 2.2839, "step": 165 }, { "epoch": 6.8, "grad_norm": 3.599271774291992, "learning_rate": 3.704384185254288e-05, "loss": 2.378, "step": 170 }, { "epoch": 7.0, "grad_norm": 4.479362964630127, "learning_rate": 3.634976249348867e-05, "loss": 2.0319, "step": 175 }, { "epoch": 7.2, "grad_norm": 4.234251499176025, "learning_rate": 3.564448228912682e-05, "loss": 2.021, "step": 180 }, { "epoch": 7.4, "grad_norm": 4.8591461181640625, "learning_rate": 3.4928697265869515e-05, "loss": 1.9893, "step": 185 }, { "epoch": 7.6, "grad_norm": 6.527431488037109, "learning_rate": 3.4203113817116957e-05, "loss": 2.1713, "step": 190 }, { "epoch": 7.8, "grad_norm": 5.432151794433594, "learning_rate": 3.346844800613229e-05, "loss": 2.2043, "step": 195 }, { "epoch": 8.0, "grad_norm": 4.323482513427734, "learning_rate": 3.272542485937369e-05, "loss": 2.2359, "step": 200 }, { "epoch": 8.2, "grad_norm": 3.9754505157470703, "learning_rate": 3.1974777650980735e-05, "loss": 2.0855, "step": 205 }, { "epoch": 8.4, "grad_norm": 4.956503391265869, "learning_rate": 3.121724717912138e-05, "loss": 1.9709, "step": 210 }, { "epoch": 8.6, "grad_norm": 5.2445969581604, "learning_rate": 3.045358103491357e-05, "loss": 2.062, "step": 215 }, { "epoch": 8.8, "grad_norm": 4.901844024658203, "learning_rate": 2.9684532864643122e-05, "loss": 2.0368, "step": 220 }, { "epoch": 9.0, "grad_norm": 4.711243629455566, "learning_rate": 2.8910861626005776e-05, "loss": 1.8624, "step": 225 }, { "epoch": 9.2, "grad_norm": 4.800491809844971, "learning_rate": 2.8133330839107608e-05, "loss": 2.0423, "step": 230 }, { "epoch": 9.4, "grad_norm": 5.560423851013184, "learning_rate": 2.7352707832962865e-05, "loss": 1.905, "step": 235 }, { "epoch": 9.6, "grad_norm": 7.123173713684082, "learning_rate": 2.656976298823284e-05, "loss": 1.6245, "step": 240 }, { "epoch": 9.8, "grad_norm": 6.563107013702393, "learning_rate": 2.578526897695321e-05, "loss": 1.8757, "step": 245 }, { "epoch": 10.0, "grad_norm": 5.408273220062256, "learning_rate": 2.5e-05, "loss": 1.9541, "step": 250 }, { "epoch": 10.2, "grad_norm": 5.59449577331543, "learning_rate": 2.4214731023046793e-05, "loss": 1.822, "step": 255 }, { "epoch": 10.4, "grad_norm": 7.534347057342529, "learning_rate": 2.3430237011767167e-05, "loss": 1.8813, "step": 260 }, { "epoch": 10.6, "grad_norm": 7.906167984008789, "learning_rate": 2.2647292167037144e-05, "loss": 1.8534, "step": 265 }, { "epoch": 10.8, "grad_norm": 5.6607184410095215, "learning_rate": 2.186666916089239e-05, "loss": 1.7331, "step": 270 }, { "epoch": 11.0, "grad_norm": 6.6628217697143555, "learning_rate": 2.1089138373994223e-05, "loss": 1.6813, "step": 275 }, { "epoch": 11.2, "grad_norm": 6.011382579803467, "learning_rate": 2.031546713535688e-05, "loss": 1.5243, "step": 280 }, { "epoch": 11.4, "grad_norm": 6.615753173828125, "learning_rate": 1.9546418965086442e-05, "loss": 1.7315, "step": 285 }, { "epoch": 11.6, "grad_norm": 6.512957572937012, "learning_rate": 1.8782752820878634e-05, "loss": 
1.797, "step": 290 }, { "epoch": 11.8, "grad_norm": 6.8899664878845215, "learning_rate": 1.802522234901927e-05, "loss": 1.7544, "step": 295 }, { "epoch": 12.0, "grad_norm": 6.717526435852051, "learning_rate": 1.7274575140626318e-05, "loss": 1.7081, "step": 300 }, { "epoch": 12.2, "grad_norm": 5.812675952911377, "learning_rate": 1.6531551993867717e-05, "loss": 1.6276, "step": 305 }, { "epoch": 12.4, "grad_norm": 6.911491870880127, "learning_rate": 1.5796886182883053e-05, "loss": 1.6319, "step": 310 }, { "epoch": 12.6, "grad_norm": 8.3098726272583, "learning_rate": 1.5071302734130489e-05, "loss": 1.5262, "step": 315 }, { "epoch": 12.8, "grad_norm": 6.8229756355285645, "learning_rate": 1.4355517710873184e-05, "loss": 1.6186, "step": 320 }, { "epoch": 13.0, "grad_norm": 6.8094258308410645, "learning_rate": 1.3650237506511331e-05, "loss": 1.6026, "step": 325 }, { "epoch": 13.2, "grad_norm": 6.559398651123047, "learning_rate": 1.2956158147457115e-05, "loss": 1.4957, "step": 330 }, { "epoch": 13.4, "grad_norm": 7.712869167327881, "learning_rate": 1.2273964606240718e-05, "loss": 1.5775, "step": 335 }, { "epoch": 13.6, "grad_norm": 8.607151985168457, "learning_rate": 1.1604330125525079e-05, "loss": 1.6038, "step": 340 }, { "epoch": 13.8, "grad_norm": 7.38192081451416, "learning_rate": 1.0947915553696742e-05, "loss": 1.4489, "step": 345 }, { "epoch": 14.0, "grad_norm": 9.552181243896484, "learning_rate": 1.0305368692688174e-05, "loss": 1.6264, "step": 350 }, { "epoch": 14.2, "grad_norm": 6.163412094116211, "learning_rate": 9.677323658675594e-06, "loss": 1.6046, "step": 355 }, { "epoch": 14.4, "grad_norm": 8.483848571777344, "learning_rate": 9.064400256282757e-06, "loss": 1.4736, "step": 360 }, { "epoch": 14.6, "grad_norm": 7.673094749450684, "learning_rate": 8.467203366908707e-06, "loss": 1.493, "step": 365 }, { "epoch": 14.8, "grad_norm": 7.977968692779541, "learning_rate": 7.886322351782783e-06, "loss": 1.5078, "step": 370 }, { "epoch": 15.0, "grad_norm": 7.63812255859375, "learning_rate": 7.3223304703363135e-06, "loss": 1.4198, "step": 375 }, { "epoch": 15.2, "grad_norm": 6.485208034515381, "learning_rate": 6.775784314464717e-06, "loss": 1.5416, "step": 380 }, { "epoch": 15.4, "grad_norm": 8.849021911621094, "learning_rate": 6.247223259238513e-06, "loss": 1.3864, "step": 385 }, { "epoch": 15.6, "grad_norm": 7.422196388244629, "learning_rate": 5.737168930605272e-06, "loss": 1.4536, "step": 390 }, { "epoch": 15.8, "grad_norm": 7.654792785644531, "learning_rate": 5.24612469060774e-06, "loss": 1.4819, "step": 395 }, { "epoch": 16.0, "grad_norm": 7.572329044342041, "learning_rate": 4.7745751406263165e-06, "loss": 1.4569, "step": 400 }, { "epoch": 16.2, "grad_norm": 7.488677024841309, "learning_rate": 4.322985643135952e-06, "loss": 1.4596, "step": 405 }, { "epoch": 16.4, "grad_norm": 7.173449993133545, "learning_rate": 3.891801862449629e-06, "loss": 1.2999, "step": 410 }, { "epoch": 16.6, "grad_norm": 7.496935844421387, "learning_rate": 3.4814493249014116e-06, "loss": 1.4009, "step": 415 }, { "epoch": 16.8, "grad_norm": 7.768974304199219, "learning_rate": 3.0923329989034132e-06, "loss": 1.6062, "step": 420 }, { "epoch": 17.0, "grad_norm": 7.606301784515381, "learning_rate": 2.7248368952908053e-06, "loss": 1.3563, "step": 425 }, { "epoch": 17.2, "grad_norm": 9.161933898925781, "learning_rate": 2.379323688349516e-06, "loss": 1.2701, "step": 430 }, { "epoch": 17.4, "grad_norm": 9.012870788574219, "learning_rate": 2.0561343579004715e-06, "loss": 1.539, "step": 435 }, { "epoch": 17.6, "grad_norm": 
7.092794418334961, "learning_rate": 1.7555878527937164e-06, "loss": 1.4352, "step": 440 }, { "epoch": 17.8, "grad_norm": 6.655755996704102, "learning_rate": 1.4779807761443636e-06, "loss": 1.2718, "step": 445 }, { "epoch": 18.0, "grad_norm": 6.042492866516113, "learning_rate": 1.2235870926211619e-06, "loss": 1.5636, "step": 450 }, { "epoch": 18.2, "grad_norm": 7.48867654800415, "learning_rate": 9.926578580764234e-07, "loss": 1.6322, "step": 455 }, { "epoch": 18.4, "grad_norm": 7.198723793029785, "learning_rate": 7.854209717842231e-07, "loss": 1.3335, "step": 460 }, { "epoch": 18.6, "grad_norm": 8.145171165466309, "learning_rate": 6.020809515313142e-07, "loss": 1.2486, "step": 465 }, { "epoch": 18.8, "grad_norm": 8.835026741027832, "learning_rate": 4.4281873178278475e-07, "loss": 1.41, "step": 470 }, { "epoch": 19.0, "grad_norm": 8.490466117858887, "learning_rate": 3.077914851215585e-07, "loss": 1.3855, "step": 475 }, { "epoch": 19.2, "grad_norm": 6.034891605377197, "learning_rate": 1.9713246713805588e-07, "loss": 1.4357, "step": 480 }, { "epoch": 19.4, "grad_norm": 7.8184919357299805, "learning_rate": 1.109508849230001e-07, "loss": 1.224, "step": 485 }, { "epoch": 19.6, "grad_norm": 8.390954971313477, "learning_rate": 4.9331789293211026e-08, "loss": 1.4121, "step": 490 }, { "epoch": 19.8, "grad_norm": 8.34197998046875, "learning_rate": 1.233599085671e-08, "loss": 1.4843, "step": 495 }, { "epoch": 20.0, "grad_norm": 7.5700273513793945, "learning_rate": 0.0, "loss": 1.52, "step": 500 }, { "epoch": 20.0, "step": 500, "total_flos": 1.3524244538720256e+16, "train_loss": 2.0708214292526246, "train_runtime": 754.3437, "train_samples_per_second": 10.605, "train_steps_per_second": 0.663 } ], "logging_steps": 5, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 100, "total_flos": 1.3524244538720256e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }
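
The loss curve recorded in "log_history" can be read straight out of this state file. Below is a minimal sketch, assuming the file is saved as trainer_state.json in the working directory and that matplotlib is installed; the path and output filename are illustrative, not part of the original run.

import json
import matplotlib.pyplot as plt

# Assumed location of the state file; the Trainer writes it into its output/checkpoint directory.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the per-step logging entries; the final summary entry reports aggregate
# fields ("train_loss", "train_runtime", ...) rather than a per-step "loss".
logs = [e for e in state["log_history"] if "loss" in e and "learning_rate" in e]

steps = [e["step"] for e in logs]
losses = [e["loss"] for e in logs]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title("Training loss over 500 steps (logged every 5 steps)")
plt.savefig("loss_curve.png")  # illustrative output name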