{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0005614783725549371, "eval_steps": 25, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 5.614783725549371e-06, "grad_norm": 1.063336968421936, "learning_rate": 0.0002, "loss": 1.0718, "step": 1 }, { "epoch": 5.614783725549371e-06, "eval_loss": 1.0175169706344604, "eval_runtime": 13561.1744, "eval_samples_per_second": 2.765, "eval_steps_per_second": 2.765, "step": 1 }, { "epoch": 1.1229567451098743e-05, "grad_norm": 0.9254959225654602, "learning_rate": 0.0004, "loss": 1.26, "step": 2 }, { "epoch": 1.6844351176648115e-05, "grad_norm": 0.4091038107872009, "learning_rate": 0.0006, "loss": 0.7047, "step": 3 }, { "epoch": 2.2459134902197485e-05, "grad_norm": 0.8800899982452393, "learning_rate": 0.0008, "loss": 0.9239, "step": 4 }, { "epoch": 2.8073918627746856e-05, "grad_norm": 1.2747443914413452, "learning_rate": 0.001, "loss": 0.7746, "step": 5 }, { "epoch": 3.368870235329623e-05, "grad_norm": 1.434735894203186, "learning_rate": 0.0012, "loss": 1.1616, "step": 6 }, { "epoch": 3.9303486078845604e-05, "grad_norm": 1.1246367692947388, "learning_rate": 0.0014, "loss": 0.7168, "step": 7 }, { "epoch": 4.491826980439497e-05, "grad_norm": 0.9489652514457703, "learning_rate": 0.0016, "loss": 0.9415, "step": 8 }, { "epoch": 5.0533053529944345e-05, "grad_norm": 0.6757056713104248, "learning_rate": 0.0018000000000000002, "loss": 0.6533, "step": 9 }, { "epoch": 5.614783725549371e-05, "grad_norm": 1.3025261163711548, "learning_rate": 0.002, "loss": 0.6712, "step": 10 }, { "epoch": 6.176262098104309e-05, "grad_norm": 1.0577226877212524, "learning_rate": 0.0019993908270190958, "loss": 0.7631, "step": 11 }, { "epoch": 6.737740470659246e-05, "grad_norm": 1.5588256120681763, "learning_rate": 0.0019975640502598244, "loss": 1.1228, "step": 12 }, { "epoch": 7.299218843214183e-05, "grad_norm": 1.1976680755615234, "learning_rate": 0.0019945218953682734, "loss": 0.6171, "step": 13 }, { "epoch": 7.860697215769121e-05, "grad_norm": 1.1977174282073975, "learning_rate": 0.0019902680687415705, "loss": 0.7131, "step": 14 }, { "epoch": 8.422175588324057e-05, "grad_norm": 1.8031384944915771, "learning_rate": 0.001984807753012208, "loss": 1.1394, "step": 15 }, { "epoch": 8.983653960878994e-05, "grad_norm": 2.389751434326172, "learning_rate": 0.0019781476007338056, "loss": 0.822, "step": 16 }, { "epoch": 9.545132333433932e-05, "grad_norm": 2.088827610015869, "learning_rate": 0.0019702957262759963, "loss": 1.0301, "step": 17 }, { "epoch": 0.00010106610705988869, "grad_norm": 1.35056734085083, "learning_rate": 0.001961261695938319, "loss": 0.6762, "step": 18 }, { "epoch": 0.00010668089078543806, "grad_norm": 1.7480477094650269, "learning_rate": 0.0019510565162951536, "loss": 0.9004, "step": 19 }, { "epoch": 0.00011229567451098742, "grad_norm": 1.480393409729004, "learning_rate": 0.0019396926207859084, "loss": 0.9562, "step": 20 }, { "epoch": 0.0001179104582365368, "grad_norm": 1.6566611528396606, "learning_rate": 0.0019271838545667874, "loss": 0.8456, "step": 21 }, { "epoch": 0.00012352524196208617, "grad_norm": 1.7289091348648071, "learning_rate": 0.001913545457642601, "loss": 1.0861, "step": 22 }, { "epoch": 0.00012914002568763553, "grad_norm": 2.511488914489746, "learning_rate": 0.001898794046299167, "loss": 0.9673, "step": 23 }, { "epoch": 0.00013475480941318492, "grad_norm": 2.9465174674987793, "learning_rate": 0.001882947592858927, "loss": 1.0684, "step": 24 }, { "epoch": 0.00014036959313873428, "grad_norm": 2.350576400756836, "learning_rate": 0.001866025403784439, "loss": 1.1737, "step": 25 }, { "epoch": 0.00014036959313873428, "eval_loss": 1.0244402885437012, "eval_runtime": 13567.5847, "eval_samples_per_second": 2.764, "eval_steps_per_second": 2.764, "step": 25 }, { "epoch": 0.00014598437686428367, "grad_norm": 2.190694570541382, "learning_rate": 0.0018480480961564258, "loss": 0.9693, "step": 26 }, { "epoch": 0.00015159916058983303, "grad_norm": 3.3499956130981445, "learning_rate": 0.0018290375725550417, "loss": 1.3869, "step": 27 }, { "epoch": 0.00015721394431538241, "grad_norm": 2.3959763050079346, "learning_rate": 0.0018090169943749475, "loss": 0.9052, "step": 28 }, { "epoch": 0.00016282872804093177, "grad_norm": 1.8362096548080444, "learning_rate": 0.0017880107536067218, "loss": 1.1557, "step": 29 }, { "epoch": 0.00016844351176648113, "grad_norm": 2.4866271018981934, "learning_rate": 0.001766044443118978, "loss": 1.2626, "step": 30 }, { "epoch": 0.00017405829549203052, "grad_norm": 3.59562349319458, "learning_rate": 0.0017431448254773942, "loss": 1.3407, "step": 31 }, { "epoch": 0.00017967307921757988, "grad_norm": 2.424304723739624, "learning_rate": 0.001719339800338651, "loss": 0.9181, "step": 32 }, { "epoch": 0.00018528786294312927, "grad_norm": 3.3184380531311035, "learning_rate": 0.0016946583704589974, "loss": 0.9443, "step": 33 }, { "epoch": 0.00019090264666867863, "grad_norm": 3.005448579788208, "learning_rate": 0.0016691306063588583, "loss": 1.0322, "step": 34 }, { "epoch": 0.000196517430394228, "grad_norm": 1.865719199180603, "learning_rate": 0.0016427876096865393, "loss": 0.8881, "step": 35 }, { "epoch": 0.00020213221411977738, "grad_norm": 1.9284836053848267, "learning_rate": 0.0016156614753256582, "loss": 1.0125, "step": 36 }, { "epoch": 0.00020774699784532674, "grad_norm": 3.410578489303589, "learning_rate": 0.0015877852522924731, "loss": 1.1619, "step": 37 }, { "epoch": 0.00021336178157087613, "grad_norm": 2.3564658164978027, "learning_rate": 0.0015591929034707468, "loss": 1.3581, "step": 38 }, { "epoch": 0.00021897656529642549, "grad_norm": 1.6943787336349487, "learning_rate": 0.001529919264233205, "loss": 0.7817, "step": 39 }, { "epoch": 0.00022459134902197485, "grad_norm": 2.2240307331085205, "learning_rate": 0.0015, "loss": 0.8353, "step": 40 }, { "epoch": 0.00023020613274752423, "grad_norm": 3.0564215183258057, "learning_rate": 0.0014694715627858908, "loss": 1.0998, "step": 41 }, { "epoch": 0.0002358209164730736, "grad_norm": 3.891338348388672, "learning_rate": 0.0014383711467890773, "loss": 1.4342, "step": 42 }, { "epoch": 0.00024143570019862298, "grad_norm": 1.7128349542617798, "learning_rate": 0.0014067366430758003, "loss": 0.9709, "step": 43 }, { "epoch": 0.00024705048392417234, "grad_norm": 4.62891960144043, "learning_rate": 0.0013746065934159121, "loss": 1.0202, "step": 44 }, { "epoch": 0.00025266526764972173, "grad_norm": 4.355927467346191, "learning_rate": 0.0013420201433256688, "loss": 1.0459, "step": 45 }, { "epoch": 0.00025828005137527106, "grad_norm": 2.2983617782592773, "learning_rate": 0.0013090169943749475, "loss": 1.1911, "step": 46 }, { "epoch": 0.00026389483510082045, "grad_norm": 2.7273306846618652, "learning_rate": 0.0012756373558169992, "loss": 0.6804, "step": 47 }, { "epoch": 0.00026950961882636984, "grad_norm": 3.133652687072754, "learning_rate": 0.0012419218955996676, "loss": 0.9479, "step": 48 }, { "epoch": 0.0002751244025519192, "grad_norm": 1.5990813970565796, "learning_rate": 0.0012079116908177592, "loss": 1.1125, "step": 49 }, { "epoch": 0.00028073918627746856, "grad_norm": 3.576906204223633, "learning_rate": 0.0011736481776669307, "loss": 1.1765, "step": 50 }, { "epoch": 0.00028073918627746856, "eval_loss": 1.0542502403259277, "eval_runtime": 13596.4994, "eval_samples_per_second": 2.758, "eval_steps_per_second": 2.758, "step": 50 }, { "epoch": 0.00028635397000301795, "grad_norm": 3.177819013595581, "learning_rate": 0.0011391731009600654, "loss": 1.0826, "step": 51 }, { "epoch": 0.00029196875372856733, "grad_norm": 1.8690358400344849, "learning_rate": 0.0011045284632676536, "loss": 1.1652, "step": 52 }, { "epoch": 0.00029758353745411667, "grad_norm": 2.333109140396118, "learning_rate": 0.0010697564737441251, "loss": 0.9394, "step": 53 }, { "epoch": 0.00030319832117966605, "grad_norm": 2.443849802017212, "learning_rate": 0.0010348994967025011, "loss": 1.1265, "step": 54 }, { "epoch": 0.00030881310490521544, "grad_norm": 2.8769495487213135, "learning_rate": 0.001, "loss": 1.2229, "step": 55 }, { "epoch": 0.00031442788863076483, "grad_norm": 2.619149684906006, "learning_rate": 0.0009651005032974994, "loss": 0.9862, "step": 56 }, { "epoch": 0.00032004267235631416, "grad_norm": 2.4989588260650635, "learning_rate": 0.0009302435262558747, "loss": 1.2905, "step": 57 }, { "epoch": 0.00032565745608186355, "grad_norm": 2.512277841567993, "learning_rate": 0.0008954715367323467, "loss": 1.154, "step": 58 }, { "epoch": 0.00033127223980741294, "grad_norm": 2.4800610542297363, "learning_rate": 0.0008608268990399348, "loss": 1.5995, "step": 59 }, { "epoch": 0.00033688702353296227, "grad_norm": 2.3938980102539062, "learning_rate": 0.0008263518223330697, "loss": 1.319, "step": 60 }, { "epoch": 0.00034250180725851166, "grad_norm": 2.4228010177612305, "learning_rate": 0.0007920883091822408, "loss": 0.8742, "step": 61 }, { "epoch": 0.00034811659098406104, "grad_norm": 2.0796005725860596, "learning_rate": 0.0007580781044003324, "loss": 1.1228, "step": 62 }, { "epoch": 0.0003537313747096104, "grad_norm": 2.852293014526367, "learning_rate": 0.0007243626441830009, "loss": 0.9704, "step": 63 }, { "epoch": 0.00035934615843515977, "grad_norm": 2.3396663665771484, "learning_rate": 0.0006909830056250527, "loss": 0.6046, "step": 64 }, { "epoch": 0.00036496094216070915, "grad_norm": 2.455451488494873, "learning_rate": 0.0006579798566743314, "loss": 1.1228, "step": 65 }, { "epoch": 0.00037057572588625854, "grad_norm": 2.2324576377868652, "learning_rate": 0.0006253934065840879, "loss": 0.8794, "step": 66 }, { "epoch": 0.0003761905096118079, "grad_norm": 1.5327250957489014, "learning_rate": 0.0005932633569242, "loss": 0.9488, "step": 67 }, { "epoch": 0.00038180529333735726, "grad_norm": 1.51426362991333, "learning_rate": 0.0005616288532109225, "loss": 0.5928, "step": 68 }, { "epoch": 0.00038742007706290665, "grad_norm": 2.4975624084472656, "learning_rate": 0.0005305284372141095, "loss": 0.8214, "step": 69 }, { "epoch": 0.000393034860788456, "grad_norm": 2.9134929180145264, "learning_rate": 0.0005000000000000002, "loss": 1.1048, "step": 70 }, { "epoch": 0.00039864964451400537, "grad_norm": 2.5357260704040527, "learning_rate": 0.0004700807357667952, "loss": 1.4895, "step": 71 }, { "epoch": 0.00040426442823955476, "grad_norm": 1.9591279029846191, "learning_rate": 0.0004408070965292533, "loss": 0.8524, "step": 72 }, { "epoch": 0.00040987921196510414, "grad_norm": 1.8926405906677246, "learning_rate": 0.00041221474770752696, "loss": 0.9163, "step": 73 }, { "epoch": 0.0004154939956906535, "grad_norm": 1.8855713605880737, "learning_rate": 0.0003843385246743417, "loss": 1.4346, "step": 74 }, { "epoch": 0.00042110877941620286, "grad_norm": 3.0587925910949707, "learning_rate": 0.0003572123903134606, "loss": 0.9795, "step": 75 }, { "epoch": 0.00042110877941620286, "eval_loss": 0.9440500140190125, "eval_runtime": 13647.0704, "eval_samples_per_second": 2.747, "eval_steps_per_second": 2.747, "step": 75 }, { "epoch": 0.00042672356314175225, "grad_norm": 1.1029833555221558, "learning_rate": 0.0003308693936411421, "loss": 0.8496, "step": 76 }, { "epoch": 0.0004323383468673016, "grad_norm": 1.740774393081665, "learning_rate": 0.00030534162954100265, "loss": 0.8706, "step": 77 }, { "epoch": 0.00043795313059285097, "grad_norm": 1.9898922443389893, "learning_rate": 0.00028066019966134904, "loss": 0.8711, "step": 78 }, { "epoch": 0.00044356791431840036, "grad_norm": 1.1141417026519775, "learning_rate": 0.00025685517452260563, "loss": 0.7094, "step": 79 }, { "epoch": 0.0004491826980439497, "grad_norm": 1.7842293977737427, "learning_rate": 0.0002339555568810221, "loss": 1.0987, "step": 80 }, { "epoch": 0.0004547974817694991, "grad_norm": 1.4790884256362915, "learning_rate": 0.00021198924639327811, "loss": 0.6301, "step": 81 }, { "epoch": 0.00046041226549504847, "grad_norm": 0.9781002998352051, "learning_rate": 0.00019098300562505265, "loss": 0.893, "step": 82 }, { "epoch": 0.00046602704922059786, "grad_norm": 1.3697646856307983, "learning_rate": 0.00017096242744495837, "loss": 0.6985, "step": 83 }, { "epoch": 0.0004716418329461472, "grad_norm": 1.812117576599121, "learning_rate": 0.00015195190384357404, "loss": 1.1954, "step": 84 }, { "epoch": 0.0004772566166716966, "grad_norm": 1.748288631439209, "learning_rate": 0.0001339745962155613, "loss": 0.9258, "step": 85 }, { "epoch": 0.00048287140039724596, "grad_norm": 1.5759660005569458, "learning_rate": 0.00011705240714107302, "loss": 0.8344, "step": 86 }, { "epoch": 0.0004884861841227954, "grad_norm": 1.638961911201477, "learning_rate": 0.00010120595370083318, "loss": 1.0205, "step": 87 }, { "epoch": 0.0004941009678483447, "grad_norm": 1.6769089698791504, "learning_rate": 8.645454235739902e-05, "loss": 0.9673, "step": 88 }, { "epoch": 0.000499715751573894, "grad_norm": 1.4740527868270874, "learning_rate": 7.281614543321269e-05, "loss": 0.5517, "step": 89 }, { "epoch": 0.0005053305352994435, "grad_norm": 1.8537969589233398, "learning_rate": 6.0307379214091684e-05, "loss": 1.0717, "step": 90 }, { "epoch": 0.0005109453190249928, "grad_norm": 1.06834077835083, "learning_rate": 4.894348370484647e-05, "loss": 0.833, "step": 91 }, { "epoch": 0.0005165601027505421, "grad_norm": 2.1590793132781982, "learning_rate": 3.873830406168111e-05, "loss": 1.2292, "step": 92 }, { "epoch": 0.0005221748864760916, "grad_norm": 1.418778419494629, "learning_rate": 2.9704273724003528e-05, "loss": 0.7584, "step": 93 }, { "epoch": 0.0005277896702016409, "grad_norm": 1.332282543182373, "learning_rate": 2.1852399266194312e-05, "loss": 0.8623, "step": 94 }, { "epoch": 0.0005334044539271903, "grad_norm": 1.214484691619873, "learning_rate": 1.519224698779198e-05, "loss": 0.5786, "step": 95 }, { "epoch": 0.0005390192376527397, "grad_norm": 1.5501805543899536, "learning_rate": 9.731931258429638e-06, "loss": 1.0117, "step": 96 }, { "epoch": 0.000544634021378289, "grad_norm": 1.5862345695495605, "learning_rate": 5.47810463172671e-06, "loss": 0.88, "step": 97 }, { "epoch": 0.0005502488051038384, "grad_norm": 1.6535526514053345, "learning_rate": 2.4359497401758024e-06, "loss": 0.6076, "step": 98 }, { "epoch": 0.0005558635888293878, "grad_norm": 0.9868706464767456, "learning_rate": 6.091729809042379e-07, "loss": 0.6062, "step": 99 }, { "epoch": 0.0005614783725549371, "grad_norm": 1.6027213335037231, "learning_rate": 0.0, "loss": 0.56, "step": 100 }, { "epoch": 0.0005614783725549371, "eval_loss": 0.8763332962989807, "eval_runtime": 13589.0012, "eval_samples_per_second": 2.759, "eval_steps_per_second": 2.759, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.70837631753257e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }