{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.612048192771084, "eval_steps": 500, "global_step": 343, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0963855421686747, "grad_norm": 1.7634485960006714, "learning_rate": 4.998814299283415e-05, "loss": 0.8996, "num_input_tokens_seen": 78528, "step": 5 }, { "epoch": 0.1927710843373494, "grad_norm": 1.3068124055862427, "learning_rate": 4.995258321842611e-05, "loss": 0.6806, "num_input_tokens_seen": 159120, "step": 10 }, { "epoch": 0.2891566265060241, "grad_norm": 1.2104840278625488, "learning_rate": 4.989335440737586e-05, "loss": 0.618, "num_input_tokens_seen": 223552, "step": 15 }, { "epoch": 0.3855421686746988, "grad_norm": 1.4112542867660522, "learning_rate": 4.98105127417984e-05, "loss": 0.5594, "num_input_tokens_seen": 290944, "step": 20 }, { "epoch": 0.4819277108433735, "grad_norm": 0.9026587605476379, "learning_rate": 4.9704136802031485e-05, "loss": 0.5253, "num_input_tokens_seen": 364064, "step": 25 }, { "epoch": 0.5783132530120482, "grad_norm": 0.9427546858787537, "learning_rate": 4.957432749209755e-05, "loss": 0.4794, "num_input_tokens_seen": 440176, "step": 30 }, { "epoch": 0.6746987951807228, "grad_norm": 1.0594468116760254, "learning_rate": 4.942120794399002e-05, "loss": 0.4546, "num_input_tokens_seen": 517184, "step": 35 }, { "epoch": 0.7710843373493976, "grad_norm": 0.9458279013633728, "learning_rate": 4.9244923400875245e-05, "loss": 0.4703, "num_input_tokens_seen": 591424, "step": 40 }, { "epoch": 0.8674698795180723, "grad_norm": 1.1610336303710938, "learning_rate": 4.9045641079320484e-05, "loss": 0.4407, "num_input_tokens_seen": 662784, "step": 45 }, { "epoch": 0.963855421686747, "grad_norm": 1.0153354406356812, "learning_rate": 4.882355001067892e-05, "loss": 0.4425, "num_input_tokens_seen": 734784, "step": 50 }, { "epoch": 1.0602409638554218, "grad_norm": 1.0889695882797241, "learning_rate": 4.857886086178194e-05, "loss": 0.4081, "num_input_tokens_seen": 808336, "step": 55 }, { "epoch": 1.1566265060240963, "grad_norm": 0.9168598055839539, "learning_rate": 4.8311805735108894e-05, "loss": 0.4002, "num_input_tokens_seen": 882672, "step": 60 }, { "epoch": 1.2530120481927711, "grad_norm": 0.8168660998344421, "learning_rate": 4.802263794862385e-05, "loss": 0.3587, "num_input_tokens_seen": 947680, "step": 65 }, { "epoch": 1.3493975903614457, "grad_norm": 1.0652003288269043, "learning_rate": 4.7711631795488096e-05, "loss": 0.356, "num_input_tokens_seen": 1022112, "step": 70 }, { "epoch": 1.4457831325301205, "grad_norm": 1.1781517267227173, "learning_rate": 4.7379082283876566e-05, "loss": 0.3639, "num_input_tokens_seen": 1091744, "step": 75 }, { "epoch": 1.5421686746987953, "grad_norm": 1.0550976991653442, "learning_rate": 4.702530485714461e-05, "loss": 0.3288, "num_input_tokens_seen": 1163728, "step": 80 }, { "epoch": 1.6385542168674698, "grad_norm": 1.3946661949157715, "learning_rate": 4.665063509461097e-05, "loss": 0.3563, "num_input_tokens_seen": 1245728, "step": 85 }, { "epoch": 1.7349397590361446, "grad_norm": 1.1458536386489868, "learning_rate": 4.625542839324036e-05, "loss": 0.3642, "num_input_tokens_seen": 1315056, "step": 90 }, { "epoch": 1.8313253012048194, "grad_norm": 1.0227209329605103, "learning_rate": 4.584005963052799e-05, "loss": 0.3407, "num_input_tokens_seen": 1392224, "step": 95 }, { "epoch": 1.927710843373494, "grad_norm": 1.0699985027313232, "learning_rate": 4.540492280890555e-05, "loss": 0.3216, "num_input_tokens_seen": 1471008, "step": 100 }, { "epoch": 2.0240963855421685, "grad_norm": 0.8573477268218994, "learning_rate": 4.4950430682006e-05, "loss": 0.3197, "num_input_tokens_seen": 1546912, "step": 105 }, { "epoch": 2.1204819277108435, "grad_norm": 1.1516242027282715, "learning_rate": 4.447701436314176e-05, "loss": 0.2904, "num_input_tokens_seen": 1611328, "step": 110 }, { "epoch": 2.216867469879518, "grad_norm": 1.0890793800354004, "learning_rate": 4.398512291636768e-05, "loss": 0.2498, "num_input_tokens_seen": 1682528, "step": 115 }, { "epoch": 2.3132530120481927, "grad_norm": 1.3621636629104614, "learning_rate": 4.347522293051648e-05, "loss": 0.269, "num_input_tokens_seen": 1751856, "step": 120 }, { "epoch": 2.4096385542168672, "grad_norm": 1.338083028793335, "learning_rate": 4.294779807661105e-05, "loss": 0.2838, "num_input_tokens_seen": 1830288, "step": 125 }, { "epoch": 2.5060240963855422, "grad_norm": 1.2083592414855957, "learning_rate": 4.2403348649073174e-05, "loss": 0.2466, "num_input_tokens_seen": 1905296, "step": 130 }, { "epoch": 2.602409638554217, "grad_norm": 1.35024094581604, "learning_rate": 4.184239109116393e-05, "loss": 0.2272, "num_input_tokens_seen": 1974464, "step": 135 }, { "epoch": 2.6987951807228914, "grad_norm": 1.3738912343978882, "learning_rate": 4.126545750510605e-05, "loss": 0.2484, "num_input_tokens_seen": 2058176, "step": 140 }, { "epoch": 2.7951807228915664, "grad_norm": 1.5877448320388794, "learning_rate": 4.067309514735267e-05, "loss": 0.2339, "num_input_tokens_seen": 2124912, "step": 145 }, { "epoch": 2.891566265060241, "grad_norm": 1.3735121488571167, "learning_rate": 4.0065865909481417e-05, "loss": 0.2597, "num_input_tokens_seen": 2213456, "step": 150 }, { "epoch": 2.9879518072289155, "grad_norm": 1.6480368375778198, "learning_rate": 3.9444345785206285e-05, "loss": 0.2525, "num_input_tokens_seen": 2281680, "step": 155 }, { "epoch": 3.0843373493975905, "grad_norm": 1.2931358814239502, "learning_rate": 3.880912432401265e-05, "loss": 0.1832, "num_input_tokens_seen": 2349408, "step": 160 }, { "epoch": 3.180722891566265, "grad_norm": 1.4131468534469604, "learning_rate": 3.81608040719339e-05, "loss": 0.1519, "num_input_tokens_seen": 2425456, "step": 165 }, { "epoch": 3.2771084337349397, "grad_norm": 1.6228159666061401, "learning_rate": 3.7500000000000003e-05, "loss": 0.1707, "num_input_tokens_seen": 2494064, "step": 170 }, { "epoch": 3.3734939759036147, "grad_norm": 1.1356842517852783, "learning_rate": 3.6827338920900254e-05, "loss": 0.1603, "num_input_tokens_seen": 2573616, "step": 175 }, { "epoch": 3.4698795180722892, "grad_norm": 1.3535553216934204, "learning_rate": 3.6143458894413465e-05, "loss": 0.1683, "num_input_tokens_seen": 2657744, "step": 180 }, { "epoch": 3.566265060240964, "grad_norm": 1.3832409381866455, "learning_rate": 3.544900862216959e-05, "loss": 0.1734, "num_input_tokens_seen": 2721200, "step": 185 }, { "epoch": 3.662650602409639, "grad_norm": 1.6430705785751343, "learning_rate": 3.474464683231698e-05, "loss": 0.1543, "num_input_tokens_seen": 2798320, "step": 190 }, { "epoch": 3.7590361445783134, "grad_norm": 1.7706836462020874, "learning_rate": 3.403104165467883e-05, "loss": 0.1601, "num_input_tokens_seen": 2879200, "step": 195 }, { "epoch": 3.855421686746988, "grad_norm": 1.7721610069274902, "learning_rate": 3.330886998699149e-05, "loss": 0.1911, "num_input_tokens_seen": 2947024, "step": 200 }, { "epoch": 3.9518072289156625, "grad_norm": 1.666278600692749, "learning_rate": 3.257881685282609e-05, "loss": 0.1741, "num_input_tokens_seen": 3016656, "step": 205 }, { "epoch": 4.048192771084337, "grad_norm": 1.099639892578125, "learning_rate": 3.1841574751802076e-05, "loss": 0.1334, "num_input_tokens_seen": 3084416, "step": 210 }, { "epoch": 4.144578313253012, "grad_norm": 1.5020925998687744, "learning_rate": 3.109784300270943e-05, "loss": 0.1027, "num_input_tokens_seen": 3166784, "step": 215 }, { "epoch": 4.240963855421687, "grad_norm": 2.203794240951538, "learning_rate": 3.0348327080162435e-05, "loss": 0.0955, "num_input_tokens_seen": 3239584, "step": 220 }, { "epoch": 4.337349397590361, "grad_norm": 1.7183223962783813, "learning_rate": 2.9593737945414264e-05, "loss": 0.1006, "num_input_tokens_seen": 3313360, "step": 225 }, { "epoch": 4.433734939759036, "grad_norm": 1.4102908372879028, "learning_rate": 2.8834791371967142e-05, "loss": 0.1007, "num_input_tokens_seen": 3377840, "step": 230 }, { "epoch": 4.530120481927711, "grad_norm": 1.214020013809204, "learning_rate": 2.8072207266617855e-05, "loss": 0.1033, "num_input_tokens_seen": 3455904, "step": 235 }, { "epoch": 4.626506024096385, "grad_norm": 1.5255635976791382, "learning_rate": 2.7306708986582553e-05, "loss": 0.1023, "num_input_tokens_seen": 3529360, "step": 240 }, { "epoch": 4.72289156626506, "grad_norm": 1.6624009609222412, "learning_rate": 2.653902265334858e-05, "loss": 0.1121, "num_input_tokens_seen": 3605344, "step": 245 }, { "epoch": 4.8192771084337345, "grad_norm": 1.7999521493911743, "learning_rate": 2.5769876463904265e-05, "loss": 0.1028, "num_input_tokens_seen": 3678352, "step": 250 }, { "epoch": 4.9156626506024095, "grad_norm": 2.1297786235809326, "learning_rate": 2.5e-05, "loss": 0.1055, "num_input_tokens_seen": 3752608, "step": 255 }, { "epoch": 5.0120481927710845, "grad_norm": 1.215146780014038, "learning_rate": 2.4230123536095748e-05, "loss": 0.1037, "num_input_tokens_seen": 3819744, "step": 260 }, { "epoch": 5.108433734939759, "grad_norm": 1.448801040649414, "learning_rate": 2.346097734665143e-05, "loss": 0.0633, "num_input_tokens_seen": 3896592, "step": 265 }, { "epoch": 5.204819277108434, "grad_norm": 1.220989465713501, "learning_rate": 2.2693291013417453e-05, "loss": 0.0521, "num_input_tokens_seen": 3970976, "step": 270 }, { "epoch": 5.301204819277109, "grad_norm": 1.3077821731567383, "learning_rate": 2.192779273338215e-05, "loss": 0.0625, "num_input_tokens_seen": 4051760, "step": 275 }, { "epoch": 5.397590361445783, "grad_norm": 2.02695369720459, "learning_rate": 2.116520862803286e-05, "loss": 0.059, "num_input_tokens_seen": 4124096, "step": 280 }, { "epoch": 5.493975903614458, "grad_norm": 1.6377320289611816, "learning_rate": 2.0406262054585738e-05, "loss": 0.0648, "num_input_tokens_seen": 4188448, "step": 285 }, { "epoch": 5.590361445783133, "grad_norm": 1.6187361478805542, "learning_rate": 1.965167291983757e-05, "loss": 0.0709, "num_input_tokens_seen": 4261056, "step": 290 }, { "epoch": 5.686746987951807, "grad_norm": 1.4855268001556396, "learning_rate": 1.890215699729057e-05, "loss": 0.0641, "num_input_tokens_seen": 4329024, "step": 295 }, { "epoch": 5.783132530120482, "grad_norm": 1.4216831922531128, "learning_rate": 1.815842524819793e-05, "loss": 0.0689, "num_input_tokens_seen": 4406624, "step": 300 }, { "epoch": 5.879518072289157, "grad_norm": 1.7383759021759033, "learning_rate": 1.7421183147173915e-05, "loss": 0.055, "num_input_tokens_seen": 4480352, "step": 305 }, { "epoch": 5.975903614457831, "grad_norm": 1.5599803924560547, "learning_rate": 1.6691130013008514e-05, "loss": 0.0626, "num_input_tokens_seen": 4554080, "step": 310 }, { "epoch": 6.072289156626506, "grad_norm": 1.028124213218689, "learning_rate": 1.5968958345321178e-05, "loss": 0.0465, "num_input_tokens_seen": 4628576, "step": 315 }, { "epoch": 6.168674698795181, "grad_norm": 1.4686311483383179, "learning_rate": 1.5255353167683017e-05, "loss": 0.0421, "num_input_tokens_seen": 4704512, "step": 320 }, { "epoch": 6.265060240963855, "grad_norm": 1.1644634008407593, "learning_rate": 1.4550991377830426e-05, "loss": 0.0303, "num_input_tokens_seen": 4776912, "step": 325 }, { "epoch": 6.36144578313253, "grad_norm": 1.090997338294983, "learning_rate": 1.3856541105586545e-05, "loss": 0.0337, "num_input_tokens_seen": 4855600, "step": 330 }, { "epoch": 6.457831325301205, "grad_norm": 1.4336110353469849, "learning_rate": 1.3172661079099752e-05, "loss": 0.0333, "num_input_tokens_seen": 4927600, "step": 335 }, { "epoch": 6.554216867469879, "grad_norm": 1.3488271236419678, "learning_rate": 1.2500000000000006e-05, "loss": 0.039, "num_input_tokens_seen": 5003184, "step": 340 }, { "epoch": 6.612048192771084, "num_input_tokens_seen": 5052448, "step": 343, "total_flos": 2.28822660837802e+17, "train_loss": 0.2300173058541106, "train_runtime": 12675.9679, "train_samples_per_second": 0.655, "train_steps_per_second": 0.04 } ], "logging_steps": 5, "max_steps": 510, "num_input_tokens_seen": 5052448, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.28822660837802e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }