{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.25252525252525254, "eval_steps": 38, "global_step": 75, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003367003367003367, "grad_norm": 2.0, "learning_rate": 3.5714285714285716e-07, "loss": 1.0833, "step": 1 }, { "epoch": 0.003367003367003367, "eval_loss": 1.032954454421997, "eval_runtime": 8.7744, "eval_samples_per_second": 56.984, "eval_steps_per_second": 3.647, "step": 1 }, { "epoch": 0.006734006734006734, "grad_norm": 2.265625, "learning_rate": 7.142857142857143e-07, "loss": 0.996, "step": 2 }, { "epoch": 0.010101010101010102, "grad_norm": 1.640625, "learning_rate": 1.0714285714285714e-06, "loss": 1.0487, "step": 3 }, { "epoch": 0.013468013468013467, "grad_norm": 2.046875, "learning_rate": 1.4285714285714286e-06, "loss": 1.0553, "step": 4 }, { "epoch": 0.016835016835016835, "grad_norm": 2.453125, "learning_rate": 1.7857142857142859e-06, "loss": 1.0339, "step": 5 }, { "epoch": 0.020202020202020204, "grad_norm": 1.984375, "learning_rate": 2.1428571428571427e-06, "loss": 1.014, "step": 6 }, { "epoch": 0.02356902356902357, "grad_norm": 2.609375, "learning_rate": 2.5e-06, "loss": 1.0298, "step": 7 }, { "epoch": 0.026936026936026935, "grad_norm": 2.328125, "learning_rate": 2.8571428571428573e-06, "loss": 1.0226, "step": 8 }, { "epoch": 0.030303030303030304, "grad_norm": 2.25, "learning_rate": 3.2142857142857147e-06, "loss": 1.0739, "step": 9 }, { "epoch": 0.03367003367003367, "grad_norm": 2.28125, "learning_rate": 3.5714285714285718e-06, "loss": 1.0224, "step": 10 }, { "epoch": 0.037037037037037035, "grad_norm": 2.4375, "learning_rate": 3.928571428571429e-06, "loss": 1.0089, "step": 11 }, { "epoch": 0.04040404040404041, "grad_norm": 2.03125, "learning_rate": 4.2857142857142855e-06, "loss": 1.0529, "step": 12 }, { "epoch": 0.04377104377104377, "grad_norm": 2.03125, "learning_rate": 4.642857142857144e-06, "loss": 1.068, "step": 13 }, { "epoch": 0.04713804713804714, "grad_norm": 1.7265625, "learning_rate": 5e-06, "loss": 1.0318, "step": 14 }, { "epoch": 0.050505050505050504, "grad_norm": 1.8359375, "learning_rate": 4.9998459603839726e-06, "loss": 1.0439, "step": 15 }, { "epoch": 0.05387205387205387, "grad_norm": 2.15625, "learning_rate": 4.9993838605184505e-06, "loss": 1.0647, "step": 16 }, { "epoch": 0.05723905723905724, "grad_norm": 1.6015625, "learning_rate": 4.998613757348784e-06, "loss": 1.0081, "step": 17 }, { "epoch": 0.06060606060606061, "grad_norm": 1.65625, "learning_rate": 4.99753574577609e-06, "loss": 1.0098, "step": 18 }, { "epoch": 0.06397306397306397, "grad_norm": 1.4140625, "learning_rate": 4.996149958645559e-06, "loss": 0.9822, "step": 19 }, { "epoch": 0.06734006734006734, "grad_norm": 1.375, "learning_rate": 4.994456566730085e-06, "loss": 1.0474, "step": 20 }, { "epoch": 0.0707070707070707, "grad_norm": 1.2265625, "learning_rate": 4.992455778709222e-06, "loss": 1.0421, "step": 21 }, { "epoch": 0.07407407407407407, "grad_norm": 1.4609375, "learning_rate": 4.990147841143462e-06, "loss": 0.9845, "step": 22 }, { "epoch": 0.07744107744107744, "grad_norm": 1.2421875, "learning_rate": 4.98753303844386e-06, "loss": 0.9773, "step": 23 }, { "epoch": 0.08080808080808081, "grad_norm": 1.4296875, "learning_rate": 4.984611692836979e-06, "loss": 1.0511, "step": 24 }, { "epoch": 0.08417508417508418, "grad_norm": 1.1875, "learning_rate": 4.981384164325184e-06, "loss": 0.9915, "step": 25 }, { "epoch": 0.08754208754208755, "grad_norm": 1.15625, "learning_rate": 4.977850850642275e-06, "loss": 0.9881, "step": 26 }, { "epoch": 0.09090909090909091, "grad_norm": 1.296875, "learning_rate": 4.97401218720448e-06, "loss": 1.0623, "step": 27 }, { "epoch": 0.09427609427609428, "grad_norm": 1.2734375, "learning_rate": 4.969868647056793e-06, "loss": 1.0282, "step": 28 }, { "epoch": 0.09764309764309764, "grad_norm": 1.125, "learning_rate": 4.965420740814679e-06, "loss": 1.0122, "step": 29 }, { "epoch": 0.10101010101010101, "grad_norm": 1.25, "learning_rate": 4.960669016601155e-06, "loss": 1.0342, "step": 30 }, { "epoch": 0.10437710437710437, "grad_norm": 1.0625, "learning_rate": 4.95561405997924e-06, "loss": 0.9911, "step": 31 }, { "epoch": 0.10774410774410774, "grad_norm": 1.21875, "learning_rate": 4.950256493879795e-06, "loss": 1.025, "step": 32 }, { "epoch": 0.1111111111111111, "grad_norm": 1.1328125, "learning_rate": 4.94459697852476e-06, "loss": 1.022, "step": 33 }, { "epoch": 0.11447811447811448, "grad_norm": 1.1953125, "learning_rate": 4.938636211345792e-06, "loss": 1.0085, "step": 34 }, { "epoch": 0.11784511784511785, "grad_norm": 1.1484375, "learning_rate": 4.932374926898321e-06, "loss": 1.0109, "step": 35 }, { "epoch": 0.12121212121212122, "grad_norm": 1.0390625, "learning_rate": 4.92581389677103e-06, "loss": 0.9654, "step": 36 }, { "epoch": 0.12457912457912458, "grad_norm": 1.109375, "learning_rate": 4.918953929490768e-06, "loss": 1.0041, "step": 37 }, { "epoch": 0.12794612794612795, "grad_norm": 1.09375, "learning_rate": 4.911795870422916e-06, "loss": 1.0118, "step": 38 }, { "epoch": 0.12794612794612795, "eval_loss": 0.9947459697723389, "eval_runtime": 8.7184, "eval_samples_per_second": 57.35, "eval_steps_per_second": 3.67, "step": 38 }, { "epoch": 0.13131313131313133, "grad_norm": 0.9765625, "learning_rate": 4.904340601667208e-06, "loss": 0.9841, "step": 39 }, { "epoch": 0.13468013468013468, "grad_norm": 1.046875, "learning_rate": 4.896589041949036e-06, "loss": 1.0055, "step": 40 }, { "epoch": 0.13804713804713806, "grad_norm": 1.171875, "learning_rate": 4.888542146506224e-06, "loss": 0.9728, "step": 41 }, { "epoch": 0.1414141414141414, "grad_norm": 0.9765625, "learning_rate": 4.880200906971321e-06, "loss": 0.975, "step": 42 }, { "epoch": 0.1447811447811448, "grad_norm": 1.0234375, "learning_rate": 4.8715663512493924e-06, "loss": 0.9628, "step": 43 }, { "epoch": 0.14814814814814814, "grad_norm": 0.92578125, "learning_rate": 4.8626395433913595e-06, "loss": 0.9668, "step": 44 }, { "epoch": 0.15151515151515152, "grad_norm": 1.078125, "learning_rate": 4.853421583462866e-06, "loss": 0.9539, "step": 45 }, { "epoch": 0.15488215488215487, "grad_norm": 1.1484375, "learning_rate": 4.8439136074087165e-06, "loss": 1.0145, "step": 46 }, { "epoch": 0.15824915824915825, "grad_norm": 1.015625, "learning_rate": 4.834116786912897e-06, "loss": 0.9738, "step": 47 }, { "epoch": 0.16161616161616163, "grad_norm": 1.0390625, "learning_rate": 4.82403232925418e-06, "loss": 0.9907, "step": 48 }, { "epoch": 0.16498316498316498, "grad_norm": 1.0546875, "learning_rate": 4.813661477157355e-06, "loss": 0.9773, "step": 49 }, { "epoch": 0.16835016835016836, "grad_norm": 1.015625, "learning_rate": 4.803005508640083e-06, "loss": 0.9636, "step": 50 }, { "epoch": 0.1717171717171717, "grad_norm": 0.9375, "learning_rate": 4.7920657368554e-06, "loss": 0.9589, "step": 51 }, { "epoch": 0.1750841750841751, "grad_norm": 0.9765625, "learning_rate": 4.780843509929905e-06, "loss": 0.9753, "step": 52 }, { "epoch": 0.17845117845117844, "grad_norm": 1.015625, "learning_rate": 4.769340210797618e-06, "loss": 0.9324, "step": 53 }, { "epoch": 0.18181818181818182, "grad_norm": 0.96875, "learning_rate": 4.757557257029563e-06, "loss": 0.9439, "step": 54 }, { "epoch": 0.18518518518518517, "grad_norm": 1.0078125, "learning_rate": 4.745496100659083e-06, "loss": 0.9925, "step": 55 }, { "epoch": 0.18855218855218855, "grad_norm": 0.8828125, "learning_rate": 4.733158228002891e-06, "loss": 0.9294, "step": 56 }, { "epoch": 0.1919191919191919, "grad_norm": 0.95703125, "learning_rate": 4.720545159477921e-06, "loss": 0.9502, "step": 57 }, { "epoch": 0.19528619528619529, "grad_norm": 0.90234375, "learning_rate": 4.707658449413961e-06, "loss": 0.9538, "step": 58 }, { "epoch": 0.19865319865319866, "grad_norm": 1.03125, "learning_rate": 4.694499685862106e-06, "loss": 0.9484, "step": 59 }, { "epoch": 0.20202020202020202, "grad_norm": 1.0390625, "learning_rate": 4.681070490399064e-06, "loss": 0.9515, "step": 60 }, { "epoch": 0.2053872053872054, "grad_norm": 0.921875, "learning_rate": 4.667372517927323e-06, "loss": 0.956, "step": 61 }, { "epoch": 0.20875420875420875, "grad_norm": 0.88671875, "learning_rate": 4.653407456471222e-06, "loss": 0.9478, "step": 62 }, { "epoch": 0.21212121212121213, "grad_norm": 1.0390625, "learning_rate": 4.639177026968924e-06, "loss": 0.9427, "step": 63 }, { "epoch": 0.21548821548821548, "grad_norm": 0.9296875, "learning_rate": 4.624682983060346e-06, "loss": 1.0028, "step": 64 }, { "epoch": 0.21885521885521886, "grad_norm": 0.8828125, "learning_rate": 4.609927110871053e-06, "loss": 0.9442, "step": 65 }, { "epoch": 0.2222222222222222, "grad_norm": 0.890625, "learning_rate": 4.594911228792156e-06, "loss": 0.9695, "step": 66 }, { "epoch": 0.2255892255892256, "grad_norm": 0.94140625, "learning_rate": 4.579637187256222e-06, "loss": 0.9407, "step": 67 }, { "epoch": 0.22895622895622897, "grad_norm": 0.9609375, "learning_rate": 4.564106868509246e-06, "loss": 1.0014, "step": 68 }, { "epoch": 0.23232323232323232, "grad_norm": 0.9140625, "learning_rate": 4.5483221863786965e-06, "loss": 0.9249, "step": 69 }, { "epoch": 0.2356902356902357, "grad_norm": 0.83984375, "learning_rate": 4.5322850860376744e-06, "loss": 0.9623, "step": 70 }, { "epoch": 0.23905723905723905, "grad_norm": 0.9765625, "learning_rate": 4.515997543765202e-06, "loss": 0.9321, "step": 71 }, { "epoch": 0.24242424242424243, "grad_norm": 0.94921875, "learning_rate": 4.499461566702685e-06, "loss": 0.9407, "step": 72 }, { "epoch": 0.24579124579124578, "grad_norm": 0.86328125, "learning_rate": 4.48267919260657e-06, "loss": 0.9081, "step": 73 }, { "epoch": 0.24915824915824916, "grad_norm": 0.87890625, "learning_rate": 4.465652489597226e-06, "loss": 0.9259, "step": 74 }, { "epoch": 0.25252525252525254, "grad_norm": 0.984375, "learning_rate": 4.4483835559040885e-06, "loss": 0.9386, "step": 75 } ], "logging_steps": 1, "max_steps": 297, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 75, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.36437672443904e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }