|
{ |
|
"best_metric": 1.3523932695388794, |
|
"best_model_checkpoint": "checkpoints/sft_2/checkpoint-2739", |
|
"epoch": 7.504109589041096, |
|
"eval_steps": 913, |
|
"global_step": 2739, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.10136986301369863, |
|
"grad_norm": 18.251895904541016, |
|
"learning_rate": 5.068493150684932e-07, |
|
"loss": 2.6033, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.20273972602739726, |
|
"grad_norm": 18.156850814819336, |
|
"learning_rate": 1.0136986301369864e-06, |
|
"loss": 2.4493, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.3041095890410959, |
|
"grad_norm": 16.538768768310547, |
|
"learning_rate": 1.5205479452054797e-06, |
|
"loss": 2.1375, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.4054794520547945, |
|
"grad_norm": 4.952304840087891, |
|
"learning_rate": 2.027397260273973e-06, |
|
"loss": 1.7107, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.5068493150684932, |
|
"grad_norm": 3.9061036109924316, |
|
"learning_rate": 2.534246575342466e-06, |
|
"loss": 1.5491, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.6082191780821918, |
|
"grad_norm": 4.238989353179932, |
|
"learning_rate": 3.0410958904109593e-06, |
|
"loss": 1.4969, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.7095890410958904, |
|
"grad_norm": 3.786147356033325, |
|
"learning_rate": 3.5479452054794523e-06, |
|
"loss": 1.4779, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.810958904109589, |
|
"grad_norm": 3.699124813079834, |
|
"learning_rate": 4.054794520547946e-06, |
|
"loss": 1.4554, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.9123287671232877, |
|
"grad_norm": 3.877596855163574, |
|
"learning_rate": 4.561643835616439e-06, |
|
"loss": 1.4529, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 1.0136986301369864, |
|
"grad_norm": 5.006152153015137, |
|
"learning_rate": 4.999971418949206e-06, |
|
"loss": 1.4465, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.115068493150685, |
|
"grad_norm": 3.469430923461914, |
|
"learning_rate": 4.997983588332731e-06, |
|
"loss": 1.4214, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 1.2164383561643834, |
|
"grad_norm": 3.827889919281006, |
|
"learning_rate": 4.992868406108372e-06, |
|
"loss": 1.4312, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 1.3178082191780822, |
|
"grad_norm": 3.8270199298858643, |
|
"learning_rate": 4.9846322762306745e-06, |
|
"loss": 1.4214, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 1.4191780821917808, |
|
"grad_norm": 3.366734266281128, |
|
"learning_rate": 4.973285509925916e-06, |
|
"loss": 1.4377, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 1.5205479452054793, |
|
"grad_norm": 3.5708305835723877, |
|
"learning_rate": 4.958842312782962e-06, |
|
"loss": 1.4188, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.621917808219178, |
|
"grad_norm": 3.532292366027832, |
|
"learning_rate": 4.94132076696857e-06, |
|
"loss": 1.4017, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 1.7232876712328768, |
|
"grad_norm": 4.288147449493408, |
|
"learning_rate": 4.920742808589422e-06, |
|
"loss": 1.4026, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 1.8246575342465754, |
|
"grad_norm": 3.560300350189209, |
|
"learning_rate": 4.897134200229196e-06, |
|
"loss": 1.4065, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 1.926027397260274, |
|
"grad_norm": 3.666012763977051, |
|
"learning_rate": 4.870524498695093e-06, |
|
"loss": 1.4041, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 2.0273972602739727, |
|
"grad_norm": 4.007699966430664, |
|
"learning_rate": 4.8409470180141825e-06, |
|
"loss": 1.3843, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.128767123287671, |
|
"grad_norm": 3.6797378063201904, |
|
"learning_rate": 4.808438787725889e-06, |
|
"loss": 1.405, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 2.23013698630137, |
|
"grad_norm": 4.038980960845947, |
|
"learning_rate": 4.773040506522845e-06, |
|
"loss": 1.3826, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 2.3315068493150686, |
|
"grad_norm": 3.8665153980255127, |
|
"learning_rate": 4.734796491298143e-06, |
|
"loss": 1.3791, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 2.432876712328767, |
|
"grad_norm": 4.490111827850342, |
|
"learning_rate": 4.693754621662789e-06, |
|
"loss": 1.3699, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 2.5013698630136987, |
|
"eval_loss": 1.387344479560852, |
|
"eval_runtime": 41.7812, |
|
"eval_samples_per_second": 23.934, |
|
"eval_steps_per_second": 2.992, |
|
"step": 913 |
|
}, |
|
{ |
|
"epoch": 2.5342465753424657, |
|
"grad_norm": 4.025441646575928, |
|
"learning_rate": 4.649966280002798e-06, |
|
"loss": 1.3657, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 2.6356164383561644, |
|
"grad_norm": 4.344162940979004, |
|
"learning_rate": 4.6034862871509954e-06, |
|
"loss": 1.3728, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 2.736986301369863, |
|
"grad_norm": 3.900259494781494, |
|
"learning_rate": 4.5543728337540524e-06, |
|
"loss": 1.3747, |
|
"step": 999 |
|
}, |
|
{ |
|
"epoch": 2.8383561643835615, |
|
"grad_norm": 4.291511058807373, |
|
"learning_rate": 4.502687407420681e-06, |
|
"loss": 1.3742, |
|
"step": 1036 |
|
}, |
|
{ |
|
"epoch": 2.9397260273972603, |
|
"grad_norm": 3.9363627433776855, |
|
"learning_rate": 4.4484947157421985e-06, |
|
"loss": 1.3765, |
|
"step": 1073 |
|
}, |
|
{ |
|
"epoch": 3.041095890410959, |
|
"grad_norm": 4.16279411315918, |
|
"learning_rate": 4.391862605281827e-06, |
|
"loss": 1.3606, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 3.1424657534246574, |
|
"grad_norm": 4.134684085845947, |
|
"learning_rate": 4.332861976634164e-06, |
|
"loss": 1.3434, |
|
"step": 1147 |
|
}, |
|
{ |
|
"epoch": 3.243835616438356, |
|
"grad_norm": 4.688838958740234, |
|
"learning_rate": 4.27156669566115e-06, |
|
"loss": 1.3553, |
|
"step": 1184 |
|
}, |
|
{ |
|
"epoch": 3.345205479452055, |
|
"grad_norm": 4.097309589385986, |
|
"learning_rate": 4.208053501015674e-06, |
|
"loss": 1.3692, |
|
"step": 1221 |
|
}, |
|
{ |
|
"epoch": 3.4465753424657533, |
|
"grad_norm": 4.752201080322266, |
|
"learning_rate": 4.142401908068583e-06, |
|
"loss": 1.3528, |
|
"step": 1258 |
|
}, |
|
{ |
|
"epoch": 3.547945205479452, |
|
"grad_norm": 4.456752777099609, |
|
"learning_rate": 4.0746941093593815e-06, |
|
"loss": 1.3494, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 3.649315068493151, |
|
"grad_norm": 4.169310092926025, |
|
"learning_rate": 4.005014871695243e-06, |
|
"loss": 1.3572, |
|
"step": 1332 |
|
}, |
|
{ |
|
"epoch": 3.750684931506849, |
|
"grad_norm": 4.615598678588867, |
|
"learning_rate": 3.933451430027176e-06, |
|
"loss": 1.3404, |
|
"step": 1369 |
|
}, |
|
{ |
|
"epoch": 3.852054794520548, |
|
"grad_norm": 4.378635406494141, |
|
"learning_rate": 3.8600933782361875e-06, |
|
"loss": 1.3494, |
|
"step": 1406 |
|
}, |
|
{ |
|
"epoch": 3.9534246575342467, |
|
"grad_norm": 4.689962387084961, |
|
"learning_rate": 3.78503255696618e-06, |
|
"loss": 1.3388, |
|
"step": 1443 |
|
}, |
|
{ |
|
"epoch": 4.054794520547945, |
|
"grad_norm": 4.6829071044921875, |
|
"learning_rate": 3.7083629386440304e-06, |
|
"loss": 1.3513, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 4.156164383561644, |
|
"grad_norm": 4.488067626953125, |
|
"learning_rate": 3.6301805098307614e-06, |
|
"loss": 1.3284, |
|
"step": 1517 |
|
}, |
|
{ |
|
"epoch": 4.257534246575342, |
|
"grad_norm": 4.579895496368408, |
|
"learning_rate": 3.5505831510511272e-06, |
|
"loss": 1.3345, |
|
"step": 1554 |
|
}, |
|
{ |
|
"epoch": 4.358904109589041, |
|
"grad_norm": 4.769827365875244, |
|
"learning_rate": 3.4696705142520537e-06, |
|
"loss": 1.3316, |
|
"step": 1591 |
|
}, |
|
{ |
|
"epoch": 4.46027397260274, |
|
"grad_norm": 4.928876876831055, |
|
"learning_rate": 3.3875438980433367e-06, |
|
"loss": 1.3359, |
|
"step": 1628 |
|
}, |
|
{ |
|
"epoch": 4.561643835616438, |
|
"grad_norm": 4.699948310852051, |
|
"learning_rate": 3.3043061208768075e-06, |
|
"loss": 1.3425, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 4.663013698630137, |
|
"grad_norm": 4.816317558288574, |
|
"learning_rate": 3.2200613923227255e-06, |
|
"loss": 1.3236, |
|
"step": 1702 |
|
}, |
|
{ |
|
"epoch": 4.764383561643836, |
|
"grad_norm": 5.371675491333008, |
|
"learning_rate": 3.134915182604566e-06, |
|
"loss": 1.3372, |
|
"step": 1739 |
|
}, |
|
{ |
|
"epoch": 4.865753424657534, |
|
"grad_norm": 5.1571455001831055, |
|
"learning_rate": 3.0489740905555297e-06, |
|
"loss": 1.3102, |
|
"step": 1776 |
|
}, |
|
{ |
|
"epoch": 4.967123287671233, |
|
"grad_norm": 4.910325050354004, |
|
"learning_rate": 2.9623457101620844e-06, |
|
"loss": 1.319, |
|
"step": 1813 |
|
}, |
|
{ |
|
"epoch": 5.002739726027397, |
|
"eval_loss": 1.3639509677886963, |
|
"eval_runtime": 41.8029, |
|
"eval_samples_per_second": 23.922, |
|
"eval_steps_per_second": 2.99, |
|
"step": 1826 |
|
}, |
|
{ |
|
"epoch": 5.068493150684931, |
|
"grad_norm": 5.355637073516846, |
|
"learning_rate": 2.8751384958616318e-06, |
|
"loss": 1.3268, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 5.16986301369863, |
|
"grad_norm": 5.53813362121582, |
|
"learning_rate": 2.787461626762929e-06, |
|
"loss": 1.2971, |
|
"step": 1887 |
|
}, |
|
{ |
|
"epoch": 5.271232876712329, |
|
"grad_norm": 5.353805065155029, |
|
"learning_rate": 2.6994248699592545e-06, |
|
"loss": 1.323, |
|
"step": 1924 |
|
}, |
|
{ |
|
"epoch": 5.372602739726028, |
|
"grad_norm": 5.467613220214844, |
|
"learning_rate": 2.611138443105452e-06, |
|
"loss": 1.2996, |
|
"step": 1961 |
|
}, |
|
{ |
|
"epoch": 5.473972602739726, |
|
"grad_norm": 5.071502685546875, |
|
"learning_rate": 2.5227128764308887e-06, |
|
"loss": 1.3153, |
|
"step": 1998 |
|
}, |
|
{ |
|
"epoch": 5.575342465753424, |
|
"grad_norm": 5.327346324920654, |
|
"learning_rate": 2.4342588743610904e-06, |
|
"loss": 1.3218, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 5.676712328767123, |
|
"grad_norm": 5.512831211090088, |
|
"learning_rate": 2.345887176921286e-06, |
|
"loss": 1.3151, |
|
"step": 2072 |
|
}, |
|
{ |
|
"epoch": 5.778082191780822, |
|
"grad_norm": 5.303826332092285, |
|
"learning_rate": 2.257708421095391e-06, |
|
"loss": 1.31, |
|
"step": 2109 |
|
}, |
|
{ |
|
"epoch": 5.879452054794521, |
|
"grad_norm": 5.505942344665527, |
|
"learning_rate": 2.1698330023139837e-06, |
|
"loss": 1.3062, |
|
"step": 2146 |
|
}, |
|
{ |
|
"epoch": 5.980821917808219, |
|
"grad_norm": 5.343113422393799, |
|
"learning_rate": 2.0823709362447025e-06, |
|
"loss": 1.2966, |
|
"step": 2183 |
|
}, |
|
{ |
|
"epoch": 6.082191780821918, |
|
"grad_norm": 5.3861541748046875, |
|
"learning_rate": 1.995431721058082e-06, |
|
"loss": 1.2911, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 6.183561643835616, |
|
"grad_norm": 5.535419464111328, |
|
"learning_rate": 1.909124200341277e-06, |
|
"loss": 1.3043, |
|
"step": 2257 |
|
}, |
|
{ |
|
"epoch": 6.284931506849315, |
|
"grad_norm": 6.381133556365967, |
|
"learning_rate": 1.82355642683128e-06, |
|
"loss": 1.2911, |
|
"step": 2294 |
|
}, |
|
{ |
|
"epoch": 6.3863013698630136, |
|
"grad_norm": 5.855857849121094, |
|
"learning_rate": 1.7388355271382565e-06, |
|
"loss": 1.2897, |
|
"step": 2331 |
|
}, |
|
{ |
|
"epoch": 6.487671232876712, |
|
"grad_norm": 6.03603982925415, |
|
"learning_rate": 1.6550675676283428e-06, |
|
"loss": 1.2915, |
|
"step": 2368 |
|
}, |
|
{ |
|
"epoch": 6.589041095890411, |
|
"grad_norm": 6.094645977020264, |
|
"learning_rate": 1.5723574216338066e-06, |
|
"loss": 1.3027, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 6.69041095890411, |
|
"grad_norm": 5.70314359664917, |
|
"learning_rate": 1.4908086381568398e-06, |
|
"loss": 1.3001, |
|
"step": 2442 |
|
}, |
|
{ |
|
"epoch": 6.791780821917808, |
|
"grad_norm": 6.103550434112549, |
|
"learning_rate": 1.4105233122313416e-06, |
|
"loss": 1.3084, |
|
"step": 2479 |
|
}, |
|
{ |
|
"epoch": 6.8931506849315065, |
|
"grad_norm": 6.549497127532959, |
|
"learning_rate": 1.331601957104995e-06, |
|
"loss": 1.2865, |
|
"step": 2516 |
|
}, |
|
{ |
|
"epoch": 6.994520547945205, |
|
"grad_norm": 5.893728256225586, |
|
"learning_rate": 1.2541433784016639e-06, |
|
"loss": 1.2883, |
|
"step": 2553 |
|
}, |
|
{ |
|
"epoch": 7.095890410958904, |
|
"grad_norm": 6.355903148651123, |
|
"learning_rate": 1.1782445504216552e-06, |
|
"loss": 1.285, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 7.197260273972603, |
|
"grad_norm": 6.0843729972839355, |
|
"learning_rate": 1.1040004947346974e-06, |
|
"loss": 1.2681, |
|
"step": 2627 |
|
}, |
|
{ |
|
"epoch": 7.298630136986302, |
|
"grad_norm": 6.470739841461182, |
|
"learning_rate": 1.0315041612176476e-06, |
|
"loss": 1.281, |
|
"step": 2664 |
|
}, |
|
{ |
|
"epoch": 7.4, |
|
"grad_norm": 5.7129950523376465, |
|
"learning_rate": 9.608463116858544e-07, |
|
"loss": 1.3001, |
|
"step": 2701 |
|
}, |
|
{ |
|
"epoch": 7.501369863013698, |
|
"grad_norm": 6.322762489318848, |
|
"learning_rate": 8.921154062638679e-07, |
|
"loss": 1.2816, |
|
"step": 2738 |
|
}, |
|
{ |
|
"epoch": 7.504109589041096, |
|
"eval_loss": 1.3523932695388794, |
|
"eval_runtime": 41.8059, |
|
"eval_samples_per_second": 23.92, |
|
"eval_steps_per_second": 2.99, |
|
"step": 2739 |
|
} |
|
], |
|
"logging_steps": 37, |
|
"max_steps": 3650, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 913, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.607336348896461e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|