{
"best_metric": 1.3523932695388794,
"best_model_checkpoint": "checkpoints/sft_2/checkpoint-2739",
"epoch": 7.504109589041096,
"eval_steps": 913,
"global_step": 2739,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.10136986301369863,
"grad_norm": 18.251895904541016,
"learning_rate": 5.068493150684932e-07,
"loss": 2.6033,
"step": 37
},
{
"epoch": 0.20273972602739726,
"grad_norm": 18.156850814819336,
"learning_rate": 1.0136986301369864e-06,
"loss": 2.4493,
"step": 74
},
{
"epoch": 0.3041095890410959,
"grad_norm": 16.538768768310547,
"learning_rate": 1.5205479452054797e-06,
"loss": 2.1375,
"step": 111
},
{
"epoch": 0.4054794520547945,
"grad_norm": 4.952304840087891,
"learning_rate": 2.027397260273973e-06,
"loss": 1.7107,
"step": 148
},
{
"epoch": 0.5068493150684932,
"grad_norm": 3.9061036109924316,
"learning_rate": 2.534246575342466e-06,
"loss": 1.5491,
"step": 185
},
{
"epoch": 0.6082191780821918,
"grad_norm": 4.238989353179932,
"learning_rate": 3.0410958904109593e-06,
"loss": 1.4969,
"step": 222
},
{
"epoch": 0.7095890410958904,
"grad_norm": 3.786147356033325,
"learning_rate": 3.5479452054794523e-06,
"loss": 1.4779,
"step": 259
},
{
"epoch": 0.810958904109589,
"grad_norm": 3.699124813079834,
"learning_rate": 4.054794520547946e-06,
"loss": 1.4554,
"step": 296
},
{
"epoch": 0.9123287671232877,
"grad_norm": 3.877596855163574,
"learning_rate": 4.561643835616439e-06,
"loss": 1.4529,
"step": 333
},
{
"epoch": 1.0136986301369864,
"grad_norm": 5.006152153015137,
"learning_rate": 4.999971418949206e-06,
"loss": 1.4465,
"step": 370
},
{
"epoch": 1.115068493150685,
"grad_norm": 3.469430923461914,
"learning_rate": 4.997983588332731e-06,
"loss": 1.4214,
"step": 407
},
{
"epoch": 1.2164383561643834,
"grad_norm": 3.827889919281006,
"learning_rate": 4.992868406108372e-06,
"loss": 1.4312,
"step": 444
},
{
"epoch": 1.3178082191780822,
"grad_norm": 3.8270199298858643,
"learning_rate": 4.9846322762306745e-06,
"loss": 1.4214,
"step": 481
},
{
"epoch": 1.4191780821917808,
"grad_norm": 3.366734266281128,
"learning_rate": 4.973285509925916e-06,
"loss": 1.4377,
"step": 518
},
{
"epoch": 1.5205479452054793,
"grad_norm": 3.5708305835723877,
"learning_rate": 4.958842312782962e-06,
"loss": 1.4188,
"step": 555
},
{
"epoch": 1.621917808219178,
"grad_norm": 3.532292366027832,
"learning_rate": 4.94132076696857e-06,
"loss": 1.4017,
"step": 592
},
{
"epoch": 1.7232876712328768,
"grad_norm": 4.288147449493408,
"learning_rate": 4.920742808589422e-06,
"loss": 1.4026,
"step": 629
},
{
"epoch": 1.8246575342465754,
"grad_norm": 3.560300350189209,
"learning_rate": 4.897134200229196e-06,
"loss": 1.4065,
"step": 666
},
{
"epoch": 1.926027397260274,
"grad_norm": 3.666012763977051,
"learning_rate": 4.870524498695093e-06,
"loss": 1.4041,
"step": 703
},
{
"epoch": 2.0273972602739727,
"grad_norm": 4.007699966430664,
"learning_rate": 4.8409470180141825e-06,
"loss": 1.3843,
"step": 740
},
{
"epoch": 2.128767123287671,
"grad_norm": 3.6797378063201904,
"learning_rate": 4.808438787725889e-06,
"loss": 1.405,
"step": 777
},
{
"epoch": 2.23013698630137,
"grad_norm": 4.038980960845947,
"learning_rate": 4.773040506522845e-06,
"loss": 1.3826,
"step": 814
},
{
"epoch": 2.3315068493150686,
"grad_norm": 3.8665153980255127,
"learning_rate": 4.734796491298143e-06,
"loss": 1.3791,
"step": 851
},
{
"epoch": 2.432876712328767,
"grad_norm": 4.490111827850342,
"learning_rate": 4.693754621662789e-06,
"loss": 1.3699,
"step": 888
},
{
"epoch": 2.5013698630136987,
"eval_loss": 1.387344479560852,
"eval_runtime": 41.7812,
"eval_samples_per_second": 23.934,
"eval_steps_per_second": 2.992,
"step": 913
},
{
"epoch": 2.5342465753424657,
"grad_norm": 4.025441646575928,
"learning_rate": 4.649966280002798e-06,
"loss": 1.3657,
"step": 925
},
{
"epoch": 2.6356164383561644,
"grad_norm": 4.344162940979004,
"learning_rate": 4.6034862871509954e-06,
"loss": 1.3728,
"step": 962
},
{
"epoch": 2.736986301369863,
"grad_norm": 3.900259494781494,
"learning_rate": 4.5543728337540524e-06,
"loss": 1.3747,
"step": 999
},
{
"epoch": 2.8383561643835615,
"grad_norm": 4.291511058807373,
"learning_rate": 4.502687407420681e-06,
"loss": 1.3742,
"step": 1036
},
{
"epoch": 2.9397260273972603,
"grad_norm": 3.9363627433776855,
"learning_rate": 4.4484947157421985e-06,
"loss": 1.3765,
"step": 1073
},
{
"epoch": 3.041095890410959,
"grad_norm": 4.16279411315918,
"learning_rate": 4.391862605281827e-06,
"loss": 1.3606,
"step": 1110
},
{
"epoch": 3.1424657534246574,
"grad_norm": 4.134684085845947,
"learning_rate": 4.332861976634164e-06,
"loss": 1.3434,
"step": 1147
},
{
"epoch": 3.243835616438356,
"grad_norm": 4.688838958740234,
"learning_rate": 4.27156669566115e-06,
"loss": 1.3553,
"step": 1184
},
{
"epoch": 3.345205479452055,
"grad_norm": 4.097309589385986,
"learning_rate": 4.208053501015674e-06,
"loss": 1.3692,
"step": 1221
},
{
"epoch": 3.4465753424657533,
"grad_norm": 4.752201080322266,
"learning_rate": 4.142401908068583e-06,
"loss": 1.3528,
"step": 1258
},
{
"epoch": 3.547945205479452,
"grad_norm": 4.456752777099609,
"learning_rate": 4.0746941093593815e-06,
"loss": 1.3494,
"step": 1295
},
{
"epoch": 3.649315068493151,
"grad_norm": 4.169310092926025,
"learning_rate": 4.005014871695243e-06,
"loss": 1.3572,
"step": 1332
},
{
"epoch": 3.750684931506849,
"grad_norm": 4.615598678588867,
"learning_rate": 3.933451430027176e-06,
"loss": 1.3404,
"step": 1369
},
{
"epoch": 3.852054794520548,
"grad_norm": 4.378635406494141,
"learning_rate": 3.8600933782361875e-06,
"loss": 1.3494,
"step": 1406
},
{
"epoch": 3.9534246575342467,
"grad_norm": 4.689962387084961,
"learning_rate": 3.78503255696618e-06,
"loss": 1.3388,
"step": 1443
},
{
"epoch": 4.054794520547945,
"grad_norm": 4.6829071044921875,
"learning_rate": 3.7083629386440304e-06,
"loss": 1.3513,
"step": 1480
},
{
"epoch": 4.156164383561644,
"grad_norm": 4.488067626953125,
"learning_rate": 3.6301805098307614e-06,
"loss": 1.3284,
"step": 1517
},
{
"epoch": 4.257534246575342,
"grad_norm": 4.579895496368408,
"learning_rate": 3.5505831510511272e-06,
"loss": 1.3345,
"step": 1554
},
{
"epoch": 4.358904109589041,
"grad_norm": 4.769827365875244,
"learning_rate": 3.4696705142520537e-06,
"loss": 1.3316,
"step": 1591
},
{
"epoch": 4.46027397260274,
"grad_norm": 4.928876876831055,
"learning_rate": 3.3875438980433367e-06,
"loss": 1.3359,
"step": 1628
},
{
"epoch": 4.561643835616438,
"grad_norm": 4.699948310852051,
"learning_rate": 3.3043061208768075e-06,
"loss": 1.3425,
"step": 1665
},
{
"epoch": 4.663013698630137,
"grad_norm": 4.816317558288574,
"learning_rate": 3.2200613923227255e-06,
"loss": 1.3236,
"step": 1702
},
{
"epoch": 4.764383561643836,
"grad_norm": 5.371675491333008,
"learning_rate": 3.134915182604566e-06,
"loss": 1.3372,
"step": 1739
},
{
"epoch": 4.865753424657534,
"grad_norm": 5.1571455001831055,
"learning_rate": 3.0489740905555297e-06,
"loss": 1.3102,
"step": 1776
},
{
"epoch": 4.967123287671233,
"grad_norm": 4.910325050354004,
"learning_rate": 2.9623457101620844e-06,
"loss": 1.319,
"step": 1813
},
{
"epoch": 5.002739726027397,
"eval_loss": 1.3639509677886963,
"eval_runtime": 41.8029,
"eval_samples_per_second": 23.922,
"eval_steps_per_second": 2.99,
"step": 1826
},
{
"epoch": 5.068493150684931,
"grad_norm": 5.355637073516846,
"learning_rate": 2.8751384958616318e-06,
"loss": 1.3268,
"step": 1850
},
{
"epoch": 5.16986301369863,
"grad_norm": 5.53813362121582,
"learning_rate": 2.787461626762929e-06,
"loss": 1.2971,
"step": 1887
},
{
"epoch": 5.271232876712329,
"grad_norm": 5.353805065155029,
"learning_rate": 2.6994248699592545e-06,
"loss": 1.323,
"step": 1924
},
{
"epoch": 5.372602739726028,
"grad_norm": 5.467613220214844,
"learning_rate": 2.611138443105452e-06,
"loss": 1.2996,
"step": 1961
},
{
"epoch": 5.473972602739726,
"grad_norm": 5.071502685546875,
"learning_rate": 2.5227128764308887e-06,
"loss": 1.3153,
"step": 1998
},
{
"epoch": 5.575342465753424,
"grad_norm": 5.327346324920654,
"learning_rate": 2.4342588743610904e-06,
"loss": 1.3218,
"step": 2035
},
{
"epoch": 5.676712328767123,
"grad_norm": 5.512831211090088,
"learning_rate": 2.345887176921286e-06,
"loss": 1.3151,
"step": 2072
},
{
"epoch": 5.778082191780822,
"grad_norm": 5.303826332092285,
"learning_rate": 2.257708421095391e-06,
"loss": 1.31,
"step": 2109
},
{
"epoch": 5.879452054794521,
"grad_norm": 5.505942344665527,
"learning_rate": 2.1698330023139837e-06,
"loss": 1.3062,
"step": 2146
},
{
"epoch": 5.980821917808219,
"grad_norm": 5.343113422393799,
"learning_rate": 2.0823709362447025e-06,
"loss": 1.2966,
"step": 2183
},
{
"epoch": 6.082191780821918,
"grad_norm": 5.3861541748046875,
"learning_rate": 1.995431721058082e-06,
"loss": 1.2911,
"step": 2220
},
{
"epoch": 6.183561643835616,
"grad_norm": 5.535419464111328,
"learning_rate": 1.909124200341277e-06,
"loss": 1.3043,
"step": 2257
},
{
"epoch": 6.284931506849315,
"grad_norm": 6.381133556365967,
"learning_rate": 1.82355642683128e-06,
"loss": 1.2911,
"step": 2294
},
{
"epoch": 6.3863013698630136,
"grad_norm": 5.855857849121094,
"learning_rate": 1.7388355271382565e-06,
"loss": 1.2897,
"step": 2331
},
{
"epoch": 6.487671232876712,
"grad_norm": 6.03603982925415,
"learning_rate": 1.6550675676283428e-06,
"loss": 1.2915,
"step": 2368
},
{
"epoch": 6.589041095890411,
"grad_norm": 6.094645977020264,
"learning_rate": 1.5723574216338066e-06,
"loss": 1.3027,
"step": 2405
},
{
"epoch": 6.69041095890411,
"grad_norm": 5.70314359664917,
"learning_rate": 1.4908086381568398e-06,
"loss": 1.3001,
"step": 2442
},
{
"epoch": 6.791780821917808,
"grad_norm": 6.103550434112549,
"learning_rate": 1.4105233122313416e-06,
"loss": 1.3084,
"step": 2479
},
{
"epoch": 6.8931506849315065,
"grad_norm": 6.549497127532959,
"learning_rate": 1.331601957104995e-06,
"loss": 1.2865,
"step": 2516
},
{
"epoch": 6.994520547945205,
"grad_norm": 5.893728256225586,
"learning_rate": 1.2541433784016639e-06,
"loss": 1.2883,
"step": 2553
},
{
"epoch": 7.095890410958904,
"grad_norm": 6.355903148651123,
"learning_rate": 1.1782445504216552e-06,
"loss": 1.285,
"step": 2590
},
{
"epoch": 7.197260273972603,
"grad_norm": 6.0843729972839355,
"learning_rate": 1.1040004947346974e-06,
"loss": 1.2681,
"step": 2627
},
{
"epoch": 7.298630136986302,
"grad_norm": 6.470739841461182,
"learning_rate": 1.0315041612176476e-06,
"loss": 1.281,
"step": 2664
},
{
"epoch": 7.4,
"grad_norm": 5.7129950523376465,
"learning_rate": 9.608463116858544e-07,
"loss": 1.3001,
"step": 2701
},
{
"epoch": 7.501369863013698,
"grad_norm": 6.322762489318848,
"learning_rate": 8.921154062638679e-07,
"loss": 1.2816,
"step": 2738
},
{
"epoch": 7.504109589041096,
"eval_loss": 1.3523932695388794,
"eval_runtime": 41.8059,
"eval_samples_per_second": 23.92,
"eval_steps_per_second": 2.99,
"step": 2739
}
],
"logging_steps": 37,
"max_steps": 3650,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 913,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.607336348896461e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}