{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 230,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.043478260869565216,
"grad_norm": 175.07723361554483,
"learning_rate": 8.695652173913044e-07,
"loss": 1.9054,
"step": 1
},
{
"epoch": 0.21739130434782608,
"grad_norm": 62.962929820729734,
"learning_rate": 4.347826086956522e-06,
"loss": 1.4576,
"step": 5
},
{
"epoch": 0.43478260869565216,
"grad_norm": 8.648281883873826,
"learning_rate": 8.695652173913044e-06,
"loss": 0.6132,
"step": 10
},
{
"epoch": 0.6521739130434783,
"grad_norm": 16.239431113601718,
"learning_rate": 1.3043478260869566e-05,
"loss": 0.3115,
"step": 15
},
{
"epoch": 0.8695652173913043,
"grad_norm": 3.450808311744288,
"learning_rate": 1.739130434782609e-05,
"loss": 0.203,
"step": 20
},
{
"epoch": 1.0,
"eval_loss": 0.10863249748945236,
"eval_runtime": 67.8349,
"eval_samples_per_second": 21.169,
"eval_steps_per_second": 0.663,
"step": 23
},
{
"epoch": 1.0869565217391304,
"grad_norm": 34.3749757818615,
"learning_rate": 1.9995393663024054e-05,
"loss": 0.1339,
"step": 25
},
{
"epoch": 1.3043478260869565,
"grad_norm": 2.3841429135265555,
"learning_rate": 1.9943621095573588e-05,
"loss": 0.1249,
"step": 30
},
{
"epoch": 1.5217391304347827,
"grad_norm": 1.2759644316371075,
"learning_rate": 1.9834617016337424e-05,
"loss": 0.088,
"step": 35
},
{
"epoch": 1.7391304347826086,
"grad_norm": 34.92728113964099,
"learning_rate": 1.9669008809262064e-05,
"loss": 0.0829,
"step": 40
},
{
"epoch": 1.9565217391304348,
"grad_norm": 2.0028289380489945,
"learning_rate": 1.944774964904754e-05,
"loss": 0.0942,
"step": 45
},
{
"epoch": 2.0,
"eval_loss": 0.07020141929388046,
"eval_runtime": 67.1251,
"eval_samples_per_second": 21.393,
"eval_steps_per_second": 0.67,
"step": 46
},
{
"epoch": 2.1739130434782608,
"grad_norm": 1.3438276529586066,
"learning_rate": 1.917211301505453e-05,
"loss": 0.0674,
"step": 50
},
{
"epoch": 2.391304347826087,
"grad_norm": 1142.7739612838905,
"learning_rate": 1.8843685361665724e-05,
"loss": 0.0668,
"step": 55
},
{
"epoch": 2.608695652173913,
"grad_norm": 2.660687233406025,
"learning_rate": 1.8464356987288012e-05,
"loss": 0.065,
"step": 60
},
{
"epoch": 2.8260869565217392,
"grad_norm": 6.723586538182285,
"learning_rate": 1.8036311154549783e-05,
"loss": 0.064,
"step": 65
},
{
"epoch": 3.0,
"eval_loss": 0.057872917503118515,
"eval_runtime": 67.6569,
"eval_samples_per_second": 21.225,
"eval_steps_per_second": 0.665,
"step": 69
},
{
"epoch": 3.0434782608695654,
"grad_norm": 0.7827050845032839,
"learning_rate": 1.7562011524313187e-05,
"loss": 0.0604,
"step": 70
},
{
"epoch": 3.260869565217391,
"grad_norm": 1.2639368123822572,
"learning_rate": 1.7044187975826126e-05,
"loss": 0.0755,
"step": 75
},
{
"epoch": 3.4782608695652173,
"grad_norm": 0.9047997047917037,
"learning_rate": 1.648582089462756e-05,
"loss": 0.0569,
"step": 80
},
{
"epoch": 3.6956521739130435,
"grad_norm": 0.5428178712369748,
"learning_rate": 1.589012401863864e-05,
"loss": 0.0577,
"step": 85
},
{
"epoch": 3.9130434782608696,
"grad_norm": 0.6450527453899985,
"learning_rate": 1.526052594117071e-05,
"loss": 0.055,
"step": 90
},
{
"epoch": 4.0,
"eval_loss": 0.05387277901172638,
"eval_runtime": 67.296,
"eval_samples_per_second": 21.339,
"eval_steps_per_second": 0.669,
"step": 92
},
{
"epoch": 4.130434782608695,
"grad_norm": 0.6167289429330038,
"learning_rate": 1.4600650377311523e-05,
"loss": 0.0549,
"step": 95
},
{
"epoch": 4.3478260869565215,
"grad_norm": 0.5274268025844417,
"learning_rate": 1.3914295307268396e-05,
"loss": 0.0535,
"step": 100
},
{
"epoch": 4.565217391304348,
"grad_norm": 4.645394192055986,
"learning_rate": 1.3205411116710973e-05,
"loss": 0.0558,
"step": 105
},
{
"epoch": 4.782608695652174,
"grad_norm": 0.44016634132559596,
"learning_rate": 1.2478077859929e-05,
"loss": 0.0557,
"step": 110
},
{
"epoch": 5.0,
"grad_norm": 0.37871781752696215,
"learning_rate": 1.1736481776669307e-05,
"loss": 0.0533,
"step": 115
},
{
"epoch": 5.0,
"eval_loss": 0.05291759595274925,
"eval_runtime": 67.622,
"eval_samples_per_second": 21.236,
"eval_steps_per_second": 0.665,
"step": 115
},
{
"epoch": 5.217391304347826,
"grad_norm": 0.4605215085179153,
"learning_rate": 1.0984891197811686e-05,
"loss": 0.0526,
"step": 120
},
{
"epoch": 5.434782608695652,
"grad_norm": 0.5874492251340868,
"learning_rate": 1.0227631978561057e-05,
"loss": 0.0531,
"step": 125
},
{
"epoch": 5.6521739130434785,
"grad_norm": 0.29510087532653284,
"learning_rate": 9.469062600552509e-06,
"loss": 0.0525,
"step": 130
},
{
"epoch": 5.869565217391305,
"grad_norm": 0.344906161076209,
"learning_rate": 8.71354908617169e-06,
"loss": 0.0525,
"step": 135
},
{
"epoch": 6.0,
"eval_loss": 0.0515441857278347,
"eval_runtime": 67.5553,
"eval_samples_per_second": 21.257,
"eval_steps_per_second": 0.666,
"step": 138
},
{
"epoch": 6.086956521739131,
"grad_norm": 0.36354029610859134,
"learning_rate": 7.965439869473664e-06,
"loss": 0.0514,
"step": 140
},
{
"epoch": 6.304347826086957,
"grad_norm": 0.26428382375759113,
"learning_rate": 7.2290407683331154e-06,
"loss": 0.0515,
"step": 145
},
{
"epoch": 6.521739130434782,
"grad_norm": 0.22822648732081055,
"learning_rate": 6.508590201876317e-06,
"loss": 0.0512,
"step": 150
},
{
"epoch": 6.739130434782608,
"grad_norm": 0.25828737593751994,
"learning_rate": 5.8082347958333625e-06,
"loss": 0.0513,
"step": 155
},
{
"epoch": 6.956521739130435,
"grad_norm": 0.2285799063924952,
"learning_rate": 5.132005516216512e-06,
"loss": 0.0519,
"step": 160
},
{
"epoch": 7.0,
"eval_loss": 0.05051277205348015,
"eval_runtime": 67.5123,
"eval_samples_per_second": 21.27,
"eval_steps_per_second": 0.667,
"step": 161
},
{
"epoch": 7.173913043478261,
"grad_norm": 0.24475626828923558,
"learning_rate": 4.483794468689728e-06,
"loss": 0.0508,
"step": 165
},
{
"epoch": 7.391304347826087,
"grad_norm": 0.2520072015362384,
"learning_rate": 3.867332497162836e-06,
"loss": 0.0504,
"step": 170
},
{
"epoch": 7.608695652173913,
"grad_norm": 0.27382520169651525,
"learning_rate": 3.2861677105440335e-06,
"loss": 0.0504,
"step": 175
},
{
"epoch": 7.826086956521739,
"grad_norm": 0.2716287514584957,
"learning_rate": 2.7436450612420098e-06,
"loss": 0.0505,
"step": 180
},
{
"epoch": 8.0,
"eval_loss": 0.04957514628767967,
"eval_runtime": 67.276,
"eval_samples_per_second": 21.345,
"eval_steps_per_second": 0.669,
"step": 184
},
{
"epoch": 8.043478260869565,
"grad_norm": 0.21787711573464694,
"learning_rate": 2.2428870929558012e-06,
"loss": 0.0496,
"step": 185
},
{
"epoch": 8.26086956521739,
"grad_norm": 0.21445383159071862,
"learning_rate": 1.7867759685603115e-06,
"loss": 0.0494,
"step": 190
},
{
"epoch": 8.478260869565217,
"grad_norm": 0.24405861737397047,
"learning_rate": 1.3779368815278648e-06,
"loss": 0.0491,
"step": 195
},
{
"epoch": 8.695652173913043,
"grad_norm": 0.2697463930100617,
"learning_rate": 1.01872294636304e-06,
"loss": 0.0495,
"step": 200
},
{
"epoch": 8.91304347826087,
"grad_norm": 0.21112878071345867,
"learning_rate": 7.1120165501533e-07,
"loss": 0.0494,
"step": 205
},
{
"epoch": 9.0,
"eval_loss": 0.048730239272117615,
"eval_runtime": 66.913,
"eval_samples_per_second": 21.461,
"eval_steps_per_second": 0.673,
"step": 207
},
{
"epoch": 9.130434782608695,
"grad_norm": 0.19010353057716406,
"learning_rate": 4.5714297722121105e-07,
"loss": 0.049,
"step": 210
},
{
"epoch": 9.347826086956522,
"grad_norm": 0.17948234694281104,
"learning_rate": 2.5800917326521013e-07,
"loss": 0.0491,
"step": 215
},
{
"epoch": 9.565217391304348,
"grad_norm": 0.19343675188279383,
"learning_rate": 1.1494637779369766e-07,
"loss": 0.0489,
"step": 220
},
{
"epoch": 9.782608695652174,
"grad_norm": 0.21072621537133224,
"learning_rate": 2.8778003121607834e-08,
"loss": 0.0483,
"step": 225
},
{
"epoch": 10.0,
"grad_norm": 0.18309652115048786,
"learning_rate": 0.0,
"loss": 0.0484,
"step": 230
},
{
"epoch": 10.0,
"eval_loss": 0.0484623983502388,
"eval_runtime": 67.5159,
"eval_samples_per_second": 21.269,
"eval_steps_per_second": 0.667,
"step": 230
},
{
"epoch": 10.0,
"step": 230,
"total_flos": 48157320806400.0,
"train_loss": 0.1130801611620447,
"train_runtime": 3471.4646,
"train_samples_per_second": 4.137,
"train_steps_per_second": 0.066
}
],
"logging_steps": 5,
"max_steps": 230,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 100,
"total_flos": 48157320806400.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}