{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.806032970887408,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.11,
"grad_norm": 0.5909375548362732,
"learning_rate": 1.9932584269662923e-05,
"loss": 2.0237,
"step": 20
},
{
"epoch": 0.22,
"grad_norm": 0.5826025009155273,
"learning_rate": 1.9857677902621722e-05,
"loss": 1.9306,
"step": 40
},
{
"epoch": 0.34,
"grad_norm": 0.5491089820861816,
"learning_rate": 1.9782771535580525e-05,
"loss": 1.7959,
"step": 60
},
{
"epoch": 0.45,
"grad_norm": 1.362810730934143,
"learning_rate": 1.970786516853933e-05,
"loss": 1.6599,
"step": 80
},
{
"epoch": 0.56,
"grad_norm": 1.4427486658096313,
"learning_rate": 1.963295880149813e-05,
"loss": 1.5685,
"step": 100
},
{
"epoch": 0.67,
"grad_norm": 0.9993659257888794,
"learning_rate": 1.956179775280899e-05,
"loss": 1.4621,
"step": 120
},
{
"epoch": 0.79,
"grad_norm": 1.614562749862671,
"learning_rate": 1.9486891385767793e-05,
"loss": 1.31,
"step": 140
},
{
"epoch": 0.9,
"grad_norm": 1.1975798606872559,
"learning_rate": 1.9411985018726593e-05,
"loss": 1.2322,
"step": 160
},
{
"epoch": 1.01,
"grad_norm": 0.7684128880500793,
"learning_rate": 1.9337078651685396e-05,
"loss": 1.1361,
"step": 180
},
{
"epoch": 1.12,
"grad_norm": 0.9336960911750793,
"learning_rate": 1.9262172284644195e-05,
"loss": 1.0797,
"step": 200
},
{
"epoch": 1.23,
"grad_norm": 0.8471770882606506,
"learning_rate": 1.9187265917603e-05,
"loss": 1.0368,
"step": 220
},
{
"epoch": 1.35,
"grad_norm": 1.111340045928955,
"learning_rate": 1.9112359550561798e-05,
"loss": 0.9738,
"step": 240
},
{
"epoch": 1.46,
"grad_norm": 0.8093781471252441,
"learning_rate": 1.90374531835206e-05,
"loss": 0.9494,
"step": 260
},
{
"epoch": 1.57,
"grad_norm": 0.8438062071800232,
"learning_rate": 1.89625468164794e-05,
"loss": 0.9276,
"step": 280
},
{
"epoch": 1.68,
"grad_norm": 0.9896701574325562,
"learning_rate": 1.8887640449438204e-05,
"loss": 0.8656,
"step": 300
},
{
"epoch": 1.8,
"grad_norm": 0.8278244137763977,
"learning_rate": 1.8812734082397007e-05,
"loss": 0.8431,
"step": 320
},
{
"epoch": 1.91,
"grad_norm": 0.931291937828064,
"learning_rate": 1.8737827715355807e-05,
"loss": 0.7945,
"step": 340
},
{
"epoch": 2.02,
"grad_norm": 1.21769380569458,
"learning_rate": 1.866292134831461e-05,
"loss": 0.7647,
"step": 360
},
{
"epoch": 2.13,
"grad_norm": 3.5183286666870117,
"learning_rate": 1.858801498127341e-05,
"loss": 0.7497,
"step": 380
},
{
"epoch": 2.24,
"grad_norm": 1.1153030395507812,
"learning_rate": 1.8513108614232212e-05,
"loss": 0.7507,
"step": 400
},
{
"epoch": 2.36,
"grad_norm": 1.0140526294708252,
"learning_rate": 1.8438202247191012e-05,
"loss": 0.7415,
"step": 420
},
{
"epoch": 2.47,
"grad_norm": 1.4395232200622559,
"learning_rate": 1.8363295880149815e-05,
"loss": 0.6947,
"step": 440
},
{
"epoch": 2.58,
"grad_norm": 1.4253089427947998,
"learning_rate": 1.8288389513108615e-05,
"loss": 0.7429,
"step": 460
},
{
"epoch": 2.69,
"grad_norm": 1.3152351379394531,
"learning_rate": 1.8213483146067418e-05,
"loss": 0.7363,
"step": 480
},
{
"epoch": 2.81,
"grad_norm": 2.5935957431793213,
"learning_rate": 1.8138576779026217e-05,
"loss": 0.6486,
"step": 500
}
],
"logging_steps": 20,
"max_steps": 5340,
"num_input_tokens_seen": 0,
"num_train_epochs": 30,
"save_steps": 500,
"total_flos": 1.2995638935552e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}