JudgePierce / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.612048192771084,
"eval_steps": 500,
"global_step": 343,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0963855421686747,
"grad_norm": 1.7634485960006714,
"learning_rate": 4.998814299283415e-05,
"loss": 0.8996,
"num_input_tokens_seen": 78528,
"step": 5
},
{
"epoch": 0.1927710843373494,
"grad_norm": 1.3068124055862427,
"learning_rate": 4.995258321842611e-05,
"loss": 0.6806,
"num_input_tokens_seen": 159120,
"step": 10
},
{
"epoch": 0.2891566265060241,
"grad_norm": 1.2104840278625488,
"learning_rate": 4.989335440737586e-05,
"loss": 0.618,
"num_input_tokens_seen": 223552,
"step": 15
},
{
"epoch": 0.3855421686746988,
"grad_norm": 1.4112542867660522,
"learning_rate": 4.98105127417984e-05,
"loss": 0.5594,
"num_input_tokens_seen": 290944,
"step": 20
},
{
"epoch": 0.4819277108433735,
"grad_norm": 0.9026587605476379,
"learning_rate": 4.9704136802031485e-05,
"loss": 0.5253,
"num_input_tokens_seen": 364064,
"step": 25
},
{
"epoch": 0.5783132530120482,
"grad_norm": 0.9427546858787537,
"learning_rate": 4.957432749209755e-05,
"loss": 0.4794,
"num_input_tokens_seen": 440176,
"step": 30
},
{
"epoch": 0.6746987951807228,
"grad_norm": 1.0594468116760254,
"learning_rate": 4.942120794399002e-05,
"loss": 0.4546,
"num_input_tokens_seen": 517184,
"step": 35
},
{
"epoch": 0.7710843373493976,
"grad_norm": 0.9458279013633728,
"learning_rate": 4.9244923400875245e-05,
"loss": 0.4703,
"num_input_tokens_seen": 591424,
"step": 40
},
{
"epoch": 0.8674698795180723,
"grad_norm": 1.1610336303710938,
"learning_rate": 4.9045641079320484e-05,
"loss": 0.4407,
"num_input_tokens_seen": 662784,
"step": 45
},
{
"epoch": 0.963855421686747,
"grad_norm": 1.0153354406356812,
"learning_rate": 4.882355001067892e-05,
"loss": 0.4425,
"num_input_tokens_seen": 734784,
"step": 50
},
{
"epoch": 1.0602409638554218,
"grad_norm": 1.0889695882797241,
"learning_rate": 4.857886086178194e-05,
"loss": 0.4081,
"num_input_tokens_seen": 808336,
"step": 55
},
{
"epoch": 1.1566265060240963,
"grad_norm": 0.9168598055839539,
"learning_rate": 4.8311805735108894e-05,
"loss": 0.4002,
"num_input_tokens_seen": 882672,
"step": 60
},
{
"epoch": 1.2530120481927711,
"grad_norm": 0.8168660998344421,
"learning_rate": 4.802263794862385e-05,
"loss": 0.3587,
"num_input_tokens_seen": 947680,
"step": 65
},
{
"epoch": 1.3493975903614457,
"grad_norm": 1.0652003288269043,
"learning_rate": 4.7711631795488096e-05,
"loss": 0.356,
"num_input_tokens_seen": 1022112,
"step": 70
},
{
"epoch": 1.4457831325301205,
"grad_norm": 1.1781517267227173,
"learning_rate": 4.7379082283876566e-05,
"loss": 0.3639,
"num_input_tokens_seen": 1091744,
"step": 75
},
{
"epoch": 1.5421686746987953,
"grad_norm": 1.0550976991653442,
"learning_rate": 4.702530485714461e-05,
"loss": 0.3288,
"num_input_tokens_seen": 1163728,
"step": 80
},
{
"epoch": 1.6385542168674698,
"grad_norm": 1.3946661949157715,
"learning_rate": 4.665063509461097e-05,
"loss": 0.3563,
"num_input_tokens_seen": 1245728,
"step": 85
},
{
"epoch": 1.7349397590361446,
"grad_norm": 1.1458536386489868,
"learning_rate": 4.625542839324036e-05,
"loss": 0.3642,
"num_input_tokens_seen": 1315056,
"step": 90
},
{
"epoch": 1.8313253012048194,
"grad_norm": 1.0227209329605103,
"learning_rate": 4.584005963052799e-05,
"loss": 0.3407,
"num_input_tokens_seen": 1392224,
"step": 95
},
{
"epoch": 1.927710843373494,
"grad_norm": 1.0699985027313232,
"learning_rate": 4.540492280890555e-05,
"loss": 0.3216,
"num_input_tokens_seen": 1471008,
"step": 100
},
{
"epoch": 2.0240963855421685,
"grad_norm": 0.8573477268218994,
"learning_rate": 4.4950430682006e-05,
"loss": 0.3197,
"num_input_tokens_seen": 1546912,
"step": 105
},
{
"epoch": 2.1204819277108435,
"grad_norm": 1.1516242027282715,
"learning_rate": 4.447701436314176e-05,
"loss": 0.2904,
"num_input_tokens_seen": 1611328,
"step": 110
},
{
"epoch": 2.216867469879518,
"grad_norm": 1.0890793800354004,
"learning_rate": 4.398512291636768e-05,
"loss": 0.2498,
"num_input_tokens_seen": 1682528,
"step": 115
},
{
"epoch": 2.3132530120481927,
"grad_norm": 1.3621636629104614,
"learning_rate": 4.347522293051648e-05,
"loss": 0.269,
"num_input_tokens_seen": 1751856,
"step": 120
},
{
"epoch": 2.4096385542168672,
"grad_norm": 1.338083028793335,
"learning_rate": 4.294779807661105e-05,
"loss": 0.2838,
"num_input_tokens_seen": 1830288,
"step": 125
},
{
"epoch": 2.5060240963855422,
"grad_norm": 1.2083592414855957,
"learning_rate": 4.2403348649073174e-05,
"loss": 0.2466,
"num_input_tokens_seen": 1905296,
"step": 130
},
{
"epoch": 2.602409638554217,
"grad_norm": 1.35024094581604,
"learning_rate": 4.184239109116393e-05,
"loss": 0.2272,
"num_input_tokens_seen": 1974464,
"step": 135
},
{
"epoch": 2.6987951807228914,
"grad_norm": 1.3738912343978882,
"learning_rate": 4.126545750510605e-05,
"loss": 0.2484,
"num_input_tokens_seen": 2058176,
"step": 140
},
{
"epoch": 2.7951807228915664,
"grad_norm": 1.5877448320388794,
"learning_rate": 4.067309514735267e-05,
"loss": 0.2339,
"num_input_tokens_seen": 2124912,
"step": 145
},
{
"epoch": 2.891566265060241,
"grad_norm": 1.3735121488571167,
"learning_rate": 4.0065865909481417e-05,
"loss": 0.2597,
"num_input_tokens_seen": 2213456,
"step": 150
},
{
"epoch": 2.9879518072289155,
"grad_norm": 1.6480368375778198,
"learning_rate": 3.9444345785206285e-05,
"loss": 0.2525,
"num_input_tokens_seen": 2281680,
"step": 155
},
{
"epoch": 3.0843373493975905,
"grad_norm": 1.2931358814239502,
"learning_rate": 3.880912432401265e-05,
"loss": 0.1832,
"num_input_tokens_seen": 2349408,
"step": 160
},
{
"epoch": 3.180722891566265,
"grad_norm": 1.4131468534469604,
"learning_rate": 3.81608040719339e-05,
"loss": 0.1519,
"num_input_tokens_seen": 2425456,
"step": 165
},
{
"epoch": 3.2771084337349397,
"grad_norm": 1.6228159666061401,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.1707,
"num_input_tokens_seen": 2494064,
"step": 170
},
{
"epoch": 3.3734939759036147,
"grad_norm": 1.1356842517852783,
"learning_rate": 3.6827338920900254e-05,
"loss": 0.1603,
"num_input_tokens_seen": 2573616,
"step": 175
},
{
"epoch": 3.4698795180722892,
"grad_norm": 1.3535553216934204,
"learning_rate": 3.6143458894413465e-05,
"loss": 0.1683,
"num_input_tokens_seen": 2657744,
"step": 180
},
{
"epoch": 3.566265060240964,
"grad_norm": 1.3832409381866455,
"learning_rate": 3.544900862216959e-05,
"loss": 0.1734,
"num_input_tokens_seen": 2721200,
"step": 185
},
{
"epoch": 3.662650602409639,
"grad_norm": 1.6430705785751343,
"learning_rate": 3.474464683231698e-05,
"loss": 0.1543,
"num_input_tokens_seen": 2798320,
"step": 190
},
{
"epoch": 3.7590361445783134,
"grad_norm": 1.7706836462020874,
"learning_rate": 3.403104165467883e-05,
"loss": 0.1601,
"num_input_tokens_seen": 2879200,
"step": 195
},
{
"epoch": 3.855421686746988,
"grad_norm": 1.7721610069274902,
"learning_rate": 3.330886998699149e-05,
"loss": 0.1911,
"num_input_tokens_seen": 2947024,
"step": 200
},
{
"epoch": 3.9518072289156625,
"grad_norm": 1.666278600692749,
"learning_rate": 3.257881685282609e-05,
"loss": 0.1741,
"num_input_tokens_seen": 3016656,
"step": 205
},
{
"epoch": 4.048192771084337,
"grad_norm": 1.099639892578125,
"learning_rate": 3.1841574751802076e-05,
"loss": 0.1334,
"num_input_tokens_seen": 3084416,
"step": 210
},
{
"epoch": 4.144578313253012,
"grad_norm": 1.5020925998687744,
"learning_rate": 3.109784300270943e-05,
"loss": 0.1027,
"num_input_tokens_seen": 3166784,
"step": 215
},
{
"epoch": 4.240963855421687,
"grad_norm": 2.203794240951538,
"learning_rate": 3.0348327080162435e-05,
"loss": 0.0955,
"num_input_tokens_seen": 3239584,
"step": 220
},
{
"epoch": 4.337349397590361,
"grad_norm": 1.7183223962783813,
"learning_rate": 2.9593737945414264e-05,
"loss": 0.1006,
"num_input_tokens_seen": 3313360,
"step": 225
},
{
"epoch": 4.433734939759036,
"grad_norm": 1.4102908372879028,
"learning_rate": 2.8834791371967142e-05,
"loss": 0.1007,
"num_input_tokens_seen": 3377840,
"step": 230
},
{
"epoch": 4.530120481927711,
"grad_norm": 1.214020013809204,
"learning_rate": 2.8072207266617855e-05,
"loss": 0.1033,
"num_input_tokens_seen": 3455904,
"step": 235
},
{
"epoch": 4.626506024096385,
"grad_norm": 1.5255635976791382,
"learning_rate": 2.7306708986582553e-05,
"loss": 0.1023,
"num_input_tokens_seen": 3529360,
"step": 240
},
{
"epoch": 4.72289156626506,
"grad_norm": 1.6624009609222412,
"learning_rate": 2.653902265334858e-05,
"loss": 0.1121,
"num_input_tokens_seen": 3605344,
"step": 245
},
{
"epoch": 4.8192771084337345,
"grad_norm": 1.7999521493911743,
"learning_rate": 2.5769876463904265e-05,
"loss": 0.1028,
"num_input_tokens_seen": 3678352,
"step": 250
},
{
"epoch": 4.9156626506024095,
"grad_norm": 2.1297786235809326,
"learning_rate": 2.5e-05,
"loss": 0.1055,
"num_input_tokens_seen": 3752608,
"step": 255
},
{
"epoch": 5.0120481927710845,
"grad_norm": 1.215146780014038,
"learning_rate": 2.4230123536095748e-05,
"loss": 0.1037,
"num_input_tokens_seen": 3819744,
"step": 260
},
{
"epoch": 5.108433734939759,
"grad_norm": 1.448801040649414,
"learning_rate": 2.346097734665143e-05,
"loss": 0.0633,
"num_input_tokens_seen": 3896592,
"step": 265
},
{
"epoch": 5.204819277108434,
"grad_norm": 1.220989465713501,
"learning_rate": 2.2693291013417453e-05,
"loss": 0.0521,
"num_input_tokens_seen": 3970976,
"step": 270
},
{
"epoch": 5.301204819277109,
"grad_norm": 1.3077821731567383,
"learning_rate": 2.192779273338215e-05,
"loss": 0.0625,
"num_input_tokens_seen": 4051760,
"step": 275
},
{
"epoch": 5.397590361445783,
"grad_norm": 2.02695369720459,
"learning_rate": 2.116520862803286e-05,
"loss": 0.059,
"num_input_tokens_seen": 4124096,
"step": 280
},
{
"epoch": 5.493975903614458,
"grad_norm": 1.6377320289611816,
"learning_rate": 2.0406262054585738e-05,
"loss": 0.0648,
"num_input_tokens_seen": 4188448,
"step": 285
},
{
"epoch": 5.590361445783133,
"grad_norm": 1.6187361478805542,
"learning_rate": 1.965167291983757e-05,
"loss": 0.0709,
"num_input_tokens_seen": 4261056,
"step": 290
},
{
"epoch": 5.686746987951807,
"grad_norm": 1.4855268001556396,
"learning_rate": 1.890215699729057e-05,
"loss": 0.0641,
"num_input_tokens_seen": 4329024,
"step": 295
},
{
"epoch": 5.783132530120482,
"grad_norm": 1.4216831922531128,
"learning_rate": 1.815842524819793e-05,
"loss": 0.0689,
"num_input_tokens_seen": 4406624,
"step": 300
},
{
"epoch": 5.879518072289157,
"grad_norm": 1.7383759021759033,
"learning_rate": 1.7421183147173915e-05,
"loss": 0.055,
"num_input_tokens_seen": 4480352,
"step": 305
},
{
"epoch": 5.975903614457831,
"grad_norm": 1.5599803924560547,
"learning_rate": 1.6691130013008514e-05,
"loss": 0.0626,
"num_input_tokens_seen": 4554080,
"step": 310
},
{
"epoch": 6.072289156626506,
"grad_norm": 1.028124213218689,
"learning_rate": 1.5968958345321178e-05,
"loss": 0.0465,
"num_input_tokens_seen": 4628576,
"step": 315
},
{
"epoch": 6.168674698795181,
"grad_norm": 1.4686311483383179,
"learning_rate": 1.5255353167683017e-05,
"loss": 0.0421,
"num_input_tokens_seen": 4704512,
"step": 320
},
{
"epoch": 6.265060240963855,
"grad_norm": 1.1644634008407593,
"learning_rate": 1.4550991377830426e-05,
"loss": 0.0303,
"num_input_tokens_seen": 4776912,
"step": 325
},
{
"epoch": 6.36144578313253,
"grad_norm": 1.090997338294983,
"learning_rate": 1.3856541105586545e-05,
"loss": 0.0337,
"num_input_tokens_seen": 4855600,
"step": 330
},
{
"epoch": 6.457831325301205,
"grad_norm": 1.4336110353469849,
"learning_rate": 1.3172661079099752e-05,
"loss": 0.0333,
"num_input_tokens_seen": 4927600,
"step": 335
},
{
"epoch": 6.554216867469879,
"grad_norm": 1.3488271236419678,
"learning_rate": 1.2500000000000006e-05,
"loss": 0.039,
"num_input_tokens_seen": 5003184,
"step": 340
},
{
"epoch": 6.612048192771084,
"num_input_tokens_seen": 5052448,
"step": 343,
"total_flos": 2.28822660837802e+17,
"train_loss": 0.2300173058541106,
"train_runtime": 12675.9679,
"train_samples_per_second": 0.655,
"train_steps_per_second": 0.04
}
],
"logging_steps": 5,
"max_steps": 510,
"num_input_tokens_seen": 5052448,
"num_train_epochs": 10,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.28822660837802e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
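
Note: the state above can be consumed programmatically. A minimal sketch using only the standard library, assuming the file is saved locally as trainer_state.json (the path is illustrative, not part of the original):

import json

# Load the Trainer state written by the Hugging Face Trainer.
with open("trainer_state.json") as f:
    state = json.load(f)

# Each entry in log_history is a logging event (every 5 steps here, per
# logging_steps); the final entry carries the run summary instead of a loss.
for record in state["log_history"]:
    if "loss" in record:
        print(record["step"], round(record["epoch"], 3), record["loss"])

print("steps completed:", state["global_step"], "of", state["max_steps"])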