{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.418098912662224, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11, "grad_norm": 0.5909375548362732, "learning_rate": 1.9932584269662923e-05, "loss": 2.0237, "step": 20 }, { "epoch": 0.22, "grad_norm": 0.5826025009155273, "learning_rate": 1.9857677902621722e-05, "loss": 1.9306, "step": 40 }, { "epoch": 0.34, "grad_norm": 0.5491089820861816, "learning_rate": 1.9782771535580525e-05, "loss": 1.7959, "step": 60 }, { "epoch": 0.45, "grad_norm": 1.362810730934143, "learning_rate": 1.970786516853933e-05, "loss": 1.6599, "step": 80 }, { "epoch": 0.56, "grad_norm": 1.4427486658096313, "learning_rate": 1.963295880149813e-05, "loss": 1.5685, "step": 100 }, { "epoch": 0.67, "grad_norm": 0.9993659257888794, "learning_rate": 1.956179775280899e-05, "loss": 1.4621, "step": 120 }, { "epoch": 0.79, "grad_norm": 1.614562749862671, "learning_rate": 1.9486891385767793e-05, "loss": 1.31, "step": 140 }, { "epoch": 0.9, "grad_norm": 1.1975798606872559, "learning_rate": 1.9411985018726593e-05, "loss": 1.2322, "step": 160 }, { "epoch": 1.01, "grad_norm": 0.7684128880500793, "learning_rate": 1.9337078651685396e-05, "loss": 1.1361, "step": 180 }, { "epoch": 1.12, "grad_norm": 0.9336960911750793, "learning_rate": 1.9262172284644195e-05, "loss": 1.0797, "step": 200 }, { "epoch": 1.23, "grad_norm": 0.8471770882606506, "learning_rate": 1.9187265917603e-05, "loss": 1.0368, "step": 220 }, { "epoch": 1.35, "grad_norm": 1.111340045928955, "learning_rate": 1.9112359550561798e-05, "loss": 0.9738, "step": 240 }, { "epoch": 1.46, "grad_norm": 0.8093781471252441, "learning_rate": 1.90374531835206e-05, "loss": 0.9494, "step": 260 }, { "epoch": 1.57, "grad_norm": 0.8438062071800232, "learning_rate": 1.89625468164794e-05, "loss": 0.9276, "step": 280 }, { "epoch": 1.68, "grad_norm": 0.9896701574325562, "learning_rate": 1.8887640449438204e-05, "loss": 0.8656, "step": 300 }, { "epoch": 1.8, "grad_norm": 0.8278244137763977, "learning_rate": 1.8812734082397007e-05, "loss": 0.8431, "step": 320 }, { "epoch": 1.91, "grad_norm": 0.931291937828064, "learning_rate": 1.8737827715355807e-05, "loss": 0.7945, "step": 340 }, { "epoch": 2.02, "grad_norm": 1.21769380569458, "learning_rate": 1.866292134831461e-05, "loss": 0.7647, "step": 360 }, { "epoch": 2.13, "grad_norm": 3.5183286666870117, "learning_rate": 1.858801498127341e-05, "loss": 0.7497, "step": 380 }, { "epoch": 2.24, "grad_norm": 1.1153030395507812, "learning_rate": 1.8513108614232212e-05, "loss": 0.7507, "step": 400 }, { "epoch": 2.36, "grad_norm": 1.0140526294708252, "learning_rate": 1.8438202247191012e-05, "loss": 0.7415, "step": 420 }, { "epoch": 2.47, "grad_norm": 1.4395232200622559, "learning_rate": 1.8363295880149815e-05, "loss": 0.6947, "step": 440 }, { "epoch": 2.58, "grad_norm": 1.4253089427947998, "learning_rate": 1.8288389513108615e-05, "loss": 0.7429, "step": 460 }, { "epoch": 2.69, "grad_norm": 1.3152351379394531, "learning_rate": 1.8213483146067418e-05, "loss": 0.7363, "step": 480 }, { "epoch": 2.81, "grad_norm": 2.5935957431793213, "learning_rate": 1.8138576779026217e-05, "loss": 0.6486, "step": 500 }, { "epoch": 2.92, "grad_norm": 3.929158926010132, "learning_rate": 1.806367041198502e-05, "loss": 0.6395, "step": 520 }, { "epoch": 3.03, "grad_norm": 1.7316572666168213, "learning_rate": 1.7988764044943823e-05, "loss": 0.664, "step": 540 }, { "epoch": 3.14, "grad_norm": 1.3388841152191162, "learning_rate": 1.7913857677902623e-05, "loss": 0.6469, "step": 560 }, { "epoch": 3.25, "grad_norm": 1.5258549451828003, "learning_rate": 1.7838951310861426e-05, "loss": 0.6662, "step": 580 }, { "epoch": 3.37, "grad_norm": 1.5486094951629639, "learning_rate": 1.7764044943820226e-05, "loss": 0.566, "step": 600 }, { "epoch": 3.48, "grad_norm": 1.5657902956008911, "learning_rate": 1.768913857677903e-05, "loss": 0.6166, "step": 620 }, { "epoch": 3.59, "grad_norm": 1.5971391201019287, "learning_rate": 1.761423220973783e-05, "loss": 0.5973, "step": 640 }, { "epoch": 3.7, "grad_norm": 1.333030343055725, "learning_rate": 1.753932584269663e-05, "loss": 0.6117, "step": 660 }, { "epoch": 3.82, "grad_norm": 1.4425445795059204, "learning_rate": 1.746441947565543e-05, "loss": 0.5702, "step": 680 }, { "epoch": 3.93, "grad_norm": 1.4773032665252686, "learning_rate": 1.7389513108614234e-05, "loss": 0.5465, "step": 700 }, { "epoch": 4.04, "grad_norm": 1.3328267335891724, "learning_rate": 1.7314606741573034e-05, "loss": 0.5379, "step": 720 }, { "epoch": 4.15, "grad_norm": 1.6961455345153809, "learning_rate": 1.7239700374531837e-05, "loss": 0.5492, "step": 740 }, { "epoch": 4.27, "grad_norm": 1.4636189937591553, "learning_rate": 1.7164794007490637e-05, "loss": 0.547, "step": 760 }, { "epoch": 4.38, "grad_norm": 2.1686649322509766, "learning_rate": 1.708988764044944e-05, "loss": 0.5424, "step": 780 }, { "epoch": 4.49, "grad_norm": 1.219388723373413, "learning_rate": 1.7014981273408243e-05, "loss": 0.5373, "step": 800 }, { "epoch": 4.6, "grad_norm": 1.5566452741622925, "learning_rate": 1.6940074906367042e-05, "loss": 0.4944, "step": 820 }, { "epoch": 4.71, "grad_norm": 1.598917841911316, "learning_rate": 1.6865168539325845e-05, "loss": 0.5036, "step": 840 }, { "epoch": 4.83, "grad_norm": 1.5281039476394653, "learning_rate": 1.6790262172284645e-05, "loss": 0.5215, "step": 860 }, { "epoch": 4.94, "grad_norm": 1.7123130559921265, "learning_rate": 1.6715355805243448e-05, "loss": 0.5362, "step": 880 }, { "epoch": 5.05, "grad_norm": 1.543447732925415, "learning_rate": 1.6640449438202248e-05, "loss": 0.5379, "step": 900 }, { "epoch": 5.16, "grad_norm": 2.4190192222595215, "learning_rate": 1.656554307116105e-05, "loss": 0.4921, "step": 920 }, { "epoch": 5.28, "grad_norm": 2.190906047821045, "learning_rate": 1.649063670411985e-05, "loss": 0.4652, "step": 940 }, { "epoch": 5.39, "grad_norm": 2.113476514816284, "learning_rate": 1.6415730337078653e-05, "loss": 0.4914, "step": 960 }, { "epoch": 5.5, "grad_norm": 1.8785656690597534, "learning_rate": 1.6340823970037453e-05, "loss": 0.5135, "step": 980 }, { "epoch": 5.61, "grad_norm": 1.3745977878570557, "learning_rate": 1.6265917602996256e-05, "loss": 0.4697, "step": 1000 }, { "epoch": 5.72, "grad_norm": 1.7874308824539185, "learning_rate": 1.6191011235955056e-05, "loss": 0.4625, "step": 1020 }, { "epoch": 5.84, "grad_norm": 1.4448940753936768, "learning_rate": 1.611610486891386e-05, "loss": 0.4764, "step": 1040 }, { "epoch": 5.95, "grad_norm": 2.278655767440796, "learning_rate": 1.6041198501872662e-05, "loss": 0.4221, "step": 1060 }, { "epoch": 6.06, "grad_norm": 1.8602409362792969, "learning_rate": 1.596629213483146e-05, "loss": 0.4731, "step": 1080 }, { "epoch": 6.17, "grad_norm": 1.884373426437378, "learning_rate": 1.5891385767790265e-05, "loss": 0.4241, "step": 1100 }, { "epoch": 6.29, "grad_norm": 2.0259287357330322, "learning_rate": 1.5816479400749064e-05, "loss": 0.4368, "step": 1120 }, { "epoch": 6.4, "grad_norm": 1.812462329864502, "learning_rate": 1.5741573033707867e-05, "loss": 0.442, "step": 1140 }, { "epoch": 6.51, "grad_norm": 1.934327483177185, "learning_rate": 1.5666666666666667e-05, "loss": 0.4195, "step": 1160 }, { "epoch": 6.62, "grad_norm": 1.6152955293655396, "learning_rate": 1.559176029962547e-05, "loss": 0.4374, "step": 1180 }, { "epoch": 6.73, "grad_norm": 2.7782068252563477, "learning_rate": 1.551685393258427e-05, "loss": 0.4231, "step": 1200 }, { "epoch": 6.85, "grad_norm": 2.372976303100586, "learning_rate": 1.5441947565543073e-05, "loss": 0.444, "step": 1220 }, { "epoch": 6.96, "grad_norm": 2.171353816986084, "learning_rate": 1.5367041198501872e-05, "loss": 0.4389, "step": 1240 }, { "epoch": 7.07, "grad_norm": 1.3093984127044678, "learning_rate": 1.5292134831460675e-05, "loss": 0.4301, "step": 1260 }, { "epoch": 7.18, "grad_norm": 2.267932176589966, "learning_rate": 1.5217228464419478e-05, "loss": 0.4046, "step": 1280 }, { "epoch": 7.3, "grad_norm": 1.5326164960861206, "learning_rate": 1.514232209737828e-05, "loss": 0.4068, "step": 1300 }, { "epoch": 7.41, "grad_norm": 3.1525979042053223, "learning_rate": 1.5067415730337081e-05, "loss": 0.3847, "step": 1320 }, { "epoch": 7.52, "grad_norm": 2.081890106201172, "learning_rate": 1.4992509363295882e-05, "loss": 0.4126, "step": 1340 }, { "epoch": 7.63, "grad_norm": 2.5701358318328857, "learning_rate": 1.4917602996254684e-05, "loss": 0.4065, "step": 1360 }, { "epoch": 7.74, "grad_norm": 1.4190051555633545, "learning_rate": 1.4842696629213485e-05, "loss": 0.3979, "step": 1380 }, { "epoch": 7.86, "grad_norm": 1.9085837602615356, "learning_rate": 1.4767790262172286e-05, "loss": 0.3894, "step": 1400 }, { "epoch": 7.97, "grad_norm": 1.7573003768920898, "learning_rate": 1.4692883895131088e-05, "loss": 0.3751, "step": 1420 }, { "epoch": 8.08, "grad_norm": 1.8974506855010986, "learning_rate": 1.4617977528089889e-05, "loss": 0.3936, "step": 1440 }, { "epoch": 8.19, "grad_norm": 1.3843660354614258, "learning_rate": 1.454307116104869e-05, "loss": 0.3848, "step": 1460 }, { "epoch": 8.31, "grad_norm": 1.525007724761963, "learning_rate": 1.4468164794007492e-05, "loss": 0.3552, "step": 1480 }, { "epoch": 8.42, "grad_norm": 2.1665101051330566, "learning_rate": 1.4393258426966291e-05, "loss": 0.3547, "step": 1500 } ], "logging_steps": 20, "max_steps": 5340, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 500, "total_flos": 3.8986916806656e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }