{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.1595576619273302, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1263823064770932, "grad_norm": 1.039925456047058, "learning_rate": 4e-05, "loss": 2.1917, "step": 20 }, { "epoch": 0.2527646129541864, "grad_norm": 1.250934362411499, "learning_rate": 8e-05, "loss": 1.9713, "step": 40 }, { "epoch": 0.3791469194312796, "grad_norm": 1.2668324708938599, "learning_rate": 0.00012, "loss": 1.7553, "step": 60 }, { "epoch": 0.5055292259083728, "grad_norm": 1.822996973991394, "learning_rate": 0.00016, "loss": 1.6959, "step": 80 }, { "epoch": 0.631911532385466, "grad_norm": 1.8614917993545532, "learning_rate": 0.0002, "loss": 1.5757, "step": 100 }, { "epoch": 0.7582938388625592, "grad_norm": 2.021389961242676, "learning_rate": 0.0001924812030075188, "loss": 1.5615, "step": 120 }, { "epoch": 0.8846761453396524, "grad_norm": 1.8619110584259033, "learning_rate": 0.0001849624060150376, "loss": 1.4426, "step": 140 }, { "epoch": 1.0110584518167456, "grad_norm": 1.8433139324188232, "learning_rate": 0.0001774436090225564, "loss": 1.4429, "step": 160 }, { "epoch": 1.1374407582938388, "grad_norm": 2.324719190597534, "learning_rate": 0.0001699248120300752, "loss": 1.2625, "step": 180 }, { "epoch": 1.263823064770932, "grad_norm": 1.7142627239227295, "learning_rate": 0.00016240601503759398, "loss": 1.2677, "step": 200 }, { "epoch": 1.3902053712480253, "grad_norm": 2.427729606628418, "learning_rate": 0.0001548872180451128, "loss": 1.248, "step": 220 }, { "epoch": 1.5165876777251186, "grad_norm": 2.2092034816741943, "learning_rate": 0.00014736842105263158, "loss": 1.1584, "step": 240 }, { "epoch": 1.6429699842022116, "grad_norm": 1.9174597263336182, "learning_rate": 0.0001398496240601504, "loss": 1.1811, "step": 260 }, { "epoch": 1.7693522906793049, "grad_norm": 2.3690454959869385, "learning_rate": 0.00013233082706766918, "loss": 1.1186, "step": 280 }, { "epoch": 1.8957345971563981, "grad_norm": 1.9550652503967285, "learning_rate": 0.00012481203007518797, "loss": 1.1149, "step": 300 }, { "epoch": 2.022116903633491, "grad_norm": 2.081315517425537, "learning_rate": 0.00011729323308270677, "loss": 1.0422, "step": 320 }, { "epoch": 2.1484992101105846, "grad_norm": 3.5079455375671387, "learning_rate": 0.00010977443609022557, "loss": 0.8674, "step": 340 }, { "epoch": 2.2748815165876777, "grad_norm": 2.9713032245635986, "learning_rate": 0.00010225563909774436, "loss": 0.8312, "step": 360 }, { "epoch": 2.401263823064771, "grad_norm": 3.1877429485321045, "learning_rate": 9.473684210526316e-05, "loss": 0.7915, "step": 380 }, { "epoch": 2.527646129541864, "grad_norm": 3.9969420433044434, "learning_rate": 8.721804511278195e-05, "loss": 0.8997, "step": 400 }, { "epoch": 2.654028436018957, "grad_norm": 3.038134813308716, "learning_rate": 7.969924812030075e-05, "loss": 0.7495, "step": 420 }, { "epoch": 2.7804107424960507, "grad_norm": 2.5654478073120117, "learning_rate": 7.218045112781955e-05, "loss": 0.7727, "step": 440 }, { "epoch": 2.9067930489731437, "grad_norm": 3.3609306812286377, "learning_rate": 6.466165413533834e-05, "loss": 0.8009, "step": 460 }, { "epoch": 3.0331753554502368, "grad_norm": 3.048936367034912, "learning_rate": 5.714285714285714e-05, "loss": 0.753, "step": 480 }, { "epoch": 3.1595576619273302, "grad_norm": 2.0320680141448975, "learning_rate": 4.9624060150375936e-05, "loss": 0.5853, "step": 500 } ], "logging_steps": 20, "max_steps": 632, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.8241099229184e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }