{ "best_metric": 1.349927544593811, "best_model_checkpoint": "checkpoints/sft_2_1_1/checkpoint-2555", "epoch": 7.0, "eval_steps": 500, "global_step": 2555, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1506849315068493, "grad_norm": 18.4865665435791, "learning_rate": 5.018248175182482e-07, "loss": 2.5927, "step": 55 }, { "epoch": 0.3013698630136986, "grad_norm": 16.606660842895508, "learning_rate": 1.0036496350364965e-06, "loss": 2.3833, "step": 110 }, { "epoch": 0.4520547945205479, "grad_norm": 6.788235187530518, "learning_rate": 1.5054744525547446e-06, "loss": 1.8868, "step": 165 }, { "epoch": 0.6027397260273972, "grad_norm": 3.3164093494415283, "learning_rate": 2.007299270072993e-06, "loss": 1.5665, "step": 220 }, { "epoch": 0.7534246575342466, "grad_norm": 3.4226760864257812, "learning_rate": 2.509124087591241e-06, "loss": 1.4994, "step": 275 }, { "epoch": 0.9041095890410958, "grad_norm": 3.687007427215576, "learning_rate": 3.0109489051094893e-06, "loss": 1.4708, "step": 330 }, { "epoch": 1.0, "eval_loss": 1.4502822160720825, "eval_runtime": 41.7137, "eval_samples_per_second": 23.973, "eval_steps_per_second": 2.997, "step": 365 }, { "epoch": 1.0547945205479452, "grad_norm": 3.667193651199341, "learning_rate": 3.5127737226277376e-06, "loss": 1.4589, "step": 385 }, { "epoch": 1.2054794520547945, "grad_norm": 3.444368362426758, "learning_rate": 4.014598540145986e-06, "loss": 1.4383, "step": 440 }, { "epoch": 1.356164383561644, "grad_norm": 3.4761803150177, "learning_rate": 4.516423357664234e-06, "loss": 1.4421, "step": 495 }, { "epoch": 1.5068493150684932, "grad_norm": 3.8773984909057617, "learning_rate": 4.9999979671535945e-06, "loss": 1.4388, "step": 550 }, { "epoch": 1.6575342465753424, "grad_norm": 3.5462825298309326, "learning_rate": 4.998349002034396e-06, "loss": 1.4198, "step": 605 }, { "epoch": 1.808219178082192, "grad_norm": 3.9237027168273926, "learning_rate": 4.993627701726671e-06, "loss": 1.4052, "step": 660 }, { "epoch": 1.958904109589041, "grad_norm": 3.995187997817993, "learning_rate": 4.9858398722315225e-06, "loss": 1.4121, "step": 715 }, { "epoch": 2.0, "eval_loss": 1.4027259349822998, "eval_runtime": 41.7142, "eval_samples_per_second": 23.973, "eval_steps_per_second": 2.997, "step": 730 }, { "epoch": 2.1095890410958904, "grad_norm": 3.973104238510132, "learning_rate": 4.974995090602673e-06, "loss": 1.4018, "step": 770 }, { "epoch": 2.26027397260274, "grad_norm": 4.114542484283447, "learning_rate": 4.9611066931691045e-06, "loss": 1.3977, "step": 825 }, { "epoch": 2.410958904109589, "grad_norm": 4.350598335266113, "learning_rate": 4.94419175913477e-06, "loss": 1.3778, "step": 880 }, { "epoch": 2.5616438356164384, "grad_norm": 3.951005697250366, "learning_rate": 4.9242710895755e-06, "loss": 1.372, "step": 935 }, { "epoch": 2.712328767123288, "grad_norm": 4.071479797363281, "learning_rate": 4.9013691818589635e-06, "loss": 1.3826, "step": 990 }, { "epoch": 2.863013698630137, "grad_norm": 3.968268632888794, "learning_rate": 4.87551419951912e-06, "loss": 1.3845, "step": 1045 }, { "epoch": 3.0, "eval_loss": 1.3834009170532227, "eval_runtime": 41.778, "eval_samples_per_second": 23.936, "eval_steps_per_second": 2.992, "step": 1095 }, { "epoch": 3.0136986301369864, "grad_norm": 4.093992233276367, "learning_rate": 4.8467379376222215e-06, "loss": 1.3736, "step": 1100 }, { "epoch": 3.1643835616438354, "grad_norm": 4.021303176879883, "learning_rate": 4.815075783666952e-06, "loss": 1.3547, "step": 1155 }, { 
"epoch": 3.315068493150685, "grad_norm": 4.797937393188477, "learning_rate": 4.780566674066782e-06, "loss": 1.3671, "step": 1210 }, { "epoch": 3.4657534246575343, "grad_norm": 4.535392761230469, "learning_rate": 4.743253046268069e-06, "loss": 1.3545, "step": 1265 }, { "epoch": 3.616438356164384, "grad_norm": 4.504812717437744, "learning_rate": 4.703180786562761e-06, "loss": 1.3623, "step": 1320 }, { "epoch": 3.767123287671233, "grad_norm": 4.607705116271973, "learning_rate": 4.660399173659908e-06, "loss": 1.3487, "step": 1375 }, { "epoch": 3.9178082191780823, "grad_norm": 4.659298896789551, "learning_rate": 4.6149608180853545e-06, "loss": 1.3502, "step": 1430 }, { "epoch": 4.0, "eval_loss": 1.3703773021697998, "eval_runtime": 41.7996, "eval_samples_per_second": 23.924, "eval_steps_per_second": 2.99, "step": 1460 }, { "epoch": 4.068493150684931, "grad_norm": 4.691000461578369, "learning_rate": 4.566921597484149e-06, "loss": 1.3453, "step": 1485 }, { "epoch": 4.219178082191781, "grad_norm": 4.80633020401001, "learning_rate": 4.51634058790522e-06, "loss": 1.3329, "step": 1540 }, { "epoch": 4.36986301369863, "grad_norm": 5.040696144104004, "learning_rate": 4.463279991152828e-06, "loss": 1.3329, "step": 1595 }, { "epoch": 4.52054794520548, "grad_norm": 5.084527015686035, "learning_rate": 4.407805058294135e-06, "loss": 1.3453, "step": 1650 }, { "epoch": 4.671232876712329, "grad_norm": 5.078038692474365, "learning_rate": 4.349984009416952e-06, "loss": 1.3266, "step": 1705 }, { "epoch": 4.821917808219178, "grad_norm": 5.201215744018555, "learning_rate": 4.289887949736347e-06, "loss": 1.3281, "step": 1760 }, { "epoch": 4.972602739726027, "grad_norm": 4.974658966064453, "learning_rate": 4.227590782153277e-06, "loss": 1.3168, "step": 1815 }, { "epoch": 5.0, "eval_loss": 1.3636702299118042, "eval_runtime": 41.8147, "eval_samples_per_second": 23.915, "eval_steps_per_second": 2.989, "step": 1825 }, { "epoch": 5.123287671232877, "grad_norm": 5.115445137023926, "learning_rate": 4.16316911637277e-06, "loss": 1.3135, "step": 1870 }, { "epoch": 5.273972602739726, "grad_norm": 5.82274055480957, "learning_rate": 4.0967021746934436e-06, "loss": 1.3107, "step": 1925 }, { "epoch": 5.424657534246576, "grad_norm": 5.606359481811523, "learning_rate": 4.02827169458417e-06, "loss": 1.301, "step": 1980 }, { "epoch": 5.575342465753424, "grad_norm": 5.442434787750244, "learning_rate": 3.957961828167748e-06, "loss": 1.3171, "step": 2035 }, { "epoch": 5.726027397260274, "grad_norm": 5.444327354431152, "learning_rate": 3.885859038735141e-06, "loss": 1.3045, "step": 2090 }, { "epoch": 5.876712328767123, "grad_norm": 5.671774864196777, "learning_rate": 3.8120519944175767e-06, "loss": 1.3036, "step": 2145 }, { "epoch": 6.0, "eval_loss": 1.353081464767456, "eval_runtime": 41.6872, "eval_samples_per_second": 23.988, "eval_steps_per_second": 2.999, "step": 2190 }, { "epoch": 6.027397260273973, "grad_norm": 5.856392860412598, "learning_rate": 3.7366314591472484e-06, "loss": 1.2882, "step": 2200 }, { "epoch": 6.178082191780822, "grad_norm": 6.328695774078369, "learning_rate": 3.659690181040717e-06, "loss": 1.2881, "step": 2255 }, { "epoch": 6.328767123287671, "grad_norm": 6.592623710632324, "learning_rate": 3.5813227783422654e-06, "loss": 1.278, "step": 2310 }, { "epoch": 6.47945205479452, "grad_norm": 6.272197723388672, "learning_rate": 3.5016256230674704e-06, "loss": 1.2799, "step": 2365 }, { "epoch": 6.63013698630137, "grad_norm": 6.509876251220703, "learning_rate": 3.4206967224900885e-06, "loss": 1.2866, "step": 2420 }, { 
"epoch": 6.780821917808219, "grad_norm": 6.4894304275512695, "learning_rate": 3.338635598617975e-06, "loss": 1.2952, "step": 2475 }, { "epoch": 6.931506849315069, "grad_norm": 6.477168560028076, "learning_rate": 3.2555431658062837e-06, "loss": 1.2752, "step": 2530 }, { "epoch": 7.0, "eval_loss": 1.349927544593811, "eval_runtime": 41.6959, "eval_samples_per_second": 23.983, "eval_steps_per_second": 2.998, "step": 2555 } ], "logging_steps": 55, "max_steps": 5475, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.363484660255949e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }