Qin Liu committed on
Commit
63a93d0
1 Parent(s): a24dc21

Model save

Browse files
Files changed (4) hide show
  1. README.md +61 -0
  2. all_results.json +9 -0
  3. train_results.json +9 -0
  4. trainer_state.json +1008 -0
README.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: meta-llama/Meta-Llama-3-8B-Instruct
3
+ library_name: peft
4
+ license: llama3
5
+ tags:
6
+ - trl
7
+ - sft
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: llama3-sudo-3epochs-mask
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # llama3-sudo-3epochs-mask
18
+
19
+ This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) on an unknown dataset.
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 0.0002
39
+ - train_batch_size: 8
40
+ - eval_batch_size: 8
41
+ - seed: 42
42
+ - distributed_type: multi-GPU
43
+ - num_devices: 8
44
+ - total_train_batch_size: 64
45
+ - total_eval_batch_size: 64
46
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
47
+ - lr_scheduler_type: cosine
48
+ - lr_scheduler_warmup_ratio: 0.1
49
+ - num_epochs: 3
50
+
51
+ ### Training results
52
+
53
+
54
+
55
+ ### Framework versions
56
+
57
+ - PEFT 0.12.0
58
+ - Transformers 4.44.0
59
+ - Pytorch 2.1.2
60
+ - Datasets 2.20.0
61
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "total_flos": 1562393690767360.0,
4
+ "train_loss": 1.5581738730184898,
5
+ "train_runtime": 1339.3383,
6
+ "train_samples": 14642,
7
+ "train_samples_per_second": 32.797,
8
+ "train_steps_per_second": 0.513
9
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "total_flos": 1562393690767360.0,
4
+ "train_loss": 1.5581738730184898,
5
+ "train_runtime": 1339.3383,
6
+ "train_samples": 14642,
7
+ "train_samples_per_second": 32.797,
8
+ "train_steps_per_second": 0.513
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,1008 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 500,
6
+ "global_step": 687,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.004366812227074236,
13
+ "grad_norm": 1.8753604454650958,
14
+ "learning_rate": 2.898550724637681e-06,
15
+ "loss": 3.7085,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.021834061135371178,
20
+ "grad_norm": 1.801543636467953,
21
+ "learning_rate": 1.4492753623188407e-05,
22
+ "loss": 3.628,
23
+ "step": 5
24
+ },
25
+ {
26
+ "epoch": 0.043668122270742356,
27
+ "grad_norm": 2.2358499812546344,
28
+ "learning_rate": 2.8985507246376814e-05,
29
+ "loss": 3.5205,
30
+ "step": 10
31
+ },
32
+ {
33
+ "epoch": 0.06550218340611354,
34
+ "grad_norm": 3.2738653822216937,
35
+ "learning_rate": 4.347826086956522e-05,
36
+ "loss": 3.3516,
37
+ "step": 15
38
+ },
39
+ {
40
+ "epoch": 0.08733624454148471,
41
+ "grad_norm": 1.2942851623807274,
42
+ "learning_rate": 5.797101449275363e-05,
43
+ "loss": 2.7683,
44
+ "step": 20
45
+ },
46
+ {
47
+ "epoch": 0.1091703056768559,
48
+ "grad_norm": 0.6794530467895241,
49
+ "learning_rate": 7.246376811594203e-05,
50
+ "loss": 2.4019,
51
+ "step": 25
52
+ },
53
+ {
54
+ "epoch": 0.13100436681222707,
55
+ "grad_norm": 1.2145036683171,
56
+ "learning_rate": 8.695652173913044e-05,
57
+ "loss": 2.1421,
58
+ "step": 30
59
+ },
60
+ {
61
+ "epoch": 0.15283842794759825,
62
+ "grad_norm": 0.8541800089072681,
63
+ "learning_rate": 0.00010144927536231885,
64
+ "loss": 1.9309,
65
+ "step": 35
66
+ },
67
+ {
68
+ "epoch": 0.17467248908296942,
69
+ "grad_norm": 0.5114546320288847,
70
+ "learning_rate": 0.00011594202898550725,
71
+ "loss": 1.9047,
72
+ "step": 40
73
+ },
74
+ {
75
+ "epoch": 0.1965065502183406,
76
+ "grad_norm": 0.4569825196048275,
77
+ "learning_rate": 0.00013043478260869567,
78
+ "loss": 1.8231,
79
+ "step": 45
80
+ },
81
+ {
82
+ "epoch": 0.2183406113537118,
83
+ "grad_norm": 0.5192344724636687,
84
+ "learning_rate": 0.00014492753623188405,
85
+ "loss": 1.7305,
86
+ "step": 50
87
+ },
88
+ {
89
+ "epoch": 0.24017467248908297,
90
+ "grad_norm": 0.34840209414620005,
91
+ "learning_rate": 0.00015942028985507247,
92
+ "loss": 1.7933,
93
+ "step": 55
94
+ },
95
+ {
96
+ "epoch": 0.26200873362445415,
97
+ "grad_norm": 0.3678371866239178,
98
+ "learning_rate": 0.00017391304347826088,
99
+ "loss": 1.6143,
100
+ "step": 60
101
+ },
102
+ {
103
+ "epoch": 0.2838427947598253,
104
+ "grad_norm": 0.35400819586929,
105
+ "learning_rate": 0.00018840579710144927,
106
+ "loss": 1.7367,
107
+ "step": 65
108
+ },
109
+ {
110
+ "epoch": 0.3056768558951965,
111
+ "grad_norm": 0.38841557534962395,
112
+ "learning_rate": 0.00019999870791268066,
113
+ "loss": 1.6669,
114
+ "step": 70
115
+ },
116
+ {
117
+ "epoch": 0.32751091703056767,
118
+ "grad_norm": 0.4474417485361201,
119
+ "learning_rate": 0.00019995348836233516,
120
+ "loss": 1.601,
121
+ "step": 75
122
+ },
123
+ {
124
+ "epoch": 0.34934497816593885,
125
+ "grad_norm": 0.35698828488835516,
126
+ "learning_rate": 0.00019984369783193688,
127
+ "loss": 1.5357,
128
+ "step": 80
129
+ },
130
+ {
131
+ "epoch": 0.37117903930131,
132
+ "grad_norm": 0.3585527157477228,
133
+ "learning_rate": 0.00019966940724729603,
134
+ "loss": 1.6221,
135
+ "step": 85
136
+ },
137
+ {
138
+ "epoch": 0.3930131004366812,
139
+ "grad_norm": 0.34800578812888927,
140
+ "learning_rate": 0.0001994307292019204,
141
+ "loss": 1.6741,
142
+ "step": 90
143
+ },
144
+ {
145
+ "epoch": 0.4148471615720524,
146
+ "grad_norm": 0.35237447031832875,
147
+ "learning_rate": 0.0001991278178842786,
148
+ "loss": 1.5682,
149
+ "step": 95
150
+ },
151
+ {
152
+ "epoch": 0.4366812227074236,
153
+ "grad_norm": 0.39076567242886867,
154
+ "learning_rate": 0.00019876086897819284,
155
+ "loss": 1.5629,
156
+ "step": 100
157
+ },
158
+ {
159
+ "epoch": 0.4585152838427948,
160
+ "grad_norm": 0.34983112900974217,
161
+ "learning_rate": 0.00019833011953642525,
162
+ "loss": 1.6151,
163
+ "step": 105
164
+ },
165
+ {
166
+ "epoch": 0.48034934497816595,
167
+ "grad_norm": 0.34530341844436674,
168
+ "learning_rate": 0.00019783584782753918,
169
+ "loss": 1.5494,
170
+ "step": 110
171
+ },
172
+ {
173
+ "epoch": 0.5021834061135371,
174
+ "grad_norm": 0.38496791747153597,
175
+ "learning_rate": 0.00019727837315613504,
176
+ "loss": 1.5526,
177
+ "step": 115
178
+ },
179
+ {
180
+ "epoch": 0.5240174672489083,
181
+ "grad_norm": 0.35057371546794386,
182
+ "learning_rate": 0.00019665805565657603,
183
+ "loss": 1.5933,
184
+ "step": 120
185
+ },
186
+ {
187
+ "epoch": 0.5458515283842795,
188
+ "grad_norm": 0.35636722375213375,
189
+ "learning_rate": 0.00019597529606033782,
190
+ "loss": 1.5726,
191
+ "step": 125
192
+ },
193
+ {
194
+ "epoch": 0.5676855895196506,
195
+ "grad_norm": 0.375075170102756,
196
+ "learning_rate": 0.0001952305354371319,
197
+ "loss": 1.572,
198
+ "step": 130
199
+ },
200
+ {
201
+ "epoch": 0.5895196506550219,
202
+ "grad_norm": 0.38062791914729815,
203
+ "learning_rate": 0.00019442425490996988,
204
+ "loss": 1.5393,
205
+ "step": 135
206
+ },
207
+ {
208
+ "epoch": 0.611353711790393,
209
+ "grad_norm": 0.3465840806200646,
210
+ "learning_rate": 0.0001935569753443532,
211
+ "loss": 1.502,
212
+ "step": 140
213
+ },
214
+ {
215
+ "epoch": 0.6331877729257642,
216
+ "grad_norm": 0.3566623409862706,
217
+ "learning_rate": 0.00019262925701178866,
218
+ "loss": 1.6075,
219
+ "step": 145
220
+ },
221
+ {
222
+ "epoch": 0.6550218340611353,
223
+ "grad_norm": 0.3513669246328785,
224
+ "learning_rate": 0.00019164169922784716,
225
+ "loss": 1.5125,
226
+ "step": 150
227
+ },
228
+ {
229
+ "epoch": 0.6768558951965066,
230
+ "grad_norm": 0.34755655244144995,
231
+ "learning_rate": 0.00019059493996499986,
232
+ "loss": 1.6031,
233
+ "step": 155
234
+ },
235
+ {
236
+ "epoch": 0.6986899563318777,
237
+ "grad_norm": 0.35330261801172397,
238
+ "learning_rate": 0.00018948965544048128,
239
+ "loss": 1.5862,
240
+ "step": 160
241
+ },
242
+ {
243
+ "epoch": 0.7205240174672489,
244
+ "grad_norm": 0.3540773871103764,
245
+ "learning_rate": 0.00018832655967944607,
246
+ "loss": 1.6557,
247
+ "step": 165
248
+ },
249
+ {
250
+ "epoch": 0.74235807860262,
251
+ "grad_norm": 0.3385589855658203,
252
+ "learning_rate": 0.00018710640405370145,
253
+ "loss": 1.5771,
254
+ "step": 170
255
+ },
256
+ {
257
+ "epoch": 0.7641921397379913,
258
+ "grad_norm": 0.3948832031761469,
259
+ "learning_rate": 0.00018582997679631315,
260
+ "loss": 1.5896,
261
+ "step": 175
262
+ },
263
+ {
264
+ "epoch": 0.7860262008733624,
265
+ "grad_norm": 0.3414806859938428,
266
+ "learning_rate": 0.00018449810249239902,
267
+ "loss": 1.5278,
268
+ "step": 180
269
+ },
270
+ {
271
+ "epoch": 0.8078602620087336,
272
+ "grad_norm": 0.3306470223687427,
273
+ "learning_rate": 0.00018311164154643836,
274
+ "loss": 1.4916,
275
+ "step": 185
276
+ },
277
+ {
278
+ "epoch": 0.8296943231441049,
279
+ "grad_norm": 0.37143716375731756,
280
+ "learning_rate": 0.00018167148962644193,
281
+ "loss": 1.625,
282
+ "step": 190
283
+ },
284
+ {
285
+ "epoch": 0.851528384279476,
286
+ "grad_norm": 0.3610023951445998,
287
+ "learning_rate": 0.00018017857708534107,
288
+ "loss": 1.6859,
289
+ "step": 195
290
+ },
291
+ {
292
+ "epoch": 0.8733624454148472,
293
+ "grad_norm": 0.3969039590867808,
294
+ "learning_rate": 0.00017863386835997028,
295
+ "loss": 1.6366,
296
+ "step": 200
297
+ },
298
+ {
299
+ "epoch": 0.8951965065502183,
300
+ "grad_norm": 0.3466218548962297,
301
+ "learning_rate": 0.00017703836134803105,
302
+ "loss": 1.4699,
303
+ "step": 205
304
+ },
305
+ {
306
+ "epoch": 0.9170305676855895,
307
+ "grad_norm": 0.35771965160066704,
308
+ "learning_rate": 0.00017539308676343973,
309
+ "loss": 1.5723,
310
+ "step": 210
311
+ },
312
+ {
313
+ "epoch": 0.9388646288209607,
314
+ "grad_norm": 0.37691637209544226,
315
+ "learning_rate": 0.00017369910747047572,
316
+ "loss": 1.584,
317
+ "step": 215
318
+ },
319
+ {
320
+ "epoch": 0.9606986899563319,
321
+ "grad_norm": 0.40411027091446045,
322
+ "learning_rate": 0.00017195751779716027,
323
+ "loss": 1.6019,
324
+ "step": 220
325
+ },
326
+ {
327
+ "epoch": 0.982532751091703,
328
+ "grad_norm": 0.3742848841742168,
329
+ "learning_rate": 0.00017016944282830933,
330
+ "loss": 1.4947,
331
+ "step": 225
332
+ },
333
+ {
334
+ "epoch": 1.0043668122270741,
335
+ "grad_norm": 0.38407235492099207,
336
+ "learning_rate": 0.00016833603767871713,
337
+ "loss": 1.5812,
338
+ "step": 230
339
+ },
340
+ {
341
+ "epoch": 1.0262008733624455,
342
+ "grad_norm": 0.37035584561109924,
343
+ "learning_rate": 0.0001664584867469403,
344
+ "loss": 1.5113,
345
+ "step": 235
346
+ },
347
+ {
348
+ "epoch": 1.0480349344978166,
349
+ "grad_norm": 0.4171769928455508,
350
+ "learning_rate": 0.0001645380029501641,
351
+ "loss": 1.427,
352
+ "step": 240
353
+ },
354
+ {
355
+ "epoch": 1.0698689956331877,
356
+ "grad_norm": 0.4014807349448744,
357
+ "learning_rate": 0.00016257582694064558,
358
+ "loss": 1.488,
359
+ "step": 245
360
+ },
361
+ {
362
+ "epoch": 1.091703056768559,
363
+ "grad_norm": 0.41603472494317567,
364
+ "learning_rate": 0.00016057322630423935,
365
+ "loss": 1.4085,
366
+ "step": 250
367
+ },
368
+ {
369
+ "epoch": 1.1135371179039302,
370
+ "grad_norm": 0.41696638930397173,
371
+ "learning_rate": 0.00015853149474152423,
372
+ "loss": 1.417,
373
+ "step": 255
374
+ },
375
+ {
376
+ "epoch": 1.1353711790393013,
377
+ "grad_norm": 0.42289781015455985,
378
+ "learning_rate": 0.0001564519512320593,
379
+ "loss": 1.4374,
380
+ "step": 260
381
+ },
382
+ {
383
+ "epoch": 1.1572052401746724,
384
+ "grad_norm": 0.4136181415806817,
385
+ "learning_rate": 0.00015433593918230955,
386
+ "loss": 1.5384,
387
+ "step": 265
388
+ },
389
+ {
390
+ "epoch": 1.1790393013100438,
391
+ "grad_norm": 0.4304951924614819,
392
+ "learning_rate": 0.00015218482555779165,
393
+ "loss": 1.4184,
394
+ "step": 270
395
+ },
396
+ {
397
+ "epoch": 1.2008733624454149,
398
+ "grad_norm": 0.4567200712003195,
399
+ "learning_rate": 0.00015000000000000001,
400
+ "loss": 1.5038,
401
+ "step": 275
402
+ },
403
+ {
404
+ "epoch": 1.222707423580786,
405
+ "grad_norm": 0.418159543142196,
406
+ "learning_rate": 0.00014778287392868417,
407
+ "loss": 1.4477,
408
+ "step": 280
409
+ },
410
+ {
411
+ "epoch": 1.244541484716157,
412
+ "grad_norm": 0.43554491426674474,
413
+ "learning_rate": 0.0001455348796300571,
414
+ "loss": 1.362,
415
+ "step": 285
416
+ },
417
+ {
418
+ "epoch": 1.2663755458515285,
419
+ "grad_norm": 0.4303389630106061,
420
+ "learning_rate": 0.0001432574693315238,
421
+ "loss": 1.5283,
422
+ "step": 290
423
+ },
424
+ {
425
+ "epoch": 1.2882096069868996,
426
+ "grad_norm": 0.4434624782109262,
427
+ "learning_rate": 0.0001409521142635272,
428
+ "loss": 1.5192,
429
+ "step": 295
430
+ },
431
+ {
432
+ "epoch": 1.3100436681222707,
433
+ "grad_norm": 0.4160064760695591,
434
+ "learning_rate": 0.0001386203037091183,
435
+ "loss": 1.5222,
436
+ "step": 300
437
+ },
438
+ {
439
+ "epoch": 1.3318777292576418,
440
+ "grad_norm": 0.44897734162151265,
441
+ "learning_rate": 0.00013626354404186404,
442
+ "loss": 1.5612,
443
+ "step": 305
444
+ },
445
+ {
446
+ "epoch": 1.3537117903930131,
447
+ "grad_norm": 0.4378993469096492,
448
+ "learning_rate": 0.00013388335775271467,
449
+ "loss": 1.4442,
450
+ "step": 310
451
+ },
452
+ {
453
+ "epoch": 1.3755458515283843,
454
+ "grad_norm": 0.4403089794683692,
455
+ "learning_rate": 0.0001314812824664585,
456
+ "loss": 1.5151,
457
+ "step": 315
458
+ },
459
+ {
460
+ "epoch": 1.3973799126637554,
461
+ "grad_norm": 0.44038010399377847,
462
+ "learning_rate": 0.000129058869948401,
463
+ "loss": 1.5635,
464
+ "step": 320
465
+ },
466
+ {
467
+ "epoch": 1.4192139737991267,
468
+ "grad_norm": 0.4640302354831066,
469
+ "learning_rate": 0.00012661768510190816,
470
+ "loss": 1.4904,
471
+ "step": 325
472
+ },
473
+ {
474
+ "epoch": 1.4410480349344978,
475
+ "grad_norm": 0.45432337967027536,
476
+ "learning_rate": 0.00012415930495746302,
477
+ "loss": 1.5045,
478
+ "step": 330
479
+ },
480
+ {
481
+ "epoch": 1.462882096069869,
482
+ "grad_norm": 0.451657842641418,
483
+ "learning_rate": 0.00012168531765388755,
484
+ "loss": 1.392,
485
+ "step": 335
486
+ },
487
+ {
488
+ "epoch": 1.48471615720524,
489
+ "grad_norm": 0.43285118470181244,
490
+ "learning_rate": 0.00011919732141238898,
491
+ "loss": 1.461,
492
+ "step": 340
493
+ },
494
+ {
495
+ "epoch": 1.5065502183406112,
496
+ "grad_norm": 0.4819434243733131,
497
+ "learning_rate": 0.00011669692350409223,
498
+ "loss": 1.6045,
499
+ "step": 345
500
+ },
501
+ {
502
+ "epoch": 1.5283842794759825,
503
+ "grad_norm": 0.440140827625262,
504
+ "learning_rate": 0.00011418573921172635,
505
+ "loss": 1.5039,
506
+ "step": 350
507
+ },
508
+ {
509
+ "epoch": 1.5502183406113537,
510
+ "grad_norm": 0.4475130341158297,
511
+ "learning_rate": 0.00011166539078613525,
512
+ "loss": 1.5163,
513
+ "step": 355
514
+ },
515
+ {
516
+ "epoch": 1.572052401746725,
517
+ "grad_norm": 0.4563951417730544,
518
+ "learning_rate": 0.00010913750639828711,
519
+ "loss": 1.4896,
520
+ "step": 360
521
+ },
522
+ {
523
+ "epoch": 1.5938864628820961,
524
+ "grad_norm": 0.4570029980279654,
525
+ "learning_rate": 0.0001066037190874591,
526
+ "loss": 1.4318,
527
+ "step": 365
528
+ },
529
+ {
530
+ "epoch": 1.6157205240174672,
531
+ "grad_norm": 0.45220265285594413,
532
+ "learning_rate": 0.00010406566570627713,
533
+ "loss": 1.484,
534
+ "step": 370
535
+ },
536
+ {
537
+ "epoch": 1.6375545851528384,
538
+ "grad_norm": 0.43217867151367745,
539
+ "learning_rate": 0.0001015249858632926,
540
+ "loss": 1.3569,
541
+ "step": 375
542
+ },
543
+ {
544
+ "epoch": 1.6593886462882095,
545
+ "grad_norm": 0.45524805316851796,
546
+ "learning_rate": 9.898332086377805e-05,
547
+ "loss": 1.5309,
548
+ "step": 380
549
+ },
550
+ {
551
+ "epoch": 1.6812227074235808,
552
+ "grad_norm": 0.45864967058418454,
553
+ "learning_rate": 9.644231264942724e-05,
554
+ "loss": 1.5568,
555
+ "step": 385
556
+ },
557
+ {
558
+ "epoch": 1.703056768558952,
559
+ "grad_norm": 0.46156320050595717,
560
+ "learning_rate": 9.390360273764411e-05,
561
+ "loss": 1.5541,
562
+ "step": 390
563
+ },
564
+ {
565
+ "epoch": 1.7248908296943233,
566
+ "grad_norm": 0.4424924245408702,
567
+ "learning_rate": 9.136883116110542e-05,
568
+ "loss": 1.4779,
569
+ "step": 395
570
+ },
571
+ {
572
+ "epoch": 1.7467248908296944,
573
+ "grad_norm": 0.4631844693338652,
574
+ "learning_rate": 8.88396354082829e-05,
575
+ "loss": 1.5041,
576
+ "step": 400
577
+ },
578
+ {
579
+ "epoch": 1.7685589519650655,
580
+ "grad_norm": 0.49717848531939474,
581
+ "learning_rate": 8.6317649365609e-05,
582
+ "loss": 1.4976,
583
+ "step": 405
584
+ },
585
+ {
586
+ "epoch": 1.7903930131004366,
587
+ "grad_norm": 0.44110942790393187,
588
+ "learning_rate": 8.380450226196925e-05,
589
+ "loss": 1.3881,
590
+ "step": 410
591
+ },
592
+ {
593
+ "epoch": 1.8122270742358078,
594
+ "grad_norm": 0.5089589581007182,
595
+ "learning_rate": 8.130181761620392e-05,
596
+ "loss": 1.4779,
597
+ "step": 415
598
+ },
599
+ {
600
+ "epoch": 1.8340611353711789,
601
+ "grad_norm": 0.4698355464759574,
602
+ "learning_rate": 7.881121218829787e-05,
603
+ "loss": 1.4198,
604
+ "step": 420
605
+ },
606
+ {
607
+ "epoch": 1.8558951965065502,
608
+ "grad_norm": 0.4659031789449208,
609
+ "learning_rate": 7.63342949349373e-05,
610
+ "loss": 1.4861,
611
+ "step": 425
612
+ },
613
+ {
614
+ "epoch": 1.8777292576419216,
615
+ "grad_norm": 0.44894777329606517,
616
+ "learning_rate": 7.387266597010704e-05,
617
+ "loss": 1.503,
618
+ "step": 430
619
+ },
620
+ {
621
+ "epoch": 1.8995633187772927,
622
+ "grad_norm": 0.4475374854045599,
623
+ "learning_rate": 7.142791553140045e-05,
624
+ "loss": 1.5077,
625
+ "step": 435
626
+ },
627
+ {
628
+ "epoch": 1.9213973799126638,
629
+ "grad_norm": 0.4750409585386964,
630
+ "learning_rate": 6.900162295270968e-05,
631
+ "loss": 1.515,
632
+ "step": 440
633
+ },
634
+ {
635
+ "epoch": 1.943231441048035,
636
+ "grad_norm": 0.4786776337696274,
637
+ "learning_rate": 6.659535564395982e-05,
638
+ "loss": 1.5167,
639
+ "step": 445
640
+ },
641
+ {
642
+ "epoch": 1.965065502183406,
643
+ "grad_norm": 0.5012494244267932,
644
+ "learning_rate": 6.421066807854584e-05,
645
+ "loss": 1.5364,
646
+ "step": 450
647
+ },
648
+ {
649
+ "epoch": 1.9868995633187772,
650
+ "grad_norm": 0.445678005467061,
651
+ "learning_rate": 6.184910078912687e-05,
652
+ "loss": 1.4215,
653
+ "step": 455
654
+ },
655
+ {
656
+ "epoch": 2.0087336244541483,
657
+ "grad_norm": 0.4871967245962684,
658
+ "learning_rate": 5.9512179372426325e-05,
659
+ "loss": 1.4481,
660
+ "step": 460
661
+ },
662
+ {
663
+ "epoch": 2.03056768558952,
664
+ "grad_norm": 0.48019280605703346,
665
+ "learning_rate": 5.720141350368072e-05,
666
+ "loss": 1.45,
667
+ "step": 465
668
+ },
669
+ {
670
+ "epoch": 2.052401746724891,
671
+ "grad_norm": 0.5224256236632303,
672
+ "learning_rate": 5.4918295961373923e-05,
673
+ "loss": 1.4061,
674
+ "step": 470
675
+ },
676
+ {
677
+ "epoch": 2.074235807860262,
678
+ "grad_norm": 0.5108294729537572,
679
+ "learning_rate": 5.266430166288705e-05,
680
+ "loss": 1.3943,
681
+ "step": 475
682
+ },
683
+ {
684
+ "epoch": 2.096069868995633,
685
+ "grad_norm": 0.5371496292909286,
686
+ "learning_rate": 5.044088671168644e-05,
687
+ "loss": 1.3578,
688
+ "step": 480
689
+ },
690
+ {
691
+ "epoch": 2.1179039301310043,
692
+ "grad_norm": 0.5481243225476391,
693
+ "learning_rate": 4.824948745666621e-05,
694
+ "loss": 1.3686,
695
+ "step": 485
696
+ },
697
+ {
698
+ "epoch": 2.1397379912663754,
699
+ "grad_norm": 0.5490865552703573,
700
+ "learning_rate": 4.6091519564251793e-05,
701
+ "loss": 1.3655,
702
+ "step": 490
703
+ },
704
+ {
705
+ "epoch": 2.1615720524017465,
706
+ "grad_norm": 0.5341608991319753,
707
+ "learning_rate": 4.3968377103865024e-05,
708
+ "loss": 1.3681,
709
+ "step": 495
710
+ },
711
+ {
712
+ "epoch": 2.183406113537118,
713
+ "grad_norm": 0.5539995610447883,
714
+ "learning_rate": 4.1881431647341054e-05,
715
+ "loss": 1.3703,
716
+ "step": 500
717
+ },
718
+ {
719
+ "epoch": 2.2052401746724892,
720
+ "grad_norm": 0.5617916414503629,
721
+ "learning_rate": 3.9832031382878766e-05,
722
+ "loss": 1.3506,
723
+ "step": 505
724
+ },
725
+ {
726
+ "epoch": 2.2270742358078603,
727
+ "grad_norm": 0.5457137690561883,
728
+ "learning_rate": 3.7821500244097274e-05,
729
+ "loss": 1.33,
730
+ "step": 510
731
+ },
732
+ {
733
+ "epoch": 2.2489082969432315,
734
+ "grad_norm": 0.5996161081223403,
735
+ "learning_rate": 3.585113705476143e-05,
736
+ "loss": 1.389,
737
+ "step": 515
738
+ },
739
+ {
740
+ "epoch": 2.2707423580786026,
741
+ "grad_norm": 0.5451761149567781,
742
+ "learning_rate": 3.392221468972805e-05,
743
+ "loss": 1.3908,
744
+ "step": 520
745
+ },
746
+ {
747
+ "epoch": 2.2925764192139737,
748
+ "grad_norm": 0.6118020705146557,
749
+ "learning_rate": 3.203597925265598e-05,
750
+ "loss": 1.3559,
751
+ "step": 525
752
+ },
753
+ {
754
+ "epoch": 2.314410480349345,
755
+ "grad_norm": 0.5671066923036526,
756
+ "learning_rate": 3.0193649271010095e-05,
757
+ "loss": 1.3478,
758
+ "step": 530
759
+ },
760
+ {
761
+ "epoch": 2.3362445414847164,
762
+ "grad_norm": 0.6001118016692729,
763
+ "learning_rate": 2.8396414908880098e-05,
764
+ "loss": 1.3509,
765
+ "step": 535
766
+ },
767
+ {
768
+ "epoch": 2.3580786026200875,
769
+ "grad_norm": 0.5894837707963337,
770
+ "learning_rate": 2.6645437198122502e-05,
771
+ "loss": 1.4214,
772
+ "step": 540
773
+ },
774
+ {
775
+ "epoch": 2.3799126637554586,
776
+ "grad_norm": 0.5845209440156467,
777
+ "learning_rate": 2.4941847288321797e-05,
778
+ "loss": 1.3788,
779
+ "step": 545
780
+ },
781
+ {
782
+ "epoch": 2.4017467248908297,
783
+ "grad_norm": 0.5895995554814336,
784
+ "learning_rate": 2.328674571605637e-05,
785
+ "loss": 1.391,
786
+ "step": 550
787
+ },
788
+ {
789
+ "epoch": 2.423580786026201,
790
+ "grad_norm": 0.6008227175127105,
791
+ "learning_rate": 2.1681201693940668e-05,
792
+ "loss": 1.4373,
793
+ "step": 555
794
+ },
795
+ {
796
+ "epoch": 2.445414847161572,
797
+ "grad_norm": 0.6000811157919661,
798
+ "learning_rate": 2.0126252419902614e-05,
799
+ "loss": 1.4406,
800
+ "step": 560
801
+ },
802
+ {
803
+ "epoch": 2.467248908296943,
804
+ "grad_norm": 0.5911207625229586,
805
+ "learning_rate": 1.8622902407143394e-05,
806
+ "loss": 1.5141,
807
+ "step": 565
808
+ },
809
+ {
810
+ "epoch": 2.489082969432314,
811
+ "grad_norm": 0.615377574511787,
812
+ "learning_rate": 1.7172122835211337e-05,
813
+ "loss": 1.3896,
814
+ "step": 570
815
+ },
816
+ {
817
+ "epoch": 2.5109170305676853,
818
+ "grad_norm": 0.5857047240018323,
819
+ "learning_rate": 1.577485092261012e-05,
820
+ "loss": 1.4468,
821
+ "step": 575
822
+ },
823
+ {
824
+ "epoch": 2.532751091703057,
825
+ "grad_norm": 0.6061359574672782,
826
+ "learning_rate": 1.4431989321345974e-05,
827
+ "loss": 1.4299,
828
+ "step": 580
829
+ },
830
+ {
831
+ "epoch": 2.554585152838428,
832
+ "grad_norm": 0.6360146523250017,
833
+ "learning_rate": 1.3144405533805138e-05,
834
+ "loss": 1.4552,
835
+ "step": 585
836
+ },
837
+ {
838
+ "epoch": 2.576419213973799,
839
+ "grad_norm": 0.6147809358329509,
840
+ "learning_rate": 1.191293135233844e-05,
841
+ "loss": 1.3359,
842
+ "step": 590
843
+ },
844
+ {
845
+ "epoch": 2.5982532751091703,
846
+ "grad_norm": 0.6021229536534441,
847
+ "learning_rate": 1.0738362321914997e-05,
848
+ "loss": 1.3927,
849
+ "step": 595
850
+ },
851
+ {
852
+ "epoch": 2.6200873362445414,
853
+ "grad_norm": 0.5961109142620634,
854
+ "learning_rate": 9.62145722619182e-06,
855
+ "loss": 1.4896,
856
+ "step": 600
857
+ },
858
+ {
859
+ "epoch": 2.641921397379913,
860
+ "grad_norm": 0.578602732768201,
861
+ "learning_rate": 8.562937597331899e-06,
862
+ "loss": 1.4565,
863
+ "step": 605
864
+ },
865
+ {
866
+ "epoch": 2.6637554585152836,
867
+ "grad_norm": 0.6458615704409548,
868
+ "learning_rate": 7.563487249887024e-06,
869
+ "loss": 1.4511,
870
+ "step": 610
871
+ },
872
+ {
873
+ "epoch": 2.685589519650655,
874
+ "grad_norm": 0.6022868302443363,
875
+ "learning_rate": 6.623751839046455e-06,
876
+ "loss": 1.3836,
877
+ "step": 615
878
+ },
879
+ {
880
+ "epoch": 2.7074235807860263,
881
+ "grad_norm": 0.5988174075228108,
882
+ "learning_rate": 5.744338443537134e-06,
883
+ "loss": 1.4891,
884
+ "step": 620
885
+ },
886
+ {
887
+ "epoch": 2.7292576419213974,
888
+ "grad_norm": 0.6026020303199385,
889
+ "learning_rate": 4.92581517344457e-06,
890
+ "loss": 1.3409,
891
+ "step": 625
892
+ },
893
+ {
894
+ "epoch": 2.7510917030567685,
895
+ "grad_norm": 0.5919990549417811,
896
+ "learning_rate": 4.168710803207865e-06,
897
+ "loss": 1.4157,
898
+ "step": 630
899
+ },
900
+ {
901
+ "epoch": 2.7729257641921397,
902
+ "grad_norm": 0.6038824964947008,
903
+ "learning_rate": 3.473514430026026e-06,
904
+ "loss": 1.4138,
905
+ "step": 635
906
+ },
907
+ {
908
+ "epoch": 2.7947598253275108,
909
+ "grad_norm": 0.6135553904485512,
910
+ "learning_rate": 2.840675157896111e-06,
911
+ "loss": 1.411,
912
+ "step": 640
913
+ },
914
+ {
915
+ "epoch": 2.816593886462882,
916
+ "grad_norm": 0.6332119474956278,
917
+ "learning_rate": 2.2706018074875045e-06,
918
+ "loss": 1.2871,
919
+ "step": 645
920
+ },
921
+ {
922
+ "epoch": 2.8384279475982535,
923
+ "grad_norm": 0.607982963030175,
924
+ "learning_rate": 1.7636626520395105e-06,
925
+ "loss": 1.4341,
926
+ "step": 650
927
+ },
928
+ {
929
+ "epoch": 2.8602620087336246,
930
+ "grad_norm": 0.6326939787843192,
931
+ "learning_rate": 1.3201851794530373e-06,
932
+ "loss": 1.3992,
933
+ "step": 655
934
+ },
935
+ {
936
+ "epoch": 2.8820960698689957,
937
+ "grad_norm": 0.6624316672369768,
938
+ "learning_rate": 9.404558807301067e-07,
939
+ "loss": 1.424,
940
+ "step": 660
941
+ },
942
+ {
943
+ "epoch": 2.903930131004367,
944
+ "grad_norm": 0.5890404802190057,
945
+ "learning_rate": 6.247200648976991e-07,
946
+ "loss": 1.3734,
947
+ "step": 665
948
+ },
949
+ {
950
+ "epoch": 2.925764192139738,
951
+ "grad_norm": 0.588934601286785,
952
+ "learning_rate": 3.7318170053559644e-07,
953
+ "loss": 1.413,
954
+ "step": 670
955
+ },
956
+ {
957
+ "epoch": 2.947598253275109,
958
+ "grad_norm": 0.6171879119635909,
959
+ "learning_rate": 1.8600328401061629e-07,
960
+ "loss": 1.3424,
961
+ "step": 675
962
+ },
963
+ {
964
+ "epoch": 2.96943231441048,
965
+ "grad_norm": 0.6328278329175181,
966
+ "learning_rate": 6.33057345022281e-08,
967
+ "loss": 1.3699,
968
+ "step": 680
969
+ },
970
+ {
971
+ "epoch": 2.9912663755458517,
972
+ "grad_norm": 0.609229235169678,
973
+ "learning_rate": 5.1683158875937e-09,
974
+ "loss": 1.3262,
975
+ "step": 685
976
+ },
977
+ {
978
+ "epoch": 3.0,
979
+ "step": 687,
980
+ "total_flos": 1562393690767360.0,
981
+ "train_loss": 1.5581738730184898,
982
+ "train_runtime": 1339.3383,
983
+ "train_samples_per_second": 32.797,
984
+ "train_steps_per_second": 0.513
985
+ }
986
+ ],
987
+ "logging_steps": 5,
988
+ "max_steps": 687,
989
+ "num_input_tokens_seen": 0,
990
+ "num_train_epochs": 3,
991
+ "save_steps": 100,
992
+ "stateful_callbacks": {
993
+ "TrainerControl": {
994
+ "args": {
995
+ "should_epoch_stop": false,
996
+ "should_evaluate": false,
997
+ "should_log": false,
998
+ "should_save": true,
999
+ "should_training_stop": true
1000
+ },
1001
+ "attributes": {}
1002
+ }
1003
+ },
1004
+ "total_flos": 1562393690767360.0,
1005
+ "train_batch_size": 8,
1006
+ "trial_name": null,
1007
+ "trial_params": null
1008
+ }