Upload 13 files

Browse files

Files changed (13) hide show

README.md +52 -3
adapter_config.json +31 -0
adapter_model.safetensors +3 -0
all_results.json +14 -0
config.json +28 -0
eval_results.json +8 -0
special_tokens_map.json +24 -0
tokenizer.json +0 -0
tokenizer.model +3 -0
tokenizer_config.json +42 -0
train_results.json +9 -0
trainer_state.json +1641 -0
training_args.bin +3 -0

README.md CHANGED Viewed

@@ -1,3 +1,52 @@
----
-license: apache-2.0
----

+---
+license: apache-2.0
+library_name: peft
+tags:
+- alignment-handbook
+- generated_from_trainer
+base_model: h2oai/h2o-danube-1.8b-base
+datasets:
+- HuggingFaceH4/ultrachat_200k
+model-index:
+- name: zephyr-danube-sft-qlora
+  results: []
+---
+**Note**: This model card has been generated automatically according to the information the Trainer had access to.
+Visit the [project page](https://ritvik19.github.io/zephyr-mini/) to see the full description.
+# zephyr-danube-sft-qlora
+This model is a QLoRA (PEFT LoRA) adapter fine-tuned from [h2oai/h2o-danube-1.8b-base](https://huggingface.co/h2oai/h2o-danube-1.8b-base) on the [HuggingFaceH4/ultrachat_200k](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k) dataset.
+It achieves the following results on the evaluation set:
+- Loss: 1.0893
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 1
+- eval_batch_size: 1
+- seed: 42
+- distributed_type: multi-GPU
+- gradient_accumulation_steps: 128
+- total_train_batch_size: 128
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 1
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 1.0883        | 0.9998 | 1140 | 1.0893          |
+### Framework versions
+- PEFT 0.7.1
+- Transformers 4.40.1
+- Pytorch 2.1.2+cu121
+- Datasets 2.19.0
+- Tokenizers 0.19.1

adapter_config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "h2oai/h2o-danube-1.8b-base",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "gate_proj",
+    "q_proj",
+    "v_proj",
+    "up_proj",
+    "o_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM"
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4f8e9a952c22d7321bd74f842dfcf17749e2616e7af1b9f06e11bbf7b007226a
+size 34647816

all_results.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+    "epoch": 0.9997807498355624,
+    "eval_loss": 1.089297890663147,
+    "eval_runtime": 2019.9951,
+    "eval_samples": 23109,
+    "eval_samples_per_second": 7.996,
+    "eval_steps_per_second": 7.996,
+    "total_flos": 3.167597749864497e+18,
+    "train_loss": 0.80264539467661,
+    "train_runtime": 53655.2232,
+    "train_samples": 207864,
+    "train_samples_per_second": 2.72,
+    "train_steps_per_second": 0.021
+}

config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "_name_or_path": "h2oai/h2o-danube-1.8b-base",
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 6912,
+  "max_position_embeddings": 16384,
+  "model_type": "mistral",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "sliding_window": 4096,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.40.1",
+  "use_cache": true,
+  "vocab_size": 32000
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 0.9997807498355624,
+    "eval_loss": 1.089297890663147,
+    "eval_runtime": 2019.9951,
+    "eval_samples": 23109,
+    "eval_samples_per_second": 7.996,
+    "eval_steps_per_second": 7.996
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": false,
+  "model_max_length": 2048,
+  "pad_token": "</s>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.9997807498355624,
+    "total_flos": 3.167597749864497e+18,
+    "train_loss": 0.80264539467661,
+    "train_runtime": 53655.2232,
+    "train_samples": 207864,
+    "train_samples_per_second": 2.72,
+    "train_steps_per_second": 0.021
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1641 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9997807498355624,
+  "eval_steps": 500,
+  "global_step": 1140,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0008770006577504934,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 1.7543859649122807e-06,
+      "loss": 1.3755,
+      "step": 1
+    },
+    {
+      "epoch": 0.0043850032887524665,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 8.771929824561403e-06,
+      "loss": 1.37,
+      "step": 5
+    },
+    {
+      "epoch": 0.008770006577504933,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 1.7543859649122806e-05,
+      "loss": 1.3508,
+      "step": 10
+    },
+    {
+      "epoch": 0.0131550098662574,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 2.6315789473684212e-05,
+      "loss": 1.3572,
+      "step": 15
+    },
+    {
+      "epoch": 0.017540013155009866,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 3.508771929824561e-05,
+      "loss": 1.3438,
+      "step": 20
+    },
+    {
+      "epoch": 0.021925016443762334,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 4.3859649122807014e-05,
+      "loss": 1.3356,
+      "step": 25
+    },
+    {
+      "epoch": 0.0263100197325148,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.3111,
+      "step": 30
+    },
+    {
+      "epoch": 0.030695023021267268,
+      "grad_norm": 0.095703125,
+      "learning_rate": 6.140350877192983e-05,
+      "loss": 1.2578,
+      "step": 35
+    },
+    {
+      "epoch": 0.03508002631001973,
+      "grad_norm": 0.07421875,
+      "learning_rate": 7.017543859649122e-05,
+      "loss": 1.2393,
+      "step": 40
+    },
+    {
+      "epoch": 0.0394650295987722,
+      "grad_norm": 0.064453125,
+      "learning_rate": 7.894736842105263e-05,
+      "loss": 1.2206,
+      "step": 45
+    },
+    {
+      "epoch": 0.04385003288752467,
+      "grad_norm": 0.049072265625,
+      "learning_rate": 8.771929824561403e-05,
+      "loss": 1.1976,
+      "step": 50
+    },
+    {
+      "epoch": 0.048235036176277134,
+      "grad_norm": 0.044921875,
+      "learning_rate": 9.649122807017544e-05,
+      "loss": 1.1976,
+      "step": 55
+    },
+    {
+      "epoch": 0.0526200394650296,
+      "grad_norm": 0.042724609375,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 1.1789,
+      "step": 60
+    },
+    {
+      "epoch": 0.05700504275378206,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 0.00011403508771929824,
+      "loss": 1.1716,
+      "step": 65
+    },
+    {
+      "epoch": 0.061390046042534535,
+      "grad_norm": 0.044677734375,
+      "learning_rate": 0.00012280701754385965,
+      "loss": 1.1691,
+      "step": 70
+    },
+    {
+      "epoch": 0.065775049331287,
+      "grad_norm": 0.03857421875,
+      "learning_rate": 0.00013157894736842108,
+      "loss": 1.1533,
+      "step": 75
+    },
+    {
+      "epoch": 0.07016005262003946,
+      "grad_norm": 0.041748046875,
+      "learning_rate": 0.00014035087719298245,
+      "loss": 1.1353,
+      "step": 80
+    },
+    {
+      "epoch": 0.07454505590879193,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 0.00014912280701754387,
+      "loss": 1.1404,
+      "step": 85
+    },
+    {
+      "epoch": 0.0789300591975444,
+      "grad_norm": 0.046142578125,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 1.1498,
+      "step": 90
+    },
+    {
+      "epoch": 0.08331506248629686,
+      "grad_norm": 0.04443359375,
+      "learning_rate": 0.0001666666666666667,
+      "loss": 1.1125,
+      "step": 95
+    },
+    {
+      "epoch": 0.08770006577504934,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 0.00017543859649122806,
+      "loss": 1.1447,
+      "step": 100
+    },
+    {
+      "epoch": 0.0920850690638018,
+      "grad_norm": 0.05712890625,
+      "learning_rate": 0.00018421052631578948,
+      "loss": 1.1351,
+      "step": 105
+    },
+    {
+      "epoch": 0.09647007235255427,
+      "grad_norm": 0.04931640625,
+      "learning_rate": 0.00019298245614035088,
+      "loss": 1.1294,
+      "step": 110
+    },
+    {
+      "epoch": 0.10085507564130673,
+      "grad_norm": 0.05029296875,
+      "learning_rate": 0.00019999953121394002,
+      "loss": 1.1315,
+      "step": 115
+    },
+    {
+      "epoch": 0.1052400789300592,
+      "grad_norm": 0.060546875,
+      "learning_rate": 0.00019998312416333227,
+      "loss": 1.1284,
+      "step": 120
+    },
+    {
+      "epoch": 0.10962508221881166,
+      "grad_norm": 0.04931640625,
+      "learning_rate": 0.00019994328220474688,
+      "loss": 1.136,
+      "step": 125
+    },
+    {
+      "epoch": 0.11401008550756413,
+      "grad_norm": 0.052001953125,
+      "learning_rate": 0.0001998800146766861,
+      "loss": 1.1213,
+      "step": 130
+    },
+    {
+      "epoch": 0.11839508879631659,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.00019979333640833947,
+      "loss": 1.1152,
+      "step": 135
+    },
+    {
+      "epoch": 0.12278009208506907,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 0.00019968326771610797,
+      "loss": 1.1193,
+      "step": 140
+    },
+    {
+      "epoch": 0.12716509537382154,
+      "grad_norm": 0.05126953125,
+      "learning_rate": 0.0001995498343988421,
+      "loss": 1.121,
+      "step": 145
+    },
+    {
+      "epoch": 0.131550098662574,
+      "grad_norm": 0.055419921875,
+      "learning_rate": 0.00019939306773179497,
+      "loss": 1.1291,
+      "step": 150
+    },
+    {
+      "epoch": 0.13593510195132646,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 0.0001992130044592916,
+      "loss": 1.122,
+      "step": 155
+    },
+    {
+      "epoch": 0.14032010524007893,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.00019900968678611666,
+      "loss": 1.1174,
+      "step": 160
+    },
+    {
+      "epoch": 0.1447051085288314,
+      "grad_norm": 0.06298828125,
+      "learning_rate": 0.00019878316236762196,
+      "loss": 1.1205,
+      "step": 165
+    },
+    {
+      "epoch": 0.14909011181758386,
+      "grad_norm": 0.05615234375,
+      "learning_rate": 0.00019853348429855672,
+      "loss": 1.1086,
+      "step": 170
+    },
+    {
+      "epoch": 0.15347511510633632,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.0001982607111006227,
+      "loss": 1.1086,
+      "step": 175
+    },
+    {
+      "epoch": 0.1578601183950888,
+      "grad_norm": 0.052978515625,
+      "learning_rate": 0.0001979649067087574,
+      "loss": 1.1202,
+      "step": 180
+    },
+    {
+      "epoch": 0.16224512168384125,
+      "grad_norm": 0.0517578125,
+      "learning_rate": 0.00019764614045614836,
+      "loss": 1.1248,
+      "step": 185
+    },
+    {
+      "epoch": 0.16663012497259372,
+      "grad_norm": 0.05615234375,
+      "learning_rate": 0.00019730448705798239,
+      "loss": 1.1231,
+      "step": 190
+    },
+    {
+      "epoch": 0.17101512826134618,
+      "grad_norm": 0.05810546875,
+      "learning_rate": 0.00019694002659393305,
+      "loss": 1.102,
+      "step": 195
+    },
+    {
+      "epoch": 0.17540013155009868,
+      "grad_norm": 0.056396484375,
+      "learning_rate": 0.00019655284448939094,
+      "loss": 1.1203,
+      "step": 200
+    },
+    {
+      "epoch": 0.17978513483885114,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 0.00019614303149544102,
+      "loss": 1.1073,
+      "step": 205
+    },
+    {
+      "epoch": 0.1841701381276036,
+      "grad_norm": 0.05029296875,
+      "learning_rate": 0.00019571068366759143,
+      "loss": 1.1082,
+      "step": 210
+    },
+    {
+      "epoch": 0.18855514141635607,
+      "grad_norm": 0.054443359375,
+      "learning_rate": 0.00019525590234325933,
+      "loss": 1.1114,
+      "step": 215
+    },
+    {
+      "epoch": 0.19294014470510853,
+      "grad_norm": 0.0576171875,
+      "learning_rate": 0.00019477879411801844,
+      "loss": 1.1007,
+      "step": 220
+    },
+    {
+      "epoch": 0.197325147993861,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 0.00019427947082061432,
+      "loss": 1.0978,
+      "step": 225
+    },
+    {
+      "epoch": 0.20171015128261346,
+      "grad_norm": 0.05419921875,
+      "learning_rate": 0.00019375804948675306,
+      "loss": 1.0876,
+      "step": 230
+    },
+    {
+      "epoch": 0.20609515457136593,
+      "grad_norm": 0.050537109375,
+      "learning_rate": 0.00019321465233166924,
+      "loss": 1.1182,
+      "step": 235
+    },
+    {
+      "epoch": 0.2104801578601184,
+      "grad_norm": 0.05224609375,
+      "learning_rate": 0.00019264940672148018,
+      "loss": 1.0853,
+      "step": 240
+    },
+    {
+      "epoch": 0.21486516114887086,
+      "grad_norm": 0.05224609375,
+      "learning_rate": 0.00019206244514333282,
+      "loss": 1.1151,
+      "step": 245
+    },
+    {
+      "epoch": 0.21925016443762332,
+      "grad_norm": 0.050537109375,
+      "learning_rate": 0.00019145390517435012,
+      "loss": 1.111,
+      "step": 250
+    },
+    {
+      "epoch": 0.2236351677263758,
+      "grad_norm": 0.048828125,
+      "learning_rate": 0.00019082392944938466,
+      "loss": 1.1262,
+      "step": 255
+    },
+    {
+      "epoch": 0.22802017101512825,
+      "grad_norm": 0.046875,
+      "learning_rate": 0.00019017266562758659,
+      "loss": 1.1036,
+      "step": 260
+    },
+    {
+      "epoch": 0.23240517430388072,
+      "grad_norm": 0.048095703125,
+      "learning_rate": 0.00018950026635779397,
+      "loss": 1.1162,
+      "step": 265
+    },
+    {
+      "epoch": 0.23679017759263318,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 0.00018880688924275378,
+      "loss": 1.102,
+      "step": 270
+    },
+    {
+      "epoch": 0.24117518088138565,
+      "grad_norm": 0.05419921875,
+      "learning_rate": 0.00018809269680218136,
+      "loss": 1.1,
+      "step": 275
+    },
+    {
+      "epoch": 0.24556018417013814,
+      "grad_norm": 0.050537109375,
+      "learning_rate": 0.00018735785643466784,
+      "loss": 1.1037,
+      "step": 280
+    },
+    {
+      "epoch": 0.2499451874588906,
+      "grad_norm": 0.04931640625,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 1.0826,
+      "step": 285
+    },
+    {
+      "epoch": 0.25433019074764307,
+      "grad_norm": 0.053466796875,
+      "learning_rate": 0.00018582692567100867,
+      "loss": 1.1046,
+      "step": 290
+    },
+    {
+      "epoch": 0.25871519403639553,
+      "grad_norm": 0.05859375,
+      "learning_rate": 0.0001850311941076346,
+      "loss": 1.082,
+      "step": 295
+    },
+    {
+      "epoch": 0.263100197325148,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.00018421553219875658,
+      "loss": 1.0945,
+      "step": 300
+    },
+    {
+      "epoch": 0.26748520061390046,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.00018338013112625587,
+      "loss": 1.0985,
+      "step": 305
+    },
+    {
+      "epoch": 0.27187020390265293,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 0.00018252518669864936,
+      "loss": 1.1186,
+      "step": 310
+    },
+    {
+      "epoch": 0.2762552071914054,
+      "grad_norm": 0.0625,
+      "learning_rate": 0.0001816508993051943,
+      "loss": 1.1253,
+      "step": 315
+    },
+    {
+      "epoch": 0.28064021048015786,
+      "grad_norm": 0.05224609375,
+      "learning_rate": 0.0001807574738689193,
+      "loss": 1.0904,
+      "step": 320
+    },
+    {
+      "epoch": 0.2850252137689103,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 0.00017984511979859263,
+      "loss": 1.109,
+      "step": 325
+    },
+    {
+      "epoch": 0.2894102170576628,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 0.00017891405093963938,
+      "loss": 1.1041,
+      "step": 330
+    },
+    {
+      "epoch": 0.29379522034641525,
+      "grad_norm": 0.052734375,
+      "learning_rate": 0.00017796448552401825,
+      "loss": 1.0927,
+      "step": 335
+    },
+    {
+      "epoch": 0.2981802236351677,
+      "grad_norm": 0.04931640625,
+      "learning_rate": 0.00017699664611907072,
+      "loss": 1.1041,
+      "step": 340
+    },
+    {
+      "epoch": 0.3025652269239202,
+      "grad_norm": 0.0537109375,
+      "learning_rate": 0.00017601075957535364,
+      "loss": 1.1115,
+      "step": 345
+    },
+    {
+      "epoch": 0.30695023021267265,
+      "grad_norm": 0.047119140625,
+      "learning_rate": 0.0001750070569734681,
+      "loss": 1.1088,
+      "step": 350
+    },
+    {
+      "epoch": 0.3113352335014251,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 0.00017398577356989665,
+      "loss": 1.0905,
+      "step": 355
+    },
+    {
+      "epoch": 0.3157202367901776,
+      "grad_norm": 0.04638671875,
+      "learning_rate": 0.0001729471487418621,
+      "loss": 1.1071,
+      "step": 360
+    },
+    {
+      "epoch": 0.32010524007893004,
+      "grad_norm": 0.052734375,
+      "learning_rate": 0.00017189142593121993,
+      "loss": 1.0872,
+      "step": 365
+    },
+    {
+      "epoch": 0.3244902433676825,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 0.00017081885258739846,
+      "loss": 1.1054,
+      "step": 370
+    },
+    {
+      "epoch": 0.32887524665643497,
+      "grad_norm": 0.049072265625,
+      "learning_rate": 0.00016972968010939954,
+      "loss": 1.1035,
+      "step": 375
+    },
+    {
+      "epoch": 0.33326024994518744,
+      "grad_norm": 0.05126953125,
+      "learning_rate": 0.0001686241637868734,
+      "loss": 1.0976,
+      "step": 380
+    },
+    {
+      "epoch": 0.3376452532339399,
+      "grad_norm": 0.05126953125,
+      "learning_rate": 0.00016750256274028152,
+      "loss": 1.099,
+      "step": 385
+    },
+    {
+      "epoch": 0.34203025652269237,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 0.00016636513986016213,
+      "loss": 1.1267,
+      "step": 390
+    },
+    {
+      "epoch": 0.34641525981144483,
+      "grad_norm": 0.05224609375,
+      "learning_rate": 0.0001652121617455113,
+      "loss": 1.101,
+      "step": 395
+    },
+    {
+      "epoch": 0.35080026310019735,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 0.00016404389864129533,
+      "loss": 1.1005,
+      "step": 400
+    },
+    {
+      "epoch": 0.3551852663889498,
+      "grad_norm": 0.049072265625,
+      "learning_rate": 0.0001628606243751082,
+      "loss": 1.0973,
+      "step": 405
+    },
+    {
+      "epoch": 0.3595702696777023,
+      "grad_norm": 0.04931640625,
+      "learning_rate": 0.00016166261629298995,
+      "loss": 1.1016,
+      "step": 410
+    },
+    {
+      "epoch": 0.36395527296645475,
+      "grad_norm": 0.05517578125,
+      "learning_rate": 0.0001604501551944193,
+      "loss": 1.0863,
+      "step": 415
+    },
+    {
+      "epoch": 0.3683402762552072,
+      "grad_norm": 0.05517578125,
+      "learning_rate": 0.00015922352526649803,
+      "loss": 1.1008,
+      "step": 420
+    },
+    {
+      "epoch": 0.3727252795439597,
+      "grad_norm": 0.05517578125,
+      "learning_rate": 0.0001579830140173403,
+      "loss": 1.0999,
+      "step": 425
+    },
+    {
+      "epoch": 0.37711028283271214,
+      "grad_norm": 0.049072265625,
+      "learning_rate": 0.00015672891220868432,
+      "loss": 1.0944,
+      "step": 430
+    },
+    {
+      "epoch": 0.3814952861214646,
+      "grad_norm": 0.048095703125,
+      "learning_rate": 0.00015546151378774086,
+      "loss": 1.084,
+      "step": 435
+    },
+    {
+      "epoch": 0.38588028941021707,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 0.00015418111581829574,
+      "loss": 1.0957,
+      "step": 440
+    },
+    {
+      "epoch": 0.39026529269896953,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 0.00015288801841108093,
+      "loss": 1.0823,
+      "step": 445
+    },
+    {
+      "epoch": 0.394650295987722,
+      "grad_norm": 0.051025390625,
+      "learning_rate": 0.00015158252465343242,
+      "loss": 1.0925,
+      "step": 450
+    },
+    {
+      "epoch": 0.39903529927647446,
+      "grad_norm": 0.05029296875,
+      "learning_rate": 0.00015026494053824982,
+      "loss": 1.0917,
+      "step": 455
+    },
+    {
+      "epoch": 0.40342030256522693,
+      "grad_norm": 0.05029296875,
+      "learning_rate": 0.00014893557489227517,
+      "loss": 1.0935,
+      "step": 460
+    },
+    {
+      "epoch": 0.4078053058539794,
+      "grad_norm": 0.048095703125,
+      "learning_rate": 0.00014759473930370736,
+      "loss": 1.0795,
+      "step": 465
+    },
+    {
+      "epoch": 0.41219030914273186,
+      "grad_norm": 0.051025390625,
+      "learning_rate": 0.00014624274804916958,
+      "loss": 1.0943,
+      "step": 470
+    },
+    {
+      "epoch": 0.4165753124314843,
+      "grad_norm": 0.05322265625,
+      "learning_rate": 0.00014487991802004623,
+      "loss": 1.1022,
+      "step": 475
+    },
+    {
+      "epoch": 0.4209603157202368,
+      "grad_norm": 0.055419921875,
+      "learning_rate": 0.00014350656864820733,
+      "loss": 1.0849,
+      "step": 480
+    },
+    {
+      "epoch": 0.42534531900898925,
+      "grad_norm": 0.05224609375,
+      "learning_rate": 0.00014212302183113732,
+      "loss": 1.0865,
+      "step": 485
+    },
+    {
+      "epoch": 0.4297303222977417,
+      "grad_norm": 0.05224609375,
+      "learning_rate": 0.00014072960185648577,
+      "loss": 1.106,
+      "step": 490
+    },
+    {
+      "epoch": 0.4341153255864942,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 0.0001393266353260583,
+      "loss": 1.0926,
+      "step": 495
+    },
+    {
+      "epoch": 0.43850032887524665,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 0.00013791445107926478,
+      "loss": 1.1091,
+      "step": 500
+    },
+    {
+      "epoch": 0.4428853321639991,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 0.0001364933801160428,
+      "loss": 1.0858,
+      "step": 505
+    },
+    {
+      "epoch": 0.4472703354527516,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 0.00013506375551927547,
+      "loss": 1.0901,
+      "step": 510
+    },
+    {
+      "epoch": 0.45165533874150404,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 0.0001336259123767203,
+      "loss": 1.0852,
+      "step": 515
+    },
+    {
+      "epoch": 0.4560403420302565,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 0.00013218018770246858,
+      "loss": 1.0838,
+      "step": 520
+    },
+    {
+      "epoch": 0.46042534531900897,
+      "grad_norm": 0.052490234375,
+      "learning_rate": 0.00013072692035795305,
+      "loss": 1.0795,
+      "step": 525
+    },
+    {
+      "epoch": 0.46481034860776144,
+      "grad_norm": 0.048828125,
+      "learning_rate": 0.0001292664509725226,
+      "loss": 1.1038,
+      "step": 530
+    },
+    {
+      "epoch": 0.4691953518965139,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 0.00012779912186360268,
+      "loss": 1.0937,
+      "step": 535
+    },
+    {
+      "epoch": 0.47358035518526637,
+      "grad_norm": 0.05126953125,
+      "learning_rate": 0.00012632527695645993,
+      "loss": 1.08,
+      "step": 540
+    },
+    {
+      "epoch": 0.47796535847401883,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 0.00012484526170359012,
+      "loss": 1.0642,
+      "step": 545
+    },
+    {
+      "epoch": 0.4823503617627713,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 0.00012335942300374788,
+      "loss": 1.089,
+      "step": 550
+    },
+    {
+      "epoch": 0.4867353650515238,
+      "grad_norm": 0.05126953125,
+      "learning_rate": 0.0001218681091206376,
+      "loss": 1.0893,
+      "step": 555
+    },
+    {
+      "epoch": 0.4911203683402763,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 0.00012037166960128443,
+      "loss": 1.0996,
+      "step": 560
+    },
+    {
+      "epoch": 0.49550537162902875,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 0.00011887045519410442,
+      "loss": 1.0955,
+      "step": 565
+    },
+    {
+      "epoch": 0.4998903749177812,
+      "grad_norm": 0.050537109375,
+      "learning_rate": 0.00011736481776669306,
+      "loss": 1.0904,
+      "step": 570
+    },
+    {
+      "epoch": 0.5042753782065337,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 0.00011585511022335142,
+      "loss": 1.1074,
+      "step": 575
+    },
+    {
+      "epoch": 0.5086603814952861,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 0.00011434168642236964,
+      "loss": 1.0855,
+      "step": 580
+    },
+    {
+      "epoch": 0.5130453847840386,
+      "grad_norm": 0.052734375,
+      "learning_rate": 0.00011282490109308633,
+      "loss": 1.0872,
+      "step": 585
+    },
+    {
+      "epoch": 0.5174303880727911,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 0.00011130510975274409,
+      "loss": 1.0824,
+      "step": 590
+    },
+    {
+      "epoch": 0.5218153913615435,
+      "grad_norm": 0.049072265625,
+      "learning_rate": 0.0001097826686231604,
+      "loss": 1.1002,
+      "step": 595
+    },
+    {
+      "epoch": 0.526200394650296,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 0.00010825793454723325,
+      "loss": 1.083,
+      "step": 600
+    },
+    {
+      "epoch": 0.5305853979390485,
+      "grad_norm": 0.05126953125,
+      "learning_rate": 0.00010673126490530112,
+      "loss": 1.1003,
+      "step": 605
+    },
+    {
+      "epoch": 0.5349704012278009,
+      "grad_norm": 0.049072265625,
+      "learning_rate": 0.00010520301753137724,
+      "loss": 1.0852,
+      "step": 610
+    },
+    {
+      "epoch": 0.5393554045165534,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 0.00010367355062927726,
+      "loss": 1.0904,
+      "step": 615
+    },
+    {
+      "epoch": 0.5437404078053059,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 0.00010214322268866032,
+      "loss": 1.0839,
+      "step": 620
+    },
+    {
+      "epoch": 0.5481254110940583,
+      "grad_norm": 0.053466796875,
+      "learning_rate": 0.00010061239240100327,
+      "loss": 1.079,
+      "step": 625
+    },
+    {
+      "epoch": 0.5525104143828108,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 9.908141857552737e-05,
+      "loss": 1.0987,
+      "step": 630
+    },
+    {
+      "epoch": 0.5568954176715633,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 9.755066005509753e-05,
+      "loss": 1.075,
+      "step": 635
+    },
+    {
+      "epoch": 0.5612804209603157,
+      "grad_norm": 0.04931640625,
+      "learning_rate": 9.602047563211359e-05,
+      "loss": 1.0803,
+      "step": 640
+    },
+    {
+      "epoch": 0.5656654242490682,
+      "grad_norm": 0.053466796875,
+      "learning_rate": 9.449122396441345e-05,
+      "loss": 1.1114,
+      "step": 645
+    },
+    {
+      "epoch": 0.5700504275378206,
+      "grad_norm": 0.048828125,
+      "learning_rate": 9.296326349120785e-05,
+      "loss": 1.0771,
+      "step": 650
+    },
+    {
+      "epoch": 0.5744354308265731,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 9.143695234906611e-05,
+      "loss": 1.0917,
+      "step": 655
+    },
+    {
+      "epoch": 0.5788204341153256,
+      "grad_norm": 0.049072265625,
+      "learning_rate": 8.991264828797319e-05,
+      "loss": 1.0843,
+      "step": 660
+    },
+    {
+      "epoch": 0.583205437404078,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 8.839070858747697e-05,
+      "loss": 1.0863,
+      "step": 665
+    },
+    {
+      "epoch": 0.5875904406928305,
+      "grad_norm": 0.0517578125,
+      "learning_rate": 8.687148997294621e-05,
+      "loss": 1.086,
+      "step": 670
+    },
+    {
+      "epoch": 0.591975443981583,
+      "grad_norm": 0.05029296875,
+      "learning_rate": 8.535534853195786e-05,
+      "loss": 1.08,
+      "step": 675
+    },
+    {
+      "epoch": 0.5963604472703354,
+      "grad_norm": 0.05126953125,
+      "learning_rate": 8.384263963083453e-05,
+      "loss": 1.0686,
+      "step": 680
+    },
+    {
+      "epoch": 0.6007454505590879,
+      "grad_norm": 0.051025390625,
+      "learning_rate": 8.23337178313504e-05,
+      "loss": 1.075,
+      "step": 685
+    },
+    {
+      "epoch": 0.6051304538478404,
+      "grad_norm": 0.052490234375,
+      "learning_rate": 8.082893680762619e-05,
+      "loss": 1.0926,
+      "step": 690
+    },
+    {
+      "epoch": 0.6095154571365928,
+      "grad_norm": 0.051025390625,
+      "learning_rate": 7.932864926323161e-05,
+      "loss": 1.079,
+      "step": 695
+    },
+    {
+      "epoch": 0.6139004604253453,
+      "grad_norm": 0.049072265625,
+      "learning_rate": 7.783320684851614e-05,
+      "loss": 1.0844,
+      "step": 700
+    },
+    {
+      "epoch": 0.6182854637140978,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 7.634296007818576e-05,
+      "loss": 1.1056,
+      "step": 705
+    },
+    {
+      "epoch": 0.6226704670028502,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 7.485825824914659e-05,
+      "loss": 1.0851,
+      "step": 710
+    },
+    {
+      "epoch": 0.6270554702916027,
+      "grad_norm": 0.048828125,
+      "learning_rate": 7.337944935863333e-05,
+      "loss": 1.0786,
+      "step": 715
+    },
+    {
+      "epoch": 0.6314404735803552,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 7.190688002264308e-05,
+      "loss": 1.089,
+      "step": 720
+    },
+    {
+      "epoch": 0.6358254768691076,
+      "grad_norm": 0.05126953125,
+      "learning_rate": 7.044089539469212e-05,
+      "loss": 1.0826,
+      "step": 725
+    },
+    {
+      "epoch": 0.6402104801578601,
+      "grad_norm": 0.0517578125,
+      "learning_rate": 6.898183908491617e-05,
+      "loss": 1.1004,
+      "step": 730
+    },
+    {
+      "epoch": 0.6445954834466125,
+      "grad_norm": 0.050537109375,
+      "learning_rate": 6.753005307953167e-05,
+      "loss": 1.0722,
+      "step": 735
+    },
+    {
+      "epoch": 0.648980486735365,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 6.608587766067852e-05,
+      "loss": 1.0859,
+      "step": 740
+    },
+    {
+      "epoch": 0.6533654900241175,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 6.464965132666163e-05,
+      "loss": 1.1088,
+      "step": 745
+    },
+    {
+      "epoch": 0.6577504933128699,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 6.322171071261071e-05,
+      "loss": 1.0726,
+      "step": 750
+    },
+    {
+      "epoch": 0.6621354966016224,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 6.180239051157681e-05,
+      "loss": 1.0897,
+      "step": 755
+    },
+    {
+      "epoch": 0.6665204998903749,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 6.039202339608432e-05,
+      "loss": 1.0972,
+      "step": 760
+    },
+    {
+      "epoch": 0.6709055031791273,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 5.8990939940156e-05,
+      "loss": 1.0884,
+      "step": 765
+    },
+    {
+      "epoch": 0.6752905064678798,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 5.7599468541830356e-05,
+      "loss": 1.086,
+      "step": 770
+    },
+    {
+      "epoch": 0.6796755097566323,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 5.62179353461888e-05,
+      "loss": 1.0921,
+      "step": 775
+    },
+    {
+      "epoch": 0.6840605130453847,
+      "grad_norm": 0.051025390625,
+      "learning_rate": 5.484666416891109e-05,
+      "loss": 1.0834,
+      "step": 780
+    },
+    {
+      "epoch": 0.6884455163341372,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 5.3485976420376336e-05,
+      "loss": 1.0827,
+      "step": 785
+    },
+    {
+      "epoch": 0.6928305196228897,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 5.2136191030328455e-05,
+      "loss": 1.0851,
+      "step": 790
+    },
+    {
+      "epoch": 0.6972155229116422,
+      "grad_norm": 0.0517578125,
+      "learning_rate": 5.079762437312219e-05,
+      "loss": 1.0834,
+      "step": 795
+    },
+    {
+      "epoch": 0.7016005262003947,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 4.9470590193569044e-05,
+      "loss": 1.1016,
+      "step": 800
+    },
+    {
+      "epoch": 0.7059855294891472,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 4.815539953339865e-05,
+      "loss": 1.0686,
+      "step": 805
+    },
+    {
+      "epoch": 0.7103705327778996,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 4.685236065835443e-05,
+      "loss": 1.086,
+      "step": 810
+    },
+    {
+      "epoch": 0.7147555360666521,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 4.5561778985939366e-05,
+      "loss": 1.0817,
+      "step": 815
+    },
+    {
+      "epoch": 0.7191405393554046,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 4.4283957013829846e-05,
+      "loss": 1.0837,
+      "step": 820
+    },
+    {
+      "epoch": 0.723525542644157,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 4.301919424897338e-05,
+      "loss": 1.0791,
+      "step": 825
+    },
+    {
+      "epoch": 0.7279105459329095,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 4.176778713738787e-05,
+      "loss": 1.0865,
+      "step": 830
+    },
+    {
+      "epoch": 0.732295549221662,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 4.053002899467774e-05,
+      "loss": 1.0842,
+      "step": 835
+    },
+    {
+      "epoch": 0.7366805525104144,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 3.9306209937284346e-05,
+      "loss": 1.0939,
+      "step": 840
+    },
+    {
+      "epoch": 0.7410655557991669,
+      "grad_norm": 0.048095703125,
+      "learning_rate": 3.809661681448576e-05,
+      "loss": 1.0941,
+      "step": 845
+    },
+    {
+      "epoch": 0.7454505590879194,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 3.69015331411628e-05,
+      "loss": 1.0664,
+      "step": 850
+    },
+    {
+      "epoch": 0.7498355623766718,
+      "grad_norm": 0.05029296875,
+      "learning_rate": 3.5721239031346066e-05,
+      "loss": 1.0844,
+      "step": 855
+    },
+    {
+      "epoch": 0.7542205656654243,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 3.455601113256073e-05,
+      "loss": 1.1036,
+      "step": 860
+    },
+    {
+      "epoch": 0.7586055689541767,
+      "grad_norm": 0.048828125,
+      "learning_rate": 3.340612256098316e-05,
+      "loss": 1.0641,
+      "step": 865
+    },
+    {
+      "epoch": 0.7629905722429292,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 3.227184283742591e-05,
+      "loss": 1.087,
+      "step": 870
+    },
+    {
+      "epoch": 0.7673755755316817,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 3.115343782416483e-05,
+      "loss": 1.0992,
+      "step": 875
+    },
+    {
+      "epoch": 0.7717605788204341,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 3.0051169662624225e-05,
+      "loss": 1.0838,
+      "step": 880
+    },
+    {
+      "epoch": 0.7761455821091866,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 2.89652967119336e-05,
+      "loss": 1.0899,
+      "step": 885
+    },
+    {
+      "epoch": 0.7805305853979391,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 2.789607348837153e-05,
+      "loss": 1.0906,
+      "step": 890
+    },
+    {
+      "epoch": 0.7849155886866915,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 2.684375060570965e-05,
+      "loss": 1.0897,
+      "step": 895
+    },
+    {
+      "epoch": 0.789300591975444,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 2.5808574716471856e-05,
+      "loss": 1.0857,
+      "step": 900
+    },
+    {
+      "epoch": 0.7936855952641965,
+      "grad_norm": 0.050537109375,
+      "learning_rate": 2.4790788454121584e-05,
+      "loss": 1.0973,
+      "step": 905
+    },
+    {
+      "epoch": 0.7980705985529489,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 2.379063037619146e-05,
+      "loss": 1.0714,
+      "step": 910
+    },
+    {
+      "epoch": 0.8024556018417014,
+      "grad_norm": 0.048095703125,
+      "learning_rate": 2.2808334908367914e-05,
+      "loss": 1.0919,
+      "step": 915
+    },
+    {
+      "epoch": 0.8068406051304539,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 2.184413228954468e-05,
+      "loss": 1.082,
+      "step": 920
+    },
+    {
+      "epoch": 0.8112256084192063,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 2.0898248517857256e-05,
+      "loss": 1.091,
+      "step": 925
+    },
+    {
+      "epoch": 0.8156106117079588,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 1.9970905297711606e-05,
+      "loss": 1.0919,
+      "step": 930
+    },
+    {
+      "epoch": 0.8199956149967113,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 1.9062319987819067e-05,
+      "loss": 1.0668,
+      "step": 935
+    },
+    {
+      "epoch": 0.8243806182854637,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 1.8172705550250092e-05,
+      "loss": 1.0912,
+      "step": 940
+    },
+    {
+      "epoch": 0.8287656215742162,
+      "grad_norm": 0.046630859375,
+      "learning_rate": 1.7302270500518182e-05,
+      "loss": 1.0886,
+      "step": 945
+    },
+    {
+      "epoch": 0.8331506248629686,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 1.6451218858706374e-05,
+      "loss": 1.0815,
+      "step": 950
+    },
+    {
+      "epoch": 0.8375356281517211,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 1.5619750101647114e-05,
+      "loss": 1.1055,
+      "step": 955
+    },
+    {
+      "epoch": 0.8419206314404736,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 1.4808059116167305e-05,
+      "loss": 1.0854,
+      "step": 960
+    },
+    {
+      "epoch": 0.846305634729226,
+      "grad_norm": 0.048828125,
+      "learning_rate": 1.4016336153408893e-05,
+      "loss": 1.1044,
+      "step": 965
+    },
+    {
+      "epoch": 0.8506906380179785,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 1.3244766784236307e-05,
+      "loss": 1.103,
+      "step": 970
+    },
+    {
+      "epoch": 0.855075641306731,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 1.2493531855740625e-05,
+      "loss": 1.0638,
+      "step": 975
+    },
+    {
+      "epoch": 0.8594606445954834,
+      "grad_norm": 0.047119140625,
+      "learning_rate": 1.176280744885121e-05,
+      "loss": 1.067,
+      "step": 980
+    },
+    {
+      "epoch": 0.8638456478842359,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 1.1052764837064178e-05,
+      "loss": 1.0787,
+      "step": 985
+    },
+    {
+      "epoch": 0.8682306511729884,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 1.0363570446297999e-05,
+      "loss": 1.0825,
+      "step": 990
+    },
+    {
+      "epoch": 0.8726156544617408,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 9.695385815885016e-06,
+      "loss": 1.0905,
+      "step": 995
+    },
+    {
+      "epoch": 0.8770006577504933,
+      "grad_norm": 0.048095703125,
+      "learning_rate": 9.048367560708604e-06,
+      "loss": 1.0723,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8813856610392458,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 8.422667334494249e-06,
+      "loss": 1.1059,
+      "step": 1005
+    },
+    {
+      "epoch": 0.8857706643279982,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 7.818431794263836e-06,
+      "loss": 1.1027,
+      "step": 1010
+    },
+    {
+      "epoch": 0.8901556676167507,
+      "grad_norm": 0.048095703125,
+      "learning_rate": 7.235802565960714e-06,
+      "loss": 1.0733,
+      "step": 1015
+    },
+    {
+      "epoch": 0.8945406709055032,
+      "grad_norm": 0.051025390625,
+      "learning_rate": 6.674916211254289e-06,
+      "loss": 1.0807,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8989256741942556,
+      "grad_norm": 0.05810546875,
+      "learning_rate": 6.1359041955315725e-06,
+      "loss": 1.0729,
+      "step": 1025
+    },
+    {
+      "epoch": 0.9033106774830081,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 5.618892857083069e-06,
+      "loss": 1.0816,
+      "step": 1030
+    },
+    {
+      "epoch": 0.9076956807717605,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 5.124003377490582e-06,
+      "loss": 1.0853,
+      "step": 1035
+    },
+    {
+      "epoch": 0.912080684060513,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 4.65135175322361e-06,
+      "loss": 1.0829,
+      "step": 1040
+    },
+    {
+      "epoch": 0.9164656873492655,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 4.20104876845111e-06,
+      "loss": 1.0797,
+      "step": 1045
+    },
+    {
+      "epoch": 0.9208506906380179,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 3.7731999690749585e-06,
+      "loss": 1.0908,
+      "step": 1050
+    },
+    {
+      "epoch": 0.9252356939267704,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 3.367905637991142e-06,
+      "loss": 1.0784,
+      "step": 1055
+    },
+    {
+      "epoch": 0.9296206972155229,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 2.9852607715846193e-06,
+      "loss": 1.101,
+      "step": 1060
+    },
+    {
+      "epoch": 0.9340057005042753,
+      "grad_norm": 0.048095703125,
+      "learning_rate": 2.6253550574632303e-06,
+      "loss": 1.1063,
+      "step": 1065
+    },
+    {
+      "epoch": 0.9383907037930278,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 2.288272853436013e-06,
+      "loss": 1.0768,
+      "step": 1070
+    },
+    {
+      "epoch": 0.9427757070817803,
+      "grad_norm": 0.049072265625,
+      "learning_rate": 1.974093167740565e-06,
+      "loss": 1.0755,
+      "step": 1075
+    },
+    {
+      "epoch": 0.9471607103705327,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 1.6828896405244988e-06,
+      "loss": 1.0714,
+      "step": 1080
+    },
+    {
+      "epoch": 0.9515457136592852,
+      "grad_norm": 0.047119140625,
+      "learning_rate": 1.4147305265850175e-06,
+      "loss": 1.085,
+      "step": 1085
+    },
+    {
+      "epoch": 0.9559307169480377,
+      "grad_norm": 0.047119140625,
+      "learning_rate": 1.1696786793707781e-06,
+      "loss": 1.0816,
+      "step": 1090
+    },
+    {
+      "epoch": 0.9603157202367901,
+      "grad_norm": 0.046875,
+      "learning_rate": 9.477915362496758e-07,
+      "loss": 1.0925,
+      "step": 1095
+    },
+    {
+      "epoch": 0.9647007235255426,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 7.491211050462798e-07,
+      "loss": 1.0662,
+      "step": 1100
+    },
+    {
+      "epoch": 0.9690857268142951,
+      "grad_norm": 0.048095703125,
+      "learning_rate": 5.737139518517509e-07,
+      "loss": 1.0941,
+      "step": 1105
+    },
+    {
+      "epoch": 0.9734707301030476,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 4.216111901092501e-07,
+      "loss": 1.086,
+      "step": 1110
+    },
+    {
+      "epoch": 0.9778557333918001,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 2.9284847097746923e-07,
+      "loss": 1.0768,
+      "step": 1115
+    },
+    {
+      "epoch": 0.9822407366805526,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 1.8745597497433765e-07,
+      "loss": 1.0733,
+      "step": 1120
+    },
+    {
+      "epoch": 0.986625739969305,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 1.0545840490313596e-07,
+      "loss": 1.0981,
+      "step": 1125
+    },
+    {
+      "epoch": 0.9910107432580575,
+      "grad_norm": 0.05029296875,
+      "learning_rate": 4.687498006236135e-08,
+      "loss": 1.1037,
+      "step": 1130
+    },
+    {
+      "epoch": 0.99539574654681,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 1.1719431740997433e-08,
+      "loss": 1.0715,
+      "step": 1135
+    },
+    {
+      "epoch": 0.9997807498355624,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 0.0,
+      "loss": 1.0883,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9997807498355624,
+      "eval_loss": 1.089297890663147,
+      "eval_runtime": 2048.9326,
+      "eval_samples_per_second": 7.883,
+      "eval_steps_per_second": 7.883,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9997807498355624,
+      "step": 1140,
+      "total_flos": 3.167597749864497e+18,
+      "train_loss": 0.80264539467661,
+      "train_runtime": 53655.2232,
+      "train_samples_per_second": 2.72,
+      "train_steps_per_second": 0.021
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1140,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "total_flos": 3.167597749864497e+18,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2f97d97dd851eba30db4d442104f8929ef92832e214a10988afea71c04c934f3
+size 5112