Training in progress, step 95000, checkpoint

Browse files

Files changed (8) hide show

last-checkpoint/adapter_config.json +3 -3
last-checkpoint/adapter_model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/tokenizer_config.json +0 -1
last-checkpoint/trainer_state.json +430 -3
last-checkpoint/training_args.bin +2 -2

last-checkpoint/adapter_config.json CHANGED Viewed

@@ -20,13 +20,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "down_proj",
     "up_proj",
-    "q_proj",
     "k_proj",
     "gate_proj",
     "v_proj",
-    "o_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "up_proj",
     "k_proj",
     "gate_proj",
+    "down_proj",
+    "o_proj",
     "v_proj",
+    "q_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:588a15f7cea8d0a814b43f40bdcbefaf553e3f1ea3fe8f93ab138197ad9ca78c
 size 5544997664

 version https://git-lfs.github.com/spec/v1
+oid sha256:1c8c2bc861c786b2bf9d8ddb8858babedad8fc42c6e26fb00fe13b35096c6de7
 size 5544997664

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3ebee6fad5e5c1bb9d7e488846379921805f8c0c20003d8a91f2e25aed77a83a
 size 674093138

 version https://git-lfs.github.com/spec/v1
+oid sha256:36ebe5440e2bcb412e4131df2efca8e8fc88b5200168c85a419cb901604336b6
 size 674093138

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6d8f8f9743ec8d3f95f6d09874e8c8e1665b1753c549b2fad6b80c9e2a59f8a6
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:0610f4aa7ed2f34398fce8dc77c3d7b14d52dfb0bc17dc7f64e8f6c2438e189b
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:def38347a8476c414f9a59b485e01231d01480373c9bf5d7882acb65a1218490
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:17feec1222485652df46ab05d04d0cb1b6896f1f053ea3ae8ca19c7cd689e6b7
 size 1064

last-checkpoint/tokenizer_config.json CHANGED Viewed

@@ -2072,7 +2072,6 @@
   "bos_token": "<|im_start|>",
   "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
   "clean_up_tokenization_spaces": true,
-  "device_map": "auto",
   "eos_token": "<|im_end|>",
   "max_length": 4096,
   "model_input_names": [

   "bos_token": "<|im_start|>",
   "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
   "clean_up_tokenization_spaces": true,
   "eos_token": "<|im_end|>",
   "max_length": 4096,
   "model_input_names": [

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.06611336694777863,
   "eval_steps": 200,
-  "global_step": 88900,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -6230,6 +6230,433 @@
       "learning_rate": 1.9991374234676826e-05,
       "loss": 1.5551,
       "step": 88900
     }
   ],
   "logging_steps": 100,
@@ -6249,7 +6676,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.211678461812007e+18,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.07064982969672631,
   "eval_steps": 200,
+  "global_step": 95000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "learning_rate": 1.9991374234676826e-05,
       "loss": 1.5551,
       "step": 88900
+    },
+    {
+      "epoch": 0.06618773518956465,
+      "grad_norm": 0.6733551621437073,
+      "learning_rate": 1.999135481992186e-05,
+      "loss": 1.4334,
+      "step": 89000
+    },
+    {
+      "epoch": 0.06626210343135068,
+      "grad_norm": 0.8035016059875488,
+      "learning_rate": 1.999133538335166e-05,
+      "loss": 1.4872,
+      "step": 89100
+    },
+    {
+      "epoch": 0.06633647167313671,
+      "grad_norm": 0.4339046776294708,
+      "learning_rate": 1.9991315924966277e-05,
+      "loss": 1.4869,
+      "step": 89200
+    },
+    {
+      "epoch": 0.06641083991492273,
+      "grad_norm": 0.6680594086647034,
+      "learning_rate": 1.9991296444765747e-05,
+      "loss": 1.5103,
+      "step": 89300
+    },
+    {
+      "epoch": 0.06648520815670876,
+      "grad_norm": 0.697487473487854,
+      "learning_rate": 1.9991276942750117e-05,
+      "loss": 1.4239,
+      "step": 89400
+    },
+    {
+      "epoch": 0.06655957639849479,
+      "grad_norm": 0.587734043598175,
+      "learning_rate": 1.9991257418919424e-05,
+      "loss": 1.5856,
+      "step": 89500
+    },
+    {
+      "epoch": 0.06663394464028081,
+      "grad_norm": 0.8574571013450623,
+      "learning_rate": 1.999123787327372e-05,
+      "loss": 1.4818,
+      "step": 89600
+    },
+    {
+      "epoch": 0.06670831288206684,
+      "grad_norm": 1.0861676931381226,
+      "learning_rate": 1.9991218305813035e-05,
+      "loss": 1.4883,
+      "step": 89700
+    },
+    {
+      "epoch": 0.06678268112385287,
+      "grad_norm": 1.0139306783676147,
+      "learning_rate": 1.9991198716537422e-05,
+      "loss": 1.5099,
+      "step": 89800
+    },
+    {
+      "epoch": 0.0668570493656389,
+      "grad_norm": 0.6741511225700378,
+      "learning_rate": 1.999117910544692e-05,
+      "loss": 1.4746,
+      "step": 89900
+    },
+    {
+      "epoch": 0.06693141760742492,
+      "grad_norm": 0.9702801704406738,
+      "learning_rate": 1.999115947254157e-05,
+      "loss": 1.5166,
+      "step": 90000
+    },
+    {
+      "epoch": 0.06700578584921095,
+      "grad_norm": 0.7757803797721863,
+      "learning_rate": 1.9991139817821416e-05,
+      "loss": 1.5031,
+      "step": 90100
+    },
+    {
+      "epoch": 0.06708015409099698,
+      "grad_norm": 0.7200698256492615,
+      "learning_rate": 1.9991120141286502e-05,
+      "loss": 1.5834,
+      "step": 90200
+    },
+    {
+      "epoch": 0.067154522332783,
+      "grad_norm": 0.7415780425071716,
+      "learning_rate": 1.999110044293687e-05,
+      "loss": 1.5689,
+      "step": 90300
+    },
+    {
+      "epoch": 0.06722889057456903,
+      "grad_norm": 0.5777677297592163,
+      "learning_rate": 1.9991080722772564e-05,
+      "loss": 1.5139,
+      "step": 90400
+    },
+    {
+      "epoch": 0.06730325881635506,
+      "grad_norm": 0.6991866827011108,
+      "learning_rate": 1.999106098079363e-05,
+      "loss": 1.5073,
+      "step": 90500
+    },
+    {
+      "epoch": 0.06737762705814108,
+      "grad_norm": 0.6112390160560608,
+      "learning_rate": 1.9991041217000105e-05,
+      "loss": 1.4773,
+      "step": 90600
+    },
+    {
+      "epoch": 0.06745199529992713,
+      "grad_norm": 0.8287676572799683,
+      "learning_rate": 1.9991021431392033e-05,
+      "loss": 1.5425,
+      "step": 90700
+    },
+    {
+      "epoch": 0.06752636354171315,
+      "grad_norm": 0.8582881689071655,
+      "learning_rate": 1.999100162396946e-05,
+      "loss": 1.5581,
+      "step": 90800
+    },
+    {
+      "epoch": 0.06760073178349918,
+      "grad_norm": 0.5585276484489441,
+      "learning_rate": 1.999098179473243e-05,
+      "loss": 1.5015,
+      "step": 90900
+    },
+    {
+      "epoch": 0.0676751000252852,
+      "grad_norm": 0.4237435460090637,
+      "learning_rate": 1.9990961943680984e-05,
+      "loss": 1.523,
+      "step": 91000
+    },
+    {
+      "epoch": 0.06774946826707123,
+      "grad_norm": 0.5455594658851624,
+      "learning_rate": 1.999094207081517e-05,
+      "loss": 1.5448,
+      "step": 91100
+    },
+    {
+      "epoch": 0.06782383650885726,
+      "grad_norm": 0.48855817317962646,
+      "learning_rate": 1.999092217613502e-05,
+      "loss": 1.4535,
+      "step": 91200
+    },
+    {
+      "epoch": 0.06789820475064329,
+      "grad_norm": 0.5199916958808899,
+      "learning_rate": 1.999090225964059e-05,
+      "loss": 1.4921,
+      "step": 91300
+    },
+    {
+      "epoch": 0.06797257299242931,
+      "grad_norm": 0.5790271162986755,
+      "learning_rate": 1.9990882321331916e-05,
+      "loss": 1.5773,
+      "step": 91400
+    },
+    {
+      "epoch": 0.06804694123421534,
+      "grad_norm": 0.5524342656135559,
+      "learning_rate": 1.9990862361209043e-05,
+      "loss": 1.4619,
+      "step": 91500
+    },
+    {
+      "epoch": 0.06812130947600137,
+      "grad_norm": 0.7153291702270508,
+      "learning_rate": 1.999084237927202e-05,
+      "loss": 1.6042,
+      "step": 91600
+    },
+    {
+      "epoch": 0.0681956777177874,
+      "grad_norm": 0.957635223865509,
+      "learning_rate": 1.9990822375520882e-05,
+      "loss": 1.538,
+      "step": 91700
+    },
+    {
+      "epoch": 0.06827004595957342,
+      "grad_norm": 0.38240477442741394,
+      "learning_rate": 1.9990802349955678e-05,
+      "loss": 1.5937,
+      "step": 91800
+    },
+    {
+      "epoch": 0.06834441420135945,
+      "grad_norm": 0.8961233496665955,
+      "learning_rate": 1.999078230257645e-05,
+      "loss": 1.5119,
+      "step": 91900
+    },
+    {
+      "epoch": 0.06841878244314548,
+      "grad_norm": 0.47433900833129883,
+      "learning_rate": 1.999076223338324e-05,
+      "loss": 1.5449,
+      "step": 92000
+    },
+    {
+      "epoch": 0.0684931506849315,
+      "grad_norm": 0.8222399353981018,
+      "learning_rate": 1.9990742142376098e-05,
+      "loss": 1.5334,
+      "step": 92100
+    },
+    {
+      "epoch": 0.06856751892671753,
+      "grad_norm": 0.464373916387558,
+      "learning_rate": 1.999072202955506e-05,
+      "loss": 1.5003,
+      "step": 92200
+    },
+    {
+      "epoch": 0.06864188716850356,
+      "grad_norm": 0.8799763321876526,
+      "learning_rate": 1.9990701894920176e-05,
+      "loss": 1.581,
+      "step": 92300
+    },
+    {
+      "epoch": 0.06871625541028958,
+      "grad_norm": 0.9567086100578308,
+      "learning_rate": 1.999068173847149e-05,
+      "loss": 1.4373,
+      "step": 92400
+    },
+    {
+      "epoch": 0.06879062365207561,
+      "grad_norm": 0.440479576587677,
+      "learning_rate": 1.999066156020904e-05,
+      "loss": 1.5571,
+      "step": 92500
+    },
+    {
+      "epoch": 0.06886499189386165,
+      "grad_norm": 0.7486180663108826,
+      "learning_rate": 1.9990641360132876e-05,
+      "loss": 1.4437,
+      "step": 92600
+    },
+    {
+      "epoch": 0.06893936013564768,
+      "grad_norm": 0.7576742172241211,
+      "learning_rate": 1.9990621138243037e-05,
+      "loss": 1.5306,
+      "step": 92700
+    },
+    {
+      "epoch": 0.0690137283774337,
+      "grad_norm": 0.6755186915397644,
+      "learning_rate": 1.9990600894539574e-05,
+      "loss": 1.5769,
+      "step": 92800
+    },
+    {
+      "epoch": 0.06908809661921973,
+      "grad_norm": 0.6093853712081909,
+      "learning_rate": 1.9990580629022526e-05,
+      "loss": 1.5777,
+      "step": 92900
+    },
+    {
+      "epoch": 0.06916246486100576,
+      "grad_norm": 0.5788242220878601,
+      "learning_rate": 1.9990560341691938e-05,
+      "loss": 1.494,
+      "step": 93000
+    },
+    {
+      "epoch": 0.06923683310279179,
+      "grad_norm": 0.828676700592041,
+      "learning_rate": 1.9990540032547855e-05,
+      "loss": 1.5651,
+      "step": 93100
+    },
+    {
+      "epoch": 0.06931120134457781,
+      "grad_norm": 0.5612863302230835,
+      "learning_rate": 1.9990519701590322e-05,
+      "loss": 1.5584,
+      "step": 93200
+    },
+    {
+      "epoch": 0.06938556958636384,
+      "grad_norm": 0.965107262134552,
+      "learning_rate": 1.999049934881938e-05,
+      "loss": 1.497,
+      "step": 93300
+    },
+    {
+      "epoch": 0.06945993782814987,
+      "grad_norm": 0.46939852833747864,
+      "learning_rate": 1.9990478974235078e-05,
+      "loss": 1.5716,
+      "step": 93400
+    },
+    {
+      "epoch": 0.0695343060699359,
+      "grad_norm": 0.4986964464187622,
+      "learning_rate": 1.999045857783746e-05,
+      "loss": 1.5762,
+      "step": 93500
+    },
+    {
+      "epoch": 0.06960867431172192,
+      "grad_norm": 0.4267128109931946,
+      "learning_rate": 1.9990438159626566e-05,
+      "loss": 1.5101,
+      "step": 93600
+    },
+    {
+      "epoch": 0.06968304255350795,
+      "grad_norm": 0.411811888217926,
+      "learning_rate": 1.9990417719602445e-05,
+      "loss": 1.5623,
+      "step": 93700
+    },
+    {
+      "epoch": 0.06975741079529398,
+      "grad_norm": 0.8761053681373596,
+      "learning_rate": 1.999039725776514e-05,
+      "loss": 1.4294,
+      "step": 93800
+    },
+    {
+      "epoch": 0.06983177903708,
+      "grad_norm": 0.9531000852584839,
+      "learning_rate": 1.99903767741147e-05,
+      "loss": 1.4925,
+      "step": 93900
+    },
+    {
+      "epoch": 0.06990614727886603,
+      "grad_norm": 0.516830325126648,
+      "learning_rate": 1.999035626865116e-05,
+      "loss": 1.5802,
+      "step": 94000
+    },
+    {
+      "epoch": 0.06998051552065206,
+      "grad_norm": 0.47061294317245483,
+      "learning_rate": 1.9990335741374572e-05,
+      "loss": 1.5668,
+      "step": 94100
+    },
+    {
+      "epoch": 0.07005488376243808,
+      "grad_norm": 0.7790777683258057,
+      "learning_rate": 1.9990315192284978e-05,
+      "loss": 1.5568,
+      "step": 94200
+    },
+    {
+      "epoch": 0.07012925200422411,
+      "grad_norm": 0.75156170129776,
+      "learning_rate": 1.9990294621382426e-05,
+      "loss": 1.5217,
+      "step": 94300
+    },
+    {
+      "epoch": 0.07020362024601014,
+      "grad_norm": 1.195028305053711,
+      "learning_rate": 1.999027402866696e-05,
+      "loss": 1.5662,
+      "step": 94400
+    },
+    {
+      "epoch": 0.07027798848779618,
+      "grad_norm": 0.6215851306915283,
+      "learning_rate": 1.999025341413862e-05,
+      "loss": 1.5208,
+      "step": 94500
+    },
+    {
+      "epoch": 0.0703523567295822,
+      "grad_norm": 0.509843647480011,
+      "learning_rate": 1.9990232777797458e-05,
+      "loss": 1.489,
+      "step": 94600
+    },
+    {
+      "epoch": 0.07042672497136823,
+      "grad_norm": 1.2951029539108276,
+      "learning_rate": 1.9990212119643516e-05,
+      "loss": 1.4729,
+      "step": 94700
+    },
+    {
+      "epoch": 0.07050109321315426,
+      "grad_norm": 0.5028135776519775,
+      "learning_rate": 1.9990191439676838e-05,
+      "loss": 1.5579,
+      "step": 94800
+    },
+    {
+      "epoch": 0.07057546145494029,
+      "grad_norm": 0.7202877998352051,
+      "learning_rate": 1.9990170737897473e-05,
+      "loss": 1.5282,
+      "step": 94900
+    },
+    {
+      "epoch": 0.07064982969672631,
+      "grad_norm": 0.9731516242027283,
+      "learning_rate": 1.9990150014305462e-05,
+      "loss": 1.5194,
+      "step": 95000
     }
   ],
   "logging_steps": 100,
       "attributes": {}
     }
   },
+  "total_flos": 1.2945898144897352e+18,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

last-checkpoint/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8bb89e5e4b20648cd50836d8df065bde229d29cb5c6085310a18725c84aab824
-size 5496

 version https://git-lfs.github.com/spec/v1
+oid sha256:ccc6594b62fe53f0b1bfeab5cb36a3d9d52c3d027d521d24a54039f0b55f3bd6
+size 5560