diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..eb638a279dc16df665e7cd491c7ffb8571ba6cd7 --- /dev/null +++ b/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: sallywww/Llama-7B +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.1.dev0 \ No newline at end of file diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5c344d276888d0aeec2576a19010354f5a57d9c9 --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "sallywww/Llama-7B", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/adapter_model.safetensors b/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..081a93206df6c132ed5797e8a9359be0f99b9852 --- /dev/null +++ b/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:322995eee76b8c0f37eec88c8db677a2dadc58f888a5f41b0dbd241ec82ea055 +size 16794200 diff --git a/checkpoint-1000/README.md b/checkpoint-1000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..eb638a279dc16df665e7cd491c7ffb8571ba6cd7 --- /dev/null +++ b/checkpoint-1000/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: sallywww/Llama-7B +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. 
More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.1.dev0 \ No newline at end of file diff --git a/checkpoint-1000/adapter_config.json b/checkpoint-1000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5c344d276888d0aeec2576a19010354f5a57d9c9 --- /dev/null +++ b/checkpoint-1000/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "sallywww/Llama-7B", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1000/adapter_model.safetensors b/checkpoint-1000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c520ed14286999c76c163ebfd0a43c62ac730b84 --- /dev/null +++ b/checkpoint-1000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61b2c604ead0cf0a1d825a8b61a4096f87be896513c953a8ade9aee0548f1922 +size 16794200 diff --git a/checkpoint-1000/optimizer.pt b/checkpoint-1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5bbcef724742b1e54eadeb7187c91c561c9d54d2 
--- /dev/null +++ b/checkpoint-1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7e36dd6b116bdbd98eafdce3f9bdbacc927e9253396028875a56311f9bfdbe4 +size 33662074 diff --git a/checkpoint-1000/rng_state.pth b/checkpoint-1000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f264ec556ff18bae3dceca135659b4830d5cf90d --- /dev/null +++ b/checkpoint-1000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2706a49fa21b5a44ba570d558ac14d4038598b6128b98b01b3aa7aaaf0db19c5 +size 14244 diff --git a/checkpoint-1000/scheduler.pt b/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..242511911a1091e7211a330ca1097e41178f5631 --- /dev/null +++ b/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc1314a31cf86530b6d57df23e6c90a09bc1cdb44a55906247fc495679415522 +size 1064 diff --git a/checkpoint-1000/trainer_state.json b/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..de5d7e6b17bef6a88def07137940bcac77472749 --- /dev/null +++ b/checkpoint-1000/trainer_state.json @@ -0,0 +1,371 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.612065941774816, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.11, + "grad_norm": 0.5909375548362732, + "learning_rate": 1.9932584269662923e-05, + "loss": 2.0237, + "step": 20 + }, + { + "epoch": 0.22, + "grad_norm": 0.5826025009155273, + "learning_rate": 1.9857677902621722e-05, + "loss": 1.9306, + "step": 40 + }, + { + "epoch": 0.34, + "grad_norm": 0.5491089820861816, + "learning_rate": 1.9782771535580525e-05, + "loss": 1.7959, + "step": 60 + }, + { + "epoch": 0.45, + "grad_norm": 1.362810730934143, + "learning_rate": 1.970786516853933e-05, + "loss": 1.6599, + "step": 80 + }, + { + "epoch": 0.56, + "grad_norm": 1.4427486658096313, + "learning_rate": 1.963295880149813e-05, + "loss": 1.5685, + "step": 100 + }, + { + "epoch": 0.67, + "grad_norm": 0.9993659257888794, + "learning_rate": 1.956179775280899e-05, + "loss": 1.4621, + "step": 120 + }, + { + "epoch": 0.79, + "grad_norm": 1.614562749862671, + "learning_rate": 1.9486891385767793e-05, + "loss": 1.31, + "step": 140 + }, + { + "epoch": 0.9, + "grad_norm": 1.1975798606872559, + "learning_rate": 1.9411985018726593e-05, + "loss": 1.2322, + "step": 160 + }, + { + "epoch": 1.01, + "grad_norm": 0.7684128880500793, + "learning_rate": 1.9337078651685396e-05, + "loss": 1.1361, + "step": 180 + }, + { + "epoch": 1.12, + "grad_norm": 0.9336960911750793, + "learning_rate": 1.9262172284644195e-05, + "loss": 1.0797, + "step": 200 + }, + { + "epoch": 1.23, + "grad_norm": 0.8471770882606506, + "learning_rate": 1.9187265917603e-05, + "loss": 1.0368, + "step": 220 + }, + { + "epoch": 1.35, + "grad_norm": 1.111340045928955, + "learning_rate": 1.9112359550561798e-05, + "loss": 0.9738, + "step": 240 + }, + { + "epoch": 1.46, + "grad_norm": 0.8093781471252441, + "learning_rate": 1.90374531835206e-05, + "loss": 0.9494, + "step": 260 + }, + { + "epoch": 1.57, + "grad_norm": 0.8438062071800232, + "learning_rate": 1.89625468164794e-05, + "loss": 0.9276, + "step": 280 + }, + { + "epoch": 1.68, + "grad_norm": 0.9896701574325562, + "learning_rate": 1.8887640449438204e-05, + "loss": 0.8656, + "step": 300 + }, + { + "epoch": 1.8, + "grad_norm": 
0.8278244137763977, + "learning_rate": 1.8812734082397007e-05, + "loss": 0.8431, + "step": 320 + }, + { + "epoch": 1.91, + "grad_norm": 0.931291937828064, + "learning_rate": 1.8737827715355807e-05, + "loss": 0.7945, + "step": 340 + }, + { + "epoch": 2.02, + "grad_norm": 1.21769380569458, + "learning_rate": 1.866292134831461e-05, + "loss": 0.7647, + "step": 360 + }, + { + "epoch": 2.13, + "grad_norm": 3.5183286666870117, + "learning_rate": 1.858801498127341e-05, + "loss": 0.7497, + "step": 380 + }, + { + "epoch": 2.24, + "grad_norm": 1.1153030395507812, + "learning_rate": 1.8513108614232212e-05, + "loss": 0.7507, + "step": 400 + }, + { + "epoch": 2.36, + "grad_norm": 1.0140526294708252, + "learning_rate": 1.8438202247191012e-05, + "loss": 0.7415, + "step": 420 + }, + { + "epoch": 2.47, + "grad_norm": 1.4395232200622559, + "learning_rate": 1.8363295880149815e-05, + "loss": 0.6947, + "step": 440 + }, + { + "epoch": 2.58, + "grad_norm": 1.4253089427947998, + "learning_rate": 1.8288389513108615e-05, + "loss": 0.7429, + "step": 460 + }, + { + "epoch": 2.69, + "grad_norm": 1.3152351379394531, + "learning_rate": 1.8213483146067418e-05, + "loss": 0.7363, + "step": 480 + }, + { + "epoch": 2.81, + "grad_norm": 2.5935957431793213, + "learning_rate": 1.8138576779026217e-05, + "loss": 0.6486, + "step": 500 + }, + { + "epoch": 2.92, + "grad_norm": 3.929158926010132, + "learning_rate": 1.806367041198502e-05, + "loss": 0.6395, + "step": 520 + }, + { + "epoch": 3.03, + "grad_norm": 1.7316572666168213, + "learning_rate": 1.7988764044943823e-05, + "loss": 0.664, + "step": 540 + }, + { + "epoch": 3.14, + "grad_norm": 1.3388841152191162, + "learning_rate": 1.7913857677902623e-05, + "loss": 0.6469, + "step": 560 + }, + { + "epoch": 3.25, + "grad_norm": 1.5258549451828003, + "learning_rate": 1.7838951310861426e-05, + "loss": 0.6662, + "step": 580 + }, + { + "epoch": 3.37, + "grad_norm": 1.5486094951629639, + "learning_rate": 1.7764044943820226e-05, + "loss": 0.566, + "step": 600 + }, + { + "epoch": 3.48, + "grad_norm": 1.5657902956008911, + "learning_rate": 1.768913857677903e-05, + "loss": 0.6166, + "step": 620 + }, + { + "epoch": 3.59, + "grad_norm": 1.5971391201019287, + "learning_rate": 1.761423220973783e-05, + "loss": 0.5973, + "step": 640 + }, + { + "epoch": 3.7, + "grad_norm": 1.333030343055725, + "learning_rate": 1.753932584269663e-05, + "loss": 0.6117, + "step": 660 + }, + { + "epoch": 3.82, + "grad_norm": 1.4425445795059204, + "learning_rate": 1.746441947565543e-05, + "loss": 0.5702, + "step": 680 + }, + { + "epoch": 3.93, + "grad_norm": 1.4773032665252686, + "learning_rate": 1.7389513108614234e-05, + "loss": 0.5465, + "step": 700 + }, + { + "epoch": 4.04, + "grad_norm": 1.3328267335891724, + "learning_rate": 1.7314606741573034e-05, + "loss": 0.5379, + "step": 720 + }, + { + "epoch": 4.15, + "grad_norm": 1.6961455345153809, + "learning_rate": 1.7239700374531837e-05, + "loss": 0.5492, + "step": 740 + }, + { + "epoch": 4.27, + "grad_norm": 1.4636189937591553, + "learning_rate": 1.7164794007490637e-05, + "loss": 0.547, + "step": 760 + }, + { + "epoch": 4.38, + "grad_norm": 2.1686649322509766, + "learning_rate": 1.708988764044944e-05, + "loss": 0.5424, + "step": 780 + }, + { + "epoch": 4.49, + "grad_norm": 1.219388723373413, + "learning_rate": 1.7014981273408243e-05, + "loss": 0.5373, + "step": 800 + }, + { + "epoch": 4.6, + "grad_norm": 1.5566452741622925, + "learning_rate": 1.6940074906367042e-05, + "loss": 0.4944, + "step": 820 + }, + { + "epoch": 4.71, + "grad_norm": 1.598917841911316, + "learning_rate": 
1.6865168539325845e-05, + "loss": 0.5036, + "step": 840 + }, + { + "epoch": 4.83, + "grad_norm": 1.5281039476394653, + "learning_rate": 1.6790262172284645e-05, + "loss": 0.5215, + "step": 860 + }, + { + "epoch": 4.94, + "grad_norm": 1.7123130559921265, + "learning_rate": 1.6715355805243448e-05, + "loss": 0.5362, + "step": 880 + }, + { + "epoch": 5.05, + "grad_norm": 1.543447732925415, + "learning_rate": 1.6640449438202248e-05, + "loss": 0.5379, + "step": 900 + }, + { + "epoch": 5.16, + "grad_norm": 2.4190192222595215, + "learning_rate": 1.656554307116105e-05, + "loss": 0.4921, + "step": 920 + }, + { + "epoch": 5.28, + "grad_norm": 2.190906047821045, + "learning_rate": 1.649063670411985e-05, + "loss": 0.4652, + "step": 940 + }, + { + "epoch": 5.39, + "grad_norm": 2.113476514816284, + "learning_rate": 1.6415730337078653e-05, + "loss": 0.4914, + "step": 960 + }, + { + "epoch": 5.5, + "grad_norm": 1.8785656690597534, + "learning_rate": 1.6340823970037453e-05, + "loss": 0.5135, + "step": 980 + }, + { + "epoch": 5.61, + "grad_norm": 1.3745977878570557, + "learning_rate": 1.6265917602996256e-05, + "loss": 0.4697, + "step": 1000 + } + ], + "logging_steps": 20, + "max_steps": 5340, + "num_input_tokens_seen": 0, + "num_train_epochs": 30, + "save_steps": 500, + "total_flos": 2.5991277871104e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1000/training_args.bin b/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7c33ce269deb9707d9fc7879dba5d8878e16b09a --- /dev/null +++ b/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f06c12b5819a4aa76501d6843eae3aabbe49b1a33a2903a44bc34146ab4a74b6 +size 4920 diff --git a/checkpoint-1500/README.md b/checkpoint-1500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..eb638a279dc16df665e7cd491c7ffb8571ba6cd7 --- /dev/null +++ b/checkpoint-1500/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: sallywww/Llama-7B +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
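Since the card still reads "[More Information Needed]" here, the following is only a minimal sketch of how a PEFT LoRA adapter like this one is typically loaded on top of its base model (`sallywww/Llama-7B`, per `adapter_config.json`). The adapter path is a placeholder for this repository or a local checkout, and the prompt is illustrative; this is not code from the original authors.

```python
# Minimal sketch (not from the original card): attach the LoRA adapter to the
# base model named in adapter_config.json and run a short generation.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE_MODEL = "sallywww/Llama-7B"       # "base_model_name_or_path" in adapter_config.json
ADAPTER_PATH = "path/to/this/adapter"  # placeholder: repo id or local directory

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, torch_dtype=torch.float16, device_map="auto"
)
model = PeftModel.from_pretrained(base, ADAPTER_PATH)
model.eval()

inputs = tokenizer("Example prompt:", return_tensors="pt").to(base.device)
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```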
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.1.dev0 \ No newline at end of file diff --git a/checkpoint-1500/adapter_config.json b/checkpoint-1500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5c344d276888d0aeec2576a19010354f5a57d9c9 --- /dev/null +++ b/checkpoint-1500/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "sallywww/Llama-7B", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1500/adapter_model.safetensors b/checkpoint-1500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bb05853b03f7affc0b393101f4bc5e40fd16bf5b --- /dev/null +++ b/checkpoint-1500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ff31695298b9be3b6fcd71ccf123f331b736ec12d764be1ee04aeeacf3720f2 +size 16794200 diff --git a/checkpoint-1500/optimizer.pt b/checkpoint-1500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e69c655636a3b6d70a6d00d3e7bdcf30cb2de62e --- /dev/null +++ b/checkpoint-1500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:559f0868d58974fef2ee217d4246fb98b4e13020948ac8a45cebf261e9f6b1f5 +size 33662074 diff --git a/checkpoint-1500/rng_state.pth b/checkpoint-1500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a9940445424322e0d05edd687e27cf205e5cfe06 --- /dev/null +++ b/checkpoint-1500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f46b1b617ab078d379f02aa6edd73ab2de2a6998f4b34b6d2892f59c64fc3cf +size 14244 diff --git a/checkpoint-1500/scheduler.pt b/checkpoint-1500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d579adafcfa9f5f608108398321c83d980062061 --- /dev/null +++ b/checkpoint-1500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b52b722be306a1f27bd6fadd28cb76f1979dcd76e35029a0bc5191bb947e1404 +size 1064 diff --git a/checkpoint-1500/trainer_state.json b/checkpoint-1500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4446d0b66c7ff6aaf77d3c5e80d3a007c152da1e --- /dev/null +++ b/checkpoint-1500/trainer_state.json @@ -0,0 +1,546 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.418098912662224, + "eval_steps": 500, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.11, + "grad_norm": 0.5909375548362732, + "learning_rate": 1.9932584269662923e-05, + "loss": 2.0237, + "step": 20 + }, + { + "epoch": 0.22, + "grad_norm": 0.5826025009155273, + "learning_rate": 1.9857677902621722e-05, + "loss": 1.9306, + "step": 40 + }, + { + "epoch": 0.34, + "grad_norm": 0.5491089820861816, + "learning_rate": 1.9782771535580525e-05, + "loss": 1.7959, + "step": 60 + }, + { + "epoch": 0.45, + "grad_norm": 1.362810730934143, + "learning_rate": 1.970786516853933e-05, + "loss": 1.6599, + "step": 80 + }, + { + "epoch": 0.56, + "grad_norm": 1.4427486658096313, + "learning_rate": 1.963295880149813e-05, + "loss": 1.5685, + "step": 100 + }, + { + "epoch": 0.67, + "grad_norm": 0.9993659257888794, + "learning_rate": 1.956179775280899e-05, + "loss": 1.4621, + "step": 120 + }, + { + "epoch": 0.79, + "grad_norm": 1.614562749862671, + "learning_rate": 1.9486891385767793e-05, + "loss": 1.31, + "step": 140 + }, + { + "epoch": 0.9, + "grad_norm": 1.1975798606872559, + "learning_rate": 1.9411985018726593e-05, + "loss": 1.2322, + "step": 160 + }, + { + "epoch": 1.01, + "grad_norm": 0.7684128880500793, + "learning_rate": 1.9337078651685396e-05, + "loss": 1.1361, + "step": 180 + }, + { + "epoch": 1.12, + "grad_norm": 0.9336960911750793, + "learning_rate": 1.9262172284644195e-05, + "loss": 1.0797, + "step": 200 + }, + { + "epoch": 1.23, + "grad_norm": 0.8471770882606506, + "learning_rate": 1.9187265917603e-05, + "loss": 1.0368, + "step": 220 + }, + { + "epoch": 1.35, + "grad_norm": 1.111340045928955, + "learning_rate": 1.9112359550561798e-05, + "loss": 0.9738, + "step": 240 + }, + { + "epoch": 1.46, + "grad_norm": 0.8093781471252441, + "learning_rate": 1.90374531835206e-05, + "loss": 0.9494, + "step": 260 + }, + { + "epoch": 1.57, + "grad_norm": 0.8438062071800232, + "learning_rate": 1.89625468164794e-05, + "loss": 0.9276, + "step": 280 + }, + { + "epoch": 1.68, + "grad_norm": 0.9896701574325562, + "learning_rate": 1.8887640449438204e-05, + "loss": 0.8656, + "step": 300 + }, + { + "epoch": 1.8, + "grad_norm": 0.8278244137763977, + "learning_rate": 1.8812734082397007e-05, + "loss": 0.8431, + "step": 320 + }, + { + "epoch": 1.91, + 
"grad_norm": 0.931291937828064, + "learning_rate": 1.8737827715355807e-05, + "loss": 0.7945, + "step": 340 + }, + { + "epoch": 2.02, + "grad_norm": 1.21769380569458, + "learning_rate": 1.866292134831461e-05, + "loss": 0.7647, + "step": 360 + }, + { + "epoch": 2.13, + "grad_norm": 3.5183286666870117, + "learning_rate": 1.858801498127341e-05, + "loss": 0.7497, + "step": 380 + }, + { + "epoch": 2.24, + "grad_norm": 1.1153030395507812, + "learning_rate": 1.8513108614232212e-05, + "loss": 0.7507, + "step": 400 + }, + { + "epoch": 2.36, + "grad_norm": 1.0140526294708252, + "learning_rate": 1.8438202247191012e-05, + "loss": 0.7415, + "step": 420 + }, + { + "epoch": 2.47, + "grad_norm": 1.4395232200622559, + "learning_rate": 1.8363295880149815e-05, + "loss": 0.6947, + "step": 440 + }, + { + "epoch": 2.58, + "grad_norm": 1.4253089427947998, + "learning_rate": 1.8288389513108615e-05, + "loss": 0.7429, + "step": 460 + }, + { + "epoch": 2.69, + "grad_norm": 1.3152351379394531, + "learning_rate": 1.8213483146067418e-05, + "loss": 0.7363, + "step": 480 + }, + { + "epoch": 2.81, + "grad_norm": 2.5935957431793213, + "learning_rate": 1.8138576779026217e-05, + "loss": 0.6486, + "step": 500 + }, + { + "epoch": 2.92, + "grad_norm": 3.929158926010132, + "learning_rate": 1.806367041198502e-05, + "loss": 0.6395, + "step": 520 + }, + { + "epoch": 3.03, + "grad_norm": 1.7316572666168213, + "learning_rate": 1.7988764044943823e-05, + "loss": 0.664, + "step": 540 + }, + { + "epoch": 3.14, + "grad_norm": 1.3388841152191162, + "learning_rate": 1.7913857677902623e-05, + "loss": 0.6469, + "step": 560 + }, + { + "epoch": 3.25, + "grad_norm": 1.5258549451828003, + "learning_rate": 1.7838951310861426e-05, + "loss": 0.6662, + "step": 580 + }, + { + "epoch": 3.37, + "grad_norm": 1.5486094951629639, + "learning_rate": 1.7764044943820226e-05, + "loss": 0.566, + "step": 600 + }, + { + "epoch": 3.48, + "grad_norm": 1.5657902956008911, + "learning_rate": 1.768913857677903e-05, + "loss": 0.6166, + "step": 620 + }, + { + "epoch": 3.59, + "grad_norm": 1.5971391201019287, + "learning_rate": 1.761423220973783e-05, + "loss": 0.5973, + "step": 640 + }, + { + "epoch": 3.7, + "grad_norm": 1.333030343055725, + "learning_rate": 1.753932584269663e-05, + "loss": 0.6117, + "step": 660 + }, + { + "epoch": 3.82, + "grad_norm": 1.4425445795059204, + "learning_rate": 1.746441947565543e-05, + "loss": 0.5702, + "step": 680 + }, + { + "epoch": 3.93, + "grad_norm": 1.4773032665252686, + "learning_rate": 1.7389513108614234e-05, + "loss": 0.5465, + "step": 700 + }, + { + "epoch": 4.04, + "grad_norm": 1.3328267335891724, + "learning_rate": 1.7314606741573034e-05, + "loss": 0.5379, + "step": 720 + }, + { + "epoch": 4.15, + "grad_norm": 1.6961455345153809, + "learning_rate": 1.7239700374531837e-05, + "loss": 0.5492, + "step": 740 + }, + { + "epoch": 4.27, + "grad_norm": 1.4636189937591553, + "learning_rate": 1.7164794007490637e-05, + "loss": 0.547, + "step": 760 + }, + { + "epoch": 4.38, + "grad_norm": 2.1686649322509766, + "learning_rate": 1.708988764044944e-05, + "loss": 0.5424, + "step": 780 + }, + { + "epoch": 4.49, + "grad_norm": 1.219388723373413, + "learning_rate": 1.7014981273408243e-05, + "loss": 0.5373, + "step": 800 + }, + { + "epoch": 4.6, + "grad_norm": 1.5566452741622925, + "learning_rate": 1.6940074906367042e-05, + "loss": 0.4944, + "step": 820 + }, + { + "epoch": 4.71, + "grad_norm": 1.598917841911316, + "learning_rate": 1.6865168539325845e-05, + "loss": 0.5036, + "step": 840 + }, + { + "epoch": 4.83, + "grad_norm": 1.5281039476394653, + 
"learning_rate": 1.6790262172284645e-05, + "loss": 0.5215, + "step": 860 + }, + { + "epoch": 4.94, + "grad_norm": 1.7123130559921265, + "learning_rate": 1.6715355805243448e-05, + "loss": 0.5362, + "step": 880 + }, + { + "epoch": 5.05, + "grad_norm": 1.543447732925415, + "learning_rate": 1.6640449438202248e-05, + "loss": 0.5379, + "step": 900 + }, + { + "epoch": 5.16, + "grad_norm": 2.4190192222595215, + "learning_rate": 1.656554307116105e-05, + "loss": 0.4921, + "step": 920 + }, + { + "epoch": 5.28, + "grad_norm": 2.190906047821045, + "learning_rate": 1.649063670411985e-05, + "loss": 0.4652, + "step": 940 + }, + { + "epoch": 5.39, + "grad_norm": 2.113476514816284, + "learning_rate": 1.6415730337078653e-05, + "loss": 0.4914, + "step": 960 + }, + { + "epoch": 5.5, + "grad_norm": 1.8785656690597534, + "learning_rate": 1.6340823970037453e-05, + "loss": 0.5135, + "step": 980 + }, + { + "epoch": 5.61, + "grad_norm": 1.3745977878570557, + "learning_rate": 1.6265917602996256e-05, + "loss": 0.4697, + "step": 1000 + }, + { + "epoch": 5.72, + "grad_norm": 1.7874308824539185, + "learning_rate": 1.6191011235955056e-05, + "loss": 0.4625, + "step": 1020 + }, + { + "epoch": 5.84, + "grad_norm": 1.4448940753936768, + "learning_rate": 1.611610486891386e-05, + "loss": 0.4764, + "step": 1040 + }, + { + "epoch": 5.95, + "grad_norm": 2.278655767440796, + "learning_rate": 1.6041198501872662e-05, + "loss": 0.4221, + "step": 1060 + }, + { + "epoch": 6.06, + "grad_norm": 1.8602409362792969, + "learning_rate": 1.596629213483146e-05, + "loss": 0.4731, + "step": 1080 + }, + { + "epoch": 6.17, + "grad_norm": 1.884373426437378, + "learning_rate": 1.5891385767790265e-05, + "loss": 0.4241, + "step": 1100 + }, + { + "epoch": 6.29, + "grad_norm": 2.0259287357330322, + "learning_rate": 1.5816479400749064e-05, + "loss": 0.4368, + "step": 1120 + }, + { + "epoch": 6.4, + "grad_norm": 1.812462329864502, + "learning_rate": 1.5741573033707867e-05, + "loss": 0.442, + "step": 1140 + }, + { + "epoch": 6.51, + "grad_norm": 1.934327483177185, + "learning_rate": 1.5666666666666667e-05, + "loss": 0.4195, + "step": 1160 + }, + { + "epoch": 6.62, + "grad_norm": 1.6152955293655396, + "learning_rate": 1.559176029962547e-05, + "loss": 0.4374, + "step": 1180 + }, + { + "epoch": 6.73, + "grad_norm": 2.7782068252563477, + "learning_rate": 1.551685393258427e-05, + "loss": 0.4231, + "step": 1200 + }, + { + "epoch": 6.85, + "grad_norm": 2.372976303100586, + "learning_rate": 1.5441947565543073e-05, + "loss": 0.444, + "step": 1220 + }, + { + "epoch": 6.96, + "grad_norm": 2.171353816986084, + "learning_rate": 1.5367041198501872e-05, + "loss": 0.4389, + "step": 1240 + }, + { + "epoch": 7.07, + "grad_norm": 1.3093984127044678, + "learning_rate": 1.5292134831460675e-05, + "loss": 0.4301, + "step": 1260 + }, + { + "epoch": 7.18, + "grad_norm": 2.267932176589966, + "learning_rate": 1.5217228464419478e-05, + "loss": 0.4046, + "step": 1280 + }, + { + "epoch": 7.3, + "grad_norm": 1.5326164960861206, + "learning_rate": 1.514232209737828e-05, + "loss": 0.4068, + "step": 1300 + }, + { + "epoch": 7.41, + "grad_norm": 3.1525979042053223, + "learning_rate": 1.5067415730337081e-05, + "loss": 0.3847, + "step": 1320 + }, + { + "epoch": 7.52, + "grad_norm": 2.081890106201172, + "learning_rate": 1.4992509363295882e-05, + "loss": 0.4126, + "step": 1340 + }, + { + "epoch": 7.63, + "grad_norm": 2.5701358318328857, + "learning_rate": 1.4917602996254684e-05, + "loss": 0.4065, + "step": 1360 + }, + { + "epoch": 7.74, + "grad_norm": 1.4190051555633545, + "learning_rate": 
1.4842696629213485e-05, + "loss": 0.3979, + "step": 1380 + }, + { + "epoch": 7.86, + "grad_norm": 1.9085837602615356, + "learning_rate": 1.4767790262172286e-05, + "loss": 0.3894, + "step": 1400 + }, + { + "epoch": 7.97, + "grad_norm": 1.7573003768920898, + "learning_rate": 1.4692883895131088e-05, + "loss": 0.3751, + "step": 1420 + }, + { + "epoch": 8.08, + "grad_norm": 1.8974506855010986, + "learning_rate": 1.4617977528089889e-05, + "loss": 0.3936, + "step": 1440 + }, + { + "epoch": 8.19, + "grad_norm": 1.3843660354614258, + "learning_rate": 1.454307116104869e-05, + "loss": 0.3848, + "step": 1460 + }, + { + "epoch": 8.31, + "grad_norm": 1.525007724761963, + "learning_rate": 1.4468164794007492e-05, + "loss": 0.3552, + "step": 1480 + }, + { + "epoch": 8.42, + "grad_norm": 2.1665101051330566, + "learning_rate": 1.4393258426966291e-05, + "loss": 0.3547, + "step": 1500 + } + ], + "logging_steps": 20, + "max_steps": 5340, + "num_input_tokens_seen": 0, + "num_train_epochs": 30, + "save_steps": 500, + "total_flos": 3.8986916806656e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1500/training_args.bin b/checkpoint-1500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7c33ce269deb9707d9fc7879dba5d8878e16b09a --- /dev/null +++ b/checkpoint-1500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f06c12b5819a4aa76501d6843eae3aabbe49b1a33a2903a44bc34146ab4a74b6 +size 4920 diff --git a/checkpoint-2000/README.md b/checkpoint-2000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..eb638a279dc16df665e7cd491c7ffb8571ba6cd7 --- /dev/null +++ b/checkpoint-2000/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: sallywww/Llama-7B +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
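As a point of reference, the values stored in `adapter_config.json` (`r=8`, `lora_alpha=32`, `lora_dropout=0.1`, adapters on the attention `q_proj`/`v_proj` projections, `task_type="CAUSAL_LM"`) correspond roughly to the `peft.LoraConfig` sketched below. This only restates the saved configuration for readability; it is not the original training script, and the base-model call is illustrative.

```python
# Sketch of a LoraConfig mirroring the values saved in adapter_config.json.
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM

lora_config = LoraConfig(
    r=8,                                  # "r": 8
    lora_alpha=32,                        # "lora_alpha": 32
    lora_dropout=0.1,                     # "lora_dropout": 0.1
    target_modules=["q_proj", "v_proj"],  # attention query/value projections
    bias="none",                          # "bias": "none"
    task_type=TaskType.CAUSAL_LM,         # "task_type": "CAUSAL_LM"
)

base = AutoModelForCausalLM.from_pretrained("sallywww/Llama-7B")
model = get_peft_model(base, lora_config)
model.print_trainable_parameters()  # only the low-rank adapter weights train
```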
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.1.dev0 \ No newline at end of file diff --git a/checkpoint-2000/adapter_config.json b/checkpoint-2000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5c344d276888d0aeec2576a19010354f5a57d9c9 --- /dev/null +++ b/checkpoint-2000/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "sallywww/Llama-7B", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2000/adapter_model.safetensors b/checkpoint-2000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d5deb80d334f9e94dfe86bf939a15763296c1b8d --- /dev/null +++ b/checkpoint-2000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd613ed96e17d21d0945f5501544448e1b8642624f98437d52f35d1d4c8e5f8c +size 16794200 diff --git a/checkpoint-2000/optimizer.pt b/checkpoint-2000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0aad2fa6e708ed4efb1015ce4f84fe8344f31c0b --- /dev/null +++ b/checkpoint-2000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:518aaaf2582cfdbc4732e00c710f2caee0326357ff4232a21da19f4c9440f012 +size 33662074 diff --git a/checkpoint-2000/rng_state.pth b/checkpoint-2000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4f553e49913631679a87d1c090b396b923f78fb8 --- /dev/null +++ b/checkpoint-2000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1db8ab10bef9f40ccd736ef23349a4646ad25eef39a69b9f918c14bcf25f36ff +size 14244 diff --git a/checkpoint-2000/scheduler.pt b/checkpoint-2000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9d2d1dcb86f6f467686d305faa13f273c5574fa8 --- /dev/null +++ b/checkpoint-2000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df45828308b99d6f2a907ebbbe6b5adce87512a3daaba9a08e9c034f2286ed55 +size 1064 diff --git a/checkpoint-2000/trainer_state.json b/checkpoint-2000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a75036003f51e292e83eb2c7afbeb6e91429b959 --- /dev/null +++ b/checkpoint-2000/trainer_state.json @@ -0,0 +1,721 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 11.224131883549632, + "eval_steps": 500, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.11, + "grad_norm": 0.5909375548362732, + "learning_rate": 1.9932584269662923e-05, + "loss": 2.0237, + "step": 20 + }, + { + "epoch": 0.22, + "grad_norm": 0.5826025009155273, + "learning_rate": 1.9857677902621722e-05, + "loss": 1.9306, + "step": 40 + }, + { + "epoch": 0.34, + "grad_norm": 0.5491089820861816, + "learning_rate": 1.9782771535580525e-05, + "loss": 1.7959, + "step": 60 + }, + { + "epoch": 0.45, + "grad_norm": 1.362810730934143, + "learning_rate": 1.970786516853933e-05, + "loss": 1.6599, + "step": 80 + }, + { + "epoch": 0.56, + "grad_norm": 1.4427486658096313, + "learning_rate": 1.963295880149813e-05, + "loss": 1.5685, + "step": 100 + }, + { + "epoch": 0.67, + "grad_norm": 0.9993659257888794, + "learning_rate": 1.956179775280899e-05, + "loss": 1.4621, + "step": 120 + }, + { + "epoch": 0.79, + "grad_norm": 1.614562749862671, + "learning_rate": 1.9486891385767793e-05, + "loss": 1.31, + "step": 140 + }, + { + "epoch": 0.9, + "grad_norm": 1.1975798606872559, + "learning_rate": 1.9411985018726593e-05, + "loss": 1.2322, + "step": 160 + }, + { + "epoch": 1.01, + "grad_norm": 0.7684128880500793, + "learning_rate": 1.9337078651685396e-05, + "loss": 1.1361, + "step": 180 + }, + { + "epoch": 1.12, + "grad_norm": 0.9336960911750793, + "learning_rate": 1.9262172284644195e-05, + "loss": 1.0797, + "step": 200 + }, + { + "epoch": 1.23, + "grad_norm": 0.8471770882606506, + "learning_rate": 1.9187265917603e-05, + "loss": 1.0368, + "step": 220 + }, + { + "epoch": 1.35, + "grad_norm": 1.111340045928955, + "learning_rate": 1.9112359550561798e-05, + "loss": 0.9738, + "step": 240 + }, + { + "epoch": 1.46, + "grad_norm": 0.8093781471252441, + "learning_rate": 1.90374531835206e-05, + "loss": 0.9494, + "step": 260 + }, + { + "epoch": 1.57, + "grad_norm": 0.8438062071800232, + "learning_rate": 1.89625468164794e-05, + "loss": 0.9276, + "step": 280 + }, + { + "epoch": 1.68, + "grad_norm": 0.9896701574325562, + "learning_rate": 1.8887640449438204e-05, + "loss": 0.8656, + "step": 300 + }, + { + "epoch": 1.8, + "grad_norm": 0.8278244137763977, + "learning_rate": 1.8812734082397007e-05, + "loss": 0.8431, + "step": 320 + }, + { + "epoch": 1.91, + 
"grad_norm": 0.931291937828064, + "learning_rate": 1.8737827715355807e-05, + "loss": 0.7945, + "step": 340 + }, + { + "epoch": 2.02, + "grad_norm": 1.21769380569458, + "learning_rate": 1.866292134831461e-05, + "loss": 0.7647, + "step": 360 + }, + { + "epoch": 2.13, + "grad_norm": 3.5183286666870117, + "learning_rate": 1.858801498127341e-05, + "loss": 0.7497, + "step": 380 + }, + { + "epoch": 2.24, + "grad_norm": 1.1153030395507812, + "learning_rate": 1.8513108614232212e-05, + "loss": 0.7507, + "step": 400 + }, + { + "epoch": 2.36, + "grad_norm": 1.0140526294708252, + "learning_rate": 1.8438202247191012e-05, + "loss": 0.7415, + "step": 420 + }, + { + "epoch": 2.47, + "grad_norm": 1.4395232200622559, + "learning_rate": 1.8363295880149815e-05, + "loss": 0.6947, + "step": 440 + }, + { + "epoch": 2.58, + "grad_norm": 1.4253089427947998, + "learning_rate": 1.8288389513108615e-05, + "loss": 0.7429, + "step": 460 + }, + { + "epoch": 2.69, + "grad_norm": 1.3152351379394531, + "learning_rate": 1.8213483146067418e-05, + "loss": 0.7363, + "step": 480 + }, + { + "epoch": 2.81, + "grad_norm": 2.5935957431793213, + "learning_rate": 1.8138576779026217e-05, + "loss": 0.6486, + "step": 500 + }, + { + "epoch": 2.92, + "grad_norm": 3.929158926010132, + "learning_rate": 1.806367041198502e-05, + "loss": 0.6395, + "step": 520 + }, + { + "epoch": 3.03, + "grad_norm": 1.7316572666168213, + "learning_rate": 1.7988764044943823e-05, + "loss": 0.664, + "step": 540 + }, + { + "epoch": 3.14, + "grad_norm": 1.3388841152191162, + "learning_rate": 1.7913857677902623e-05, + "loss": 0.6469, + "step": 560 + }, + { + "epoch": 3.25, + "grad_norm": 1.5258549451828003, + "learning_rate": 1.7838951310861426e-05, + "loss": 0.6662, + "step": 580 + }, + { + "epoch": 3.37, + "grad_norm": 1.5486094951629639, + "learning_rate": 1.7764044943820226e-05, + "loss": 0.566, + "step": 600 + }, + { + "epoch": 3.48, + "grad_norm": 1.5657902956008911, + "learning_rate": 1.768913857677903e-05, + "loss": 0.6166, + "step": 620 + }, + { + "epoch": 3.59, + "grad_norm": 1.5971391201019287, + "learning_rate": 1.761423220973783e-05, + "loss": 0.5973, + "step": 640 + }, + { + "epoch": 3.7, + "grad_norm": 1.333030343055725, + "learning_rate": 1.753932584269663e-05, + "loss": 0.6117, + "step": 660 + }, + { + "epoch": 3.82, + "grad_norm": 1.4425445795059204, + "learning_rate": 1.746441947565543e-05, + "loss": 0.5702, + "step": 680 + }, + { + "epoch": 3.93, + "grad_norm": 1.4773032665252686, + "learning_rate": 1.7389513108614234e-05, + "loss": 0.5465, + "step": 700 + }, + { + "epoch": 4.04, + "grad_norm": 1.3328267335891724, + "learning_rate": 1.7314606741573034e-05, + "loss": 0.5379, + "step": 720 + }, + { + "epoch": 4.15, + "grad_norm": 1.6961455345153809, + "learning_rate": 1.7239700374531837e-05, + "loss": 0.5492, + "step": 740 + }, + { + "epoch": 4.27, + "grad_norm": 1.4636189937591553, + "learning_rate": 1.7164794007490637e-05, + "loss": 0.547, + "step": 760 + }, + { + "epoch": 4.38, + "grad_norm": 2.1686649322509766, + "learning_rate": 1.708988764044944e-05, + "loss": 0.5424, + "step": 780 + }, + { + "epoch": 4.49, + "grad_norm": 1.219388723373413, + "learning_rate": 1.7014981273408243e-05, + "loss": 0.5373, + "step": 800 + }, + { + "epoch": 4.6, + "grad_norm": 1.5566452741622925, + "learning_rate": 1.6940074906367042e-05, + "loss": 0.4944, + "step": 820 + }, + { + "epoch": 4.71, + "grad_norm": 1.598917841911316, + "learning_rate": 1.6865168539325845e-05, + "loss": 0.5036, + "step": 840 + }, + { + "epoch": 4.83, + "grad_norm": 1.5281039476394653, + 
"learning_rate": 1.6790262172284645e-05, + "loss": 0.5215, + "step": 860 + }, + { + "epoch": 4.94, + "grad_norm": 1.7123130559921265, + "learning_rate": 1.6715355805243448e-05, + "loss": 0.5362, + "step": 880 + }, + { + "epoch": 5.05, + "grad_norm": 1.543447732925415, + "learning_rate": 1.6640449438202248e-05, + "loss": 0.5379, + "step": 900 + }, + { + "epoch": 5.16, + "grad_norm": 2.4190192222595215, + "learning_rate": 1.656554307116105e-05, + "loss": 0.4921, + "step": 920 + }, + { + "epoch": 5.28, + "grad_norm": 2.190906047821045, + "learning_rate": 1.649063670411985e-05, + "loss": 0.4652, + "step": 940 + }, + { + "epoch": 5.39, + "grad_norm": 2.113476514816284, + "learning_rate": 1.6415730337078653e-05, + "loss": 0.4914, + "step": 960 + }, + { + "epoch": 5.5, + "grad_norm": 1.8785656690597534, + "learning_rate": 1.6340823970037453e-05, + "loss": 0.5135, + "step": 980 + }, + { + "epoch": 5.61, + "grad_norm": 1.3745977878570557, + "learning_rate": 1.6265917602996256e-05, + "loss": 0.4697, + "step": 1000 + }, + { + "epoch": 5.72, + "grad_norm": 1.7874308824539185, + "learning_rate": 1.6191011235955056e-05, + "loss": 0.4625, + "step": 1020 + }, + { + "epoch": 5.84, + "grad_norm": 1.4448940753936768, + "learning_rate": 1.611610486891386e-05, + "loss": 0.4764, + "step": 1040 + }, + { + "epoch": 5.95, + "grad_norm": 2.278655767440796, + "learning_rate": 1.6041198501872662e-05, + "loss": 0.4221, + "step": 1060 + }, + { + "epoch": 6.06, + "grad_norm": 1.8602409362792969, + "learning_rate": 1.596629213483146e-05, + "loss": 0.4731, + "step": 1080 + }, + { + "epoch": 6.17, + "grad_norm": 1.884373426437378, + "learning_rate": 1.5891385767790265e-05, + "loss": 0.4241, + "step": 1100 + }, + { + "epoch": 6.29, + "grad_norm": 2.0259287357330322, + "learning_rate": 1.5816479400749064e-05, + "loss": 0.4368, + "step": 1120 + }, + { + "epoch": 6.4, + "grad_norm": 1.812462329864502, + "learning_rate": 1.5741573033707867e-05, + "loss": 0.442, + "step": 1140 + }, + { + "epoch": 6.51, + "grad_norm": 1.934327483177185, + "learning_rate": 1.5666666666666667e-05, + "loss": 0.4195, + "step": 1160 + }, + { + "epoch": 6.62, + "grad_norm": 1.6152955293655396, + "learning_rate": 1.559176029962547e-05, + "loss": 0.4374, + "step": 1180 + }, + { + "epoch": 6.73, + "grad_norm": 2.7782068252563477, + "learning_rate": 1.551685393258427e-05, + "loss": 0.4231, + "step": 1200 + }, + { + "epoch": 6.85, + "grad_norm": 2.372976303100586, + "learning_rate": 1.5441947565543073e-05, + "loss": 0.444, + "step": 1220 + }, + { + "epoch": 6.96, + "grad_norm": 2.171353816986084, + "learning_rate": 1.5367041198501872e-05, + "loss": 0.4389, + "step": 1240 + }, + { + "epoch": 7.07, + "grad_norm": 1.3093984127044678, + "learning_rate": 1.5292134831460675e-05, + "loss": 0.4301, + "step": 1260 + }, + { + "epoch": 7.18, + "grad_norm": 2.267932176589966, + "learning_rate": 1.5217228464419478e-05, + "loss": 0.4046, + "step": 1280 + }, + { + "epoch": 7.3, + "grad_norm": 1.5326164960861206, + "learning_rate": 1.514232209737828e-05, + "loss": 0.4068, + "step": 1300 + }, + { + "epoch": 7.41, + "grad_norm": 3.1525979042053223, + "learning_rate": 1.5067415730337081e-05, + "loss": 0.3847, + "step": 1320 + }, + { + "epoch": 7.52, + "grad_norm": 2.081890106201172, + "learning_rate": 1.4992509363295882e-05, + "loss": 0.4126, + "step": 1340 + }, + { + "epoch": 7.63, + "grad_norm": 2.5701358318328857, + "learning_rate": 1.4917602996254684e-05, + "loss": 0.4065, + "step": 1360 + }, + { + "epoch": 7.74, + "grad_norm": 1.4190051555633545, + "learning_rate": 
1.4842696629213485e-05, + "loss": 0.3979, + "step": 1380 + }, + { + "epoch": 7.86, + "grad_norm": 1.9085837602615356, + "learning_rate": 1.4767790262172286e-05, + "loss": 0.3894, + "step": 1400 + }, + { + "epoch": 7.97, + "grad_norm": 1.7573003768920898, + "learning_rate": 1.4692883895131088e-05, + "loss": 0.3751, + "step": 1420 + }, + { + "epoch": 8.08, + "grad_norm": 1.8974506855010986, + "learning_rate": 1.4617977528089889e-05, + "loss": 0.3936, + "step": 1440 + }, + { + "epoch": 8.19, + "grad_norm": 1.3843660354614258, + "learning_rate": 1.454307116104869e-05, + "loss": 0.3848, + "step": 1460 + }, + { + "epoch": 8.31, + "grad_norm": 1.525007724761963, + "learning_rate": 1.4468164794007492e-05, + "loss": 0.3552, + "step": 1480 + }, + { + "epoch": 8.42, + "grad_norm": 2.1665101051330566, + "learning_rate": 1.4393258426966291e-05, + "loss": 0.3547, + "step": 1500 + }, + { + "epoch": 8.53, + "grad_norm": 3.3614535331726074, + "learning_rate": 1.4318352059925096e-05, + "loss": 0.3771, + "step": 1520 + }, + { + "epoch": 8.64, + "grad_norm": 1.746299386024475, + "learning_rate": 1.4243445692883898e-05, + "loss": 0.396, + "step": 1540 + }, + { + "epoch": 8.75, + "grad_norm": 1.9144684076309204, + "learning_rate": 1.4168539325842699e-05, + "loss": 0.3748, + "step": 1560 + }, + { + "epoch": 8.87, + "grad_norm": 1.9617277383804321, + "learning_rate": 1.40936329588015e-05, + "loss": 0.3504, + "step": 1580 + }, + { + "epoch": 8.98, + "grad_norm": 2.69067645072937, + "learning_rate": 1.4018726591760302e-05, + "loss": 0.3477, + "step": 1600 + }, + { + "epoch": 9.09, + "grad_norm": 2.142008066177368, + "learning_rate": 1.3943820224719103e-05, + "loss": 0.3539, + "step": 1620 + }, + { + "epoch": 9.2, + "grad_norm": 1.7684266567230225, + "learning_rate": 1.3868913857677904e-05, + "loss": 0.3576, + "step": 1640 + }, + { + "epoch": 9.32, + "grad_norm": 1.4222275018692017, + "learning_rate": 1.3794007490636706e-05, + "loss": 0.3839, + "step": 1660 + }, + { + "epoch": 9.43, + "grad_norm": 2.0622501373291016, + "learning_rate": 1.3719101123595507e-05, + "loss": 0.3278, + "step": 1680 + }, + { + "epoch": 9.54, + "grad_norm": 1.639147400856018, + "learning_rate": 1.3644194756554308e-05, + "loss": 0.3374, + "step": 1700 + }, + { + "epoch": 9.65, + "grad_norm": 2.093045473098755, + "learning_rate": 1.356928838951311e-05, + "loss": 0.3535, + "step": 1720 + }, + { + "epoch": 9.76, + "grad_norm": 1.3492937088012695, + "learning_rate": 1.3494382022471911e-05, + "loss": 0.3105, + "step": 1740 + }, + { + "epoch": 9.88, + "grad_norm": 1.585205316543579, + "learning_rate": 1.3419475655430714e-05, + "loss": 0.3181, + "step": 1760 + }, + { + "epoch": 9.99, + "grad_norm": 2.8895344734191895, + "learning_rate": 1.3344569288389515e-05, + "loss": 0.3473, + "step": 1780 + }, + { + "epoch": 10.1, + "grad_norm": 1.7224748134613037, + "learning_rate": 1.3269662921348317e-05, + "loss": 0.3524, + "step": 1800 + }, + { + "epoch": 10.21, + "grad_norm": 2.1029868125915527, + "learning_rate": 1.3194756554307118e-05, + "loss": 0.3408, + "step": 1820 + }, + { + "epoch": 10.33, + "grad_norm": 2.434016227722168, + "learning_rate": 1.311985018726592e-05, + "loss": 0.3266, + "step": 1840 + }, + { + "epoch": 10.44, + "grad_norm": 1.953553318977356, + "learning_rate": 1.304494382022472e-05, + "loss": 0.2844, + "step": 1860 + }, + { + "epoch": 10.55, + "grad_norm": 2.5946218967437744, + "learning_rate": 1.2970037453183522e-05, + "loss": 0.3225, + "step": 1880 + }, + { + "epoch": 10.66, + "grad_norm": 2.5305733680725098, + "learning_rate": 
1.2895131086142323e-05, + "loss": 0.3183, + "step": 1900 + }, + { + "epoch": 10.78, + "grad_norm": 3.56726336479187, + "learning_rate": 1.2820224719101125e-05, + "loss": 0.2944, + "step": 1920 + }, + { + "epoch": 10.89, + "grad_norm": 1.9687740802764893, + "learning_rate": 1.2745318352059926e-05, + "loss": 0.3411, + "step": 1940 + }, + { + "epoch": 11.0, + "grad_norm": 1.6027730703353882, + "learning_rate": 1.2670411985018727e-05, + "loss": 0.2949, + "step": 1960 + }, + { + "epoch": 11.11, + "grad_norm": 1.8739397525787354, + "learning_rate": 1.2595505617977529e-05, + "loss": 0.2716, + "step": 1980 + }, + { + "epoch": 11.22, + "grad_norm": 1.6741198301315308, + "learning_rate": 1.2520599250936332e-05, + "loss": 0.3334, + "step": 2000 + } + ], + "logging_steps": 20, + "max_steps": 5340, + "num_input_tokens_seen": 0, + "num_train_epochs": 30, + "save_steps": 500, + "total_flos": 5.1982555742208e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2000/training_args.bin b/checkpoint-2000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7c33ce269deb9707d9fc7879dba5d8878e16b09a --- /dev/null +++ b/checkpoint-2000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f06c12b5819a4aa76501d6843eae3aabbe49b1a33a2903a44bc34146ab4a74b6 +size 4920 diff --git a/checkpoint-2500/README.md b/checkpoint-2500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..eb638a279dc16df665e7cd491c7ffb8571ba6cd7 --- /dev/null +++ b/checkpoint-2500/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: sallywww/Llama-7B +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
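Because this repository also keeps intermediate `checkpoint-*` directories (adapter weights plus optimizer, scheduler, and RNG state), one option is to load the adapter from a specific checkpoint and, if a standalone model is wanted, fold the LoRA updates into the base weights. A hedged sketch assuming a recent `peft` release; the checkpoint and output paths are placeholders.

```python
# Sketch: load the adapter from a saved checkpoint, then merge the LoRA weights
# into the base model so it can be served without peft installed.
from transformers import AutoModelForCausalLM
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("sallywww/Llama-7B")
model = PeftModel.from_pretrained(base, "checkpoint-2000")  # any checkpoint dir

merged = model.merge_and_unload()          # bake the low-rank updates in
merged.save_pretrained("llama-7b-merged")  # placeholder output directory
```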
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.1.dev0 \ No newline at end of file diff --git a/checkpoint-2500/adapter_config.json b/checkpoint-2500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5c344d276888d0aeec2576a19010354f5a57d9c9 --- /dev/null +++ b/checkpoint-2500/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "sallywww/Llama-7B", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2500/adapter_model.safetensors b/checkpoint-2500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e2c6be01a304f8b3efac6aada10ec350ea789939 --- /dev/null +++ b/checkpoint-2500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9005fc8855ef82e0939ba2c5e9d394a3798e9e2a90631842d83fd2f682115df5 +size 16794200 diff --git a/checkpoint-2500/optimizer.pt b/checkpoint-2500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..04640ec216388ae902c4bd310bfc57e6dcf090d6 --- /dev/null +++ b/checkpoint-2500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e4995116a25cb7af3c66a7498845928a2a695730f9181770f442a671859729a4 +size 33662074 diff --git a/checkpoint-2500/rng_state.pth b/checkpoint-2500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f21fa710a015e0c651122ced42fb2a779ff63be2 --- /dev/null +++ b/checkpoint-2500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50829e25f3322bf5790ba82dc96c90b4d645abb278258bff05672cd8b4dbdbad +size 14244 diff --git a/checkpoint-2500/scheduler.pt b/checkpoint-2500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..caff93b1258008b0a62cfc2dcc7c2fb22112c5a2 --- /dev/null +++ b/checkpoint-2500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:637026167e8df72379bd688d232a13472d5bffb23ec7620f9c608d613fc966b9 +size 1064 diff --git a/checkpoint-2500/trainer_state.json b/checkpoint-2500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..eeaef273e4238457a45b4609447a8f676c65a000 --- /dev/null +++ b/checkpoint-2500/trainer_state.json @@ -0,0 +1,896 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 14.03016485443704, + "eval_steps": 500, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.11, + "grad_norm": 0.5909375548362732, + "learning_rate": 1.9932584269662923e-05, + "loss": 2.0237, + "step": 20 + }, + { + "epoch": 0.22, + "grad_norm": 0.5826025009155273, + "learning_rate": 1.9857677902621722e-05, + "loss": 1.9306, + "step": 40 + }, + { + "epoch": 0.34, + "grad_norm": 0.5491089820861816, + "learning_rate": 1.9782771535580525e-05, + "loss": 1.7959, + "step": 60 + }, + { + "epoch": 0.45, + "grad_norm": 1.362810730934143, + "learning_rate": 1.970786516853933e-05, + "loss": 1.6599, + "step": 80 + }, + { + "epoch": 0.56, + "grad_norm": 1.4427486658096313, + "learning_rate": 1.963295880149813e-05, + "loss": 1.5685, + "step": 100 + }, + { + "epoch": 0.67, + "grad_norm": 0.9993659257888794, + "learning_rate": 1.956179775280899e-05, + "loss": 1.4621, + "step": 120 + }, + { + "epoch": 0.79, + "grad_norm": 1.614562749862671, + "learning_rate": 1.9486891385767793e-05, + "loss": 1.31, + "step": 140 + }, + { + "epoch": 0.9, + "grad_norm": 1.1975798606872559, + "learning_rate": 1.9411985018726593e-05, + "loss": 1.2322, + "step": 160 + }, + { + "epoch": 1.01, + "grad_norm": 0.7684128880500793, + "learning_rate": 1.9337078651685396e-05, + "loss": 1.1361, + "step": 180 + }, + { + "epoch": 1.12, + "grad_norm": 0.9336960911750793, + "learning_rate": 1.9262172284644195e-05, + "loss": 1.0797, + "step": 200 + }, + { + "epoch": 1.23, + "grad_norm": 0.8471770882606506, + "learning_rate": 1.9187265917603e-05, + "loss": 1.0368, + "step": 220 + }, + { + "epoch": 1.35, + "grad_norm": 1.111340045928955, + "learning_rate": 1.9112359550561798e-05, + "loss": 0.9738, + "step": 240 + }, + { + "epoch": 1.46, + "grad_norm": 0.8093781471252441, + "learning_rate": 1.90374531835206e-05, + "loss": 0.9494, + "step": 260 + }, + { + "epoch": 1.57, + "grad_norm": 0.8438062071800232, + "learning_rate": 1.89625468164794e-05, + "loss": 0.9276, + "step": 280 + }, + { + "epoch": 1.68, + "grad_norm": 0.9896701574325562, + "learning_rate": 1.8887640449438204e-05, + "loss": 0.8656, + "step": 300 + }, + { + "epoch": 1.8, + "grad_norm": 0.8278244137763977, + "learning_rate": 1.8812734082397007e-05, + "loss": 0.8431, + "step": 320 + }, + { + "epoch": 1.91, + 
"grad_norm": 0.931291937828064, + "learning_rate": 1.8737827715355807e-05, + "loss": 0.7945, + "step": 340 + }, + { + "epoch": 2.02, + "grad_norm": 1.21769380569458, + "learning_rate": 1.866292134831461e-05, + "loss": 0.7647, + "step": 360 + }, + { + "epoch": 2.13, + "grad_norm": 3.5183286666870117, + "learning_rate": 1.858801498127341e-05, + "loss": 0.7497, + "step": 380 + }, + { + "epoch": 2.24, + "grad_norm": 1.1153030395507812, + "learning_rate": 1.8513108614232212e-05, + "loss": 0.7507, + "step": 400 + }, + { + "epoch": 2.36, + "grad_norm": 1.0140526294708252, + "learning_rate": 1.8438202247191012e-05, + "loss": 0.7415, + "step": 420 + }, + { + "epoch": 2.47, + "grad_norm": 1.4395232200622559, + "learning_rate": 1.8363295880149815e-05, + "loss": 0.6947, + "step": 440 + }, + { + "epoch": 2.58, + "grad_norm": 1.4253089427947998, + "learning_rate": 1.8288389513108615e-05, + "loss": 0.7429, + "step": 460 + }, + { + "epoch": 2.69, + "grad_norm": 1.3152351379394531, + "learning_rate": 1.8213483146067418e-05, + "loss": 0.7363, + "step": 480 + }, + { + "epoch": 2.81, + "grad_norm": 2.5935957431793213, + "learning_rate": 1.8138576779026217e-05, + "loss": 0.6486, + "step": 500 + }, + { + "epoch": 2.92, + "grad_norm": 3.929158926010132, + "learning_rate": 1.806367041198502e-05, + "loss": 0.6395, + "step": 520 + }, + { + "epoch": 3.03, + "grad_norm": 1.7316572666168213, + "learning_rate": 1.7988764044943823e-05, + "loss": 0.664, + "step": 540 + }, + { + "epoch": 3.14, + "grad_norm": 1.3388841152191162, + "learning_rate": 1.7913857677902623e-05, + "loss": 0.6469, + "step": 560 + }, + { + "epoch": 3.25, + "grad_norm": 1.5258549451828003, + "learning_rate": 1.7838951310861426e-05, + "loss": 0.6662, + "step": 580 + }, + { + "epoch": 3.37, + "grad_norm": 1.5486094951629639, + "learning_rate": 1.7764044943820226e-05, + "loss": 0.566, + "step": 600 + }, + { + "epoch": 3.48, + "grad_norm": 1.5657902956008911, + "learning_rate": 1.768913857677903e-05, + "loss": 0.6166, + "step": 620 + }, + { + "epoch": 3.59, + "grad_norm": 1.5971391201019287, + "learning_rate": 1.761423220973783e-05, + "loss": 0.5973, + "step": 640 + }, + { + "epoch": 3.7, + "grad_norm": 1.333030343055725, + "learning_rate": 1.753932584269663e-05, + "loss": 0.6117, + "step": 660 + }, + { + "epoch": 3.82, + "grad_norm": 1.4425445795059204, + "learning_rate": 1.746441947565543e-05, + "loss": 0.5702, + "step": 680 + }, + { + "epoch": 3.93, + "grad_norm": 1.4773032665252686, + "learning_rate": 1.7389513108614234e-05, + "loss": 0.5465, + "step": 700 + }, + { + "epoch": 4.04, + "grad_norm": 1.3328267335891724, + "learning_rate": 1.7314606741573034e-05, + "loss": 0.5379, + "step": 720 + }, + { + "epoch": 4.15, + "grad_norm": 1.6961455345153809, + "learning_rate": 1.7239700374531837e-05, + "loss": 0.5492, + "step": 740 + }, + { + "epoch": 4.27, + "grad_norm": 1.4636189937591553, + "learning_rate": 1.7164794007490637e-05, + "loss": 0.547, + "step": 760 + }, + { + "epoch": 4.38, + "grad_norm": 2.1686649322509766, + "learning_rate": 1.708988764044944e-05, + "loss": 0.5424, + "step": 780 + }, + { + "epoch": 4.49, + "grad_norm": 1.219388723373413, + "learning_rate": 1.7014981273408243e-05, + "loss": 0.5373, + "step": 800 + }, + { + "epoch": 4.6, + "grad_norm": 1.5566452741622925, + "learning_rate": 1.6940074906367042e-05, + "loss": 0.4944, + "step": 820 + }, + { + "epoch": 4.71, + "grad_norm": 1.598917841911316, + "learning_rate": 1.6865168539325845e-05, + "loss": 0.5036, + "step": 840 + }, + { + "epoch": 4.83, + "grad_norm": 1.5281039476394653, + 
"learning_rate": 1.6790262172284645e-05, + "loss": 0.5215, + "step": 860 + }, + { + "epoch": 4.94, + "grad_norm": 1.7123130559921265, + "learning_rate": 1.6715355805243448e-05, + "loss": 0.5362, + "step": 880 + }, + { + "epoch": 5.05, + "grad_norm": 1.543447732925415, + "learning_rate": 1.6640449438202248e-05, + "loss": 0.5379, + "step": 900 + }, + { + "epoch": 5.16, + "grad_norm": 2.4190192222595215, + "learning_rate": 1.656554307116105e-05, + "loss": 0.4921, + "step": 920 + }, + { + "epoch": 5.28, + "grad_norm": 2.190906047821045, + "learning_rate": 1.649063670411985e-05, + "loss": 0.4652, + "step": 940 + }, + { + "epoch": 5.39, + "grad_norm": 2.113476514816284, + "learning_rate": 1.6415730337078653e-05, + "loss": 0.4914, + "step": 960 + }, + { + "epoch": 5.5, + "grad_norm": 1.8785656690597534, + "learning_rate": 1.6340823970037453e-05, + "loss": 0.5135, + "step": 980 + }, + { + "epoch": 5.61, + "grad_norm": 1.3745977878570557, + "learning_rate": 1.6265917602996256e-05, + "loss": 0.4697, + "step": 1000 + }, + { + "epoch": 5.72, + "grad_norm": 1.7874308824539185, + "learning_rate": 1.6191011235955056e-05, + "loss": 0.4625, + "step": 1020 + }, + { + "epoch": 5.84, + "grad_norm": 1.4448940753936768, + "learning_rate": 1.611610486891386e-05, + "loss": 0.4764, + "step": 1040 + }, + { + "epoch": 5.95, + "grad_norm": 2.278655767440796, + "learning_rate": 1.6041198501872662e-05, + "loss": 0.4221, + "step": 1060 + }, + { + "epoch": 6.06, + "grad_norm": 1.8602409362792969, + "learning_rate": 1.596629213483146e-05, + "loss": 0.4731, + "step": 1080 + }, + { + "epoch": 6.17, + "grad_norm": 1.884373426437378, + "learning_rate": 1.5891385767790265e-05, + "loss": 0.4241, + "step": 1100 + }, + { + "epoch": 6.29, + "grad_norm": 2.0259287357330322, + "learning_rate": 1.5816479400749064e-05, + "loss": 0.4368, + "step": 1120 + }, + { + "epoch": 6.4, + "grad_norm": 1.812462329864502, + "learning_rate": 1.5741573033707867e-05, + "loss": 0.442, + "step": 1140 + }, + { + "epoch": 6.51, + "grad_norm": 1.934327483177185, + "learning_rate": 1.5666666666666667e-05, + "loss": 0.4195, + "step": 1160 + }, + { + "epoch": 6.62, + "grad_norm": 1.6152955293655396, + "learning_rate": 1.559176029962547e-05, + "loss": 0.4374, + "step": 1180 + }, + { + "epoch": 6.73, + "grad_norm": 2.7782068252563477, + "learning_rate": 1.551685393258427e-05, + "loss": 0.4231, + "step": 1200 + }, + { + "epoch": 6.85, + "grad_norm": 2.372976303100586, + "learning_rate": 1.5441947565543073e-05, + "loss": 0.444, + "step": 1220 + }, + { + "epoch": 6.96, + "grad_norm": 2.171353816986084, + "learning_rate": 1.5367041198501872e-05, + "loss": 0.4389, + "step": 1240 + }, + { + "epoch": 7.07, + "grad_norm": 1.3093984127044678, + "learning_rate": 1.5292134831460675e-05, + "loss": 0.4301, + "step": 1260 + }, + { + "epoch": 7.18, + "grad_norm": 2.267932176589966, + "learning_rate": 1.5217228464419478e-05, + "loss": 0.4046, + "step": 1280 + }, + { + "epoch": 7.3, + "grad_norm": 1.5326164960861206, + "learning_rate": 1.514232209737828e-05, + "loss": 0.4068, + "step": 1300 + }, + { + "epoch": 7.41, + "grad_norm": 3.1525979042053223, + "learning_rate": 1.5067415730337081e-05, + "loss": 0.3847, + "step": 1320 + }, + { + "epoch": 7.52, + "grad_norm": 2.081890106201172, + "learning_rate": 1.4992509363295882e-05, + "loss": 0.4126, + "step": 1340 + }, + { + "epoch": 7.63, + "grad_norm": 2.5701358318328857, + "learning_rate": 1.4917602996254684e-05, + "loss": 0.4065, + "step": 1360 + }, + { + "epoch": 7.74, + "grad_norm": 1.4190051555633545, + "learning_rate": 
1.4842696629213485e-05, + "loss": 0.3979, + "step": 1380 + }, + { + "epoch": 7.86, + "grad_norm": 1.9085837602615356, + "learning_rate": 1.4767790262172286e-05, + "loss": 0.3894, + "step": 1400 + }, + { + "epoch": 7.97, + "grad_norm": 1.7573003768920898, + "learning_rate": 1.4692883895131088e-05, + "loss": 0.3751, + "step": 1420 + }, + { + "epoch": 8.08, + "grad_norm": 1.8974506855010986, + "learning_rate": 1.4617977528089889e-05, + "loss": 0.3936, + "step": 1440 + }, + { + "epoch": 8.19, + "grad_norm": 1.3843660354614258, + "learning_rate": 1.454307116104869e-05, + "loss": 0.3848, + "step": 1460 + }, + { + "epoch": 8.31, + "grad_norm": 1.525007724761963, + "learning_rate": 1.4468164794007492e-05, + "loss": 0.3552, + "step": 1480 + }, + { + "epoch": 8.42, + "grad_norm": 2.1665101051330566, + "learning_rate": 1.4393258426966291e-05, + "loss": 0.3547, + "step": 1500 + }, + { + "epoch": 8.53, + "grad_norm": 3.3614535331726074, + "learning_rate": 1.4318352059925096e-05, + "loss": 0.3771, + "step": 1520 + }, + { + "epoch": 8.64, + "grad_norm": 1.746299386024475, + "learning_rate": 1.4243445692883898e-05, + "loss": 0.396, + "step": 1540 + }, + { + "epoch": 8.75, + "grad_norm": 1.9144684076309204, + "learning_rate": 1.4168539325842699e-05, + "loss": 0.3748, + "step": 1560 + }, + { + "epoch": 8.87, + "grad_norm": 1.9617277383804321, + "learning_rate": 1.40936329588015e-05, + "loss": 0.3504, + "step": 1580 + }, + { + "epoch": 8.98, + "grad_norm": 2.69067645072937, + "learning_rate": 1.4018726591760302e-05, + "loss": 0.3477, + "step": 1600 + }, + { + "epoch": 9.09, + "grad_norm": 2.142008066177368, + "learning_rate": 1.3943820224719103e-05, + "loss": 0.3539, + "step": 1620 + }, + { + "epoch": 9.2, + "grad_norm": 1.7684266567230225, + "learning_rate": 1.3868913857677904e-05, + "loss": 0.3576, + "step": 1640 + }, + { + "epoch": 9.32, + "grad_norm": 1.4222275018692017, + "learning_rate": 1.3794007490636706e-05, + "loss": 0.3839, + "step": 1660 + }, + { + "epoch": 9.43, + "grad_norm": 2.0622501373291016, + "learning_rate": 1.3719101123595507e-05, + "loss": 0.3278, + "step": 1680 + }, + { + "epoch": 9.54, + "grad_norm": 1.639147400856018, + "learning_rate": 1.3644194756554308e-05, + "loss": 0.3374, + "step": 1700 + }, + { + "epoch": 9.65, + "grad_norm": 2.093045473098755, + "learning_rate": 1.356928838951311e-05, + "loss": 0.3535, + "step": 1720 + }, + { + "epoch": 9.76, + "grad_norm": 1.3492937088012695, + "learning_rate": 1.3494382022471911e-05, + "loss": 0.3105, + "step": 1740 + }, + { + "epoch": 9.88, + "grad_norm": 1.585205316543579, + "learning_rate": 1.3419475655430714e-05, + "loss": 0.3181, + "step": 1760 + }, + { + "epoch": 9.99, + "grad_norm": 2.8895344734191895, + "learning_rate": 1.3344569288389515e-05, + "loss": 0.3473, + "step": 1780 + }, + { + "epoch": 10.1, + "grad_norm": 1.7224748134613037, + "learning_rate": 1.3269662921348317e-05, + "loss": 0.3524, + "step": 1800 + }, + { + "epoch": 10.21, + "grad_norm": 2.1029868125915527, + "learning_rate": 1.3194756554307118e-05, + "loss": 0.3408, + "step": 1820 + }, + { + "epoch": 10.33, + "grad_norm": 2.434016227722168, + "learning_rate": 1.311985018726592e-05, + "loss": 0.3266, + "step": 1840 + }, + { + "epoch": 10.44, + "grad_norm": 1.953553318977356, + "learning_rate": 1.304494382022472e-05, + "loss": 0.2844, + "step": 1860 + }, + { + "epoch": 10.55, + "grad_norm": 2.5946218967437744, + "learning_rate": 1.2970037453183522e-05, + "loss": 0.3225, + "step": 1880 + }, + { + "epoch": 10.66, + "grad_norm": 2.5305733680725098, + "learning_rate": 
1.2895131086142323e-05, + "loss": 0.3183, + "step": 1900 + }, + { + "epoch": 10.78, + "grad_norm": 3.56726336479187, + "learning_rate": 1.2820224719101125e-05, + "loss": 0.2944, + "step": 1920 + }, + { + "epoch": 10.89, + "grad_norm": 1.9687740802764893, + "learning_rate": 1.2745318352059926e-05, + "loss": 0.3411, + "step": 1940 + }, + { + "epoch": 11.0, + "grad_norm": 1.6027730703353882, + "learning_rate": 1.2670411985018727e-05, + "loss": 0.2949, + "step": 1960 + }, + { + "epoch": 11.11, + "grad_norm": 1.8739397525787354, + "learning_rate": 1.2595505617977529e-05, + "loss": 0.2716, + "step": 1980 + }, + { + "epoch": 11.22, + "grad_norm": 1.6741198301315308, + "learning_rate": 1.2520599250936332e-05, + "loss": 0.3334, + "step": 2000 + }, + { + "epoch": 11.34, + "grad_norm": 1.950945496559143, + "learning_rate": 1.2445692883895133e-05, + "loss": 0.3291, + "step": 2020 + }, + { + "epoch": 11.45, + "grad_norm": 1.9362170696258545, + "learning_rate": 1.2370786516853935e-05, + "loss": 0.2716, + "step": 2040 + }, + { + "epoch": 11.56, + "grad_norm": 1.6201746463775635, + "learning_rate": 1.2295880149812736e-05, + "loss": 0.2893, + "step": 2060 + }, + { + "epoch": 11.67, + "grad_norm": 3.488088607788086, + "learning_rate": 1.2220973782771537e-05, + "loss": 0.3239, + "step": 2080 + }, + { + "epoch": 11.79, + "grad_norm": 2.4608683586120605, + "learning_rate": 1.2146067415730339e-05, + "loss": 0.271, + "step": 2100 + }, + { + "epoch": 11.9, + "grad_norm": 1.5321098566055298, + "learning_rate": 1.207116104868914e-05, + "loss": 0.2876, + "step": 2120 + }, + { + "epoch": 12.01, + "grad_norm": 1.8334771394729614, + "learning_rate": 1.1996254681647941e-05, + "loss": 0.3066, + "step": 2140 + }, + { + "epoch": 12.12, + "grad_norm": 1.9506254196166992, + "learning_rate": 1.1921348314606743e-05, + "loss": 0.3023, + "step": 2160 + }, + { + "epoch": 12.23, + "grad_norm": 2.9073598384857178, + "learning_rate": 1.1846441947565544e-05, + "loss": 0.3152, + "step": 2180 + }, + { + "epoch": 12.35, + "grad_norm": 1.6023261547088623, + "learning_rate": 1.1771535580524345e-05, + "loss": 0.248, + "step": 2200 + }, + { + "epoch": 12.46, + "grad_norm": 1.7954633235931396, + "learning_rate": 1.1696629213483147e-05, + "loss": 0.2666, + "step": 2220 + }, + { + "epoch": 12.57, + "grad_norm": 2.0331828594207764, + "learning_rate": 1.162172284644195e-05, + "loss": 0.2878, + "step": 2240 + }, + { + "epoch": 12.68, + "grad_norm": 1.656420350074768, + "learning_rate": 1.1546816479400751e-05, + "loss": 0.2805, + "step": 2260 + }, + { + "epoch": 12.8, + "grad_norm": 1.5245873928070068, + "learning_rate": 1.1471910112359552e-05, + "loss": 0.2792, + "step": 2280 + }, + { + "epoch": 12.91, + "grad_norm": 2.6713974475860596, + "learning_rate": 1.1397003745318354e-05, + "loss": 0.2841, + "step": 2300 + }, + { + "epoch": 13.02, + "grad_norm": 1.268479347229004, + "learning_rate": 1.1322097378277155e-05, + "loss": 0.2708, + "step": 2320 + }, + { + "epoch": 13.13, + "grad_norm": 2.2990434169769287, + "learning_rate": 1.1247191011235956e-05, + "loss": 0.2649, + "step": 2340 + }, + { + "epoch": 13.24, + "grad_norm": 2.351956367492676, + "learning_rate": 1.1172284644194758e-05, + "loss": 0.281, + "step": 2360 + }, + { + "epoch": 13.36, + "grad_norm": 1.796783208847046, + "learning_rate": 1.1097378277153559e-05, + "loss": 0.2725, + "step": 2380 + }, + { + "epoch": 13.47, + "grad_norm": 1.7035847902297974, + "learning_rate": 1.102247191011236e-05, + "loss": 0.2799, + "step": 2400 + }, + { + "epoch": 13.58, + "grad_norm": 2.0395431518554688, + 
"learning_rate": 1.0947565543071162e-05, + "loss": 0.239, + "step": 2420 + }, + { + "epoch": 13.69, + "grad_norm": 1.8008232116699219, + "learning_rate": 1.0872659176029963e-05, + "loss": 0.2553, + "step": 2440 + }, + { + "epoch": 13.81, + "grad_norm": 2.0559043884277344, + "learning_rate": 1.0797752808988765e-05, + "loss": 0.2464, + "step": 2460 + }, + { + "epoch": 13.92, + "grad_norm": 1.8673292398452759, + "learning_rate": 1.0722846441947568e-05, + "loss": 0.2699, + "step": 2480 + }, + { + "epoch": 14.03, + "grad_norm": 1.6819398403167725, + "learning_rate": 1.0647940074906369e-05, + "loss": 0.2566, + "step": 2500 + } + ], + "logging_steps": 20, + "max_steps": 5340, + "num_input_tokens_seen": 0, + "num_train_epochs": 30, + "save_steps": 500, + "total_flos": 6.497819467776e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2500/training_args.bin b/checkpoint-2500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7c33ce269deb9707d9fc7879dba5d8878e16b09a --- /dev/null +++ b/checkpoint-2500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f06c12b5819a4aa76501d6843eae3aabbe49b1a33a2903a44bc34146ab4a74b6 +size 4920 diff --git a/checkpoint-3000/README.md b/checkpoint-3000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..eb638a279dc16df665e7cd491c7ffb8571ba6cd7 --- /dev/null +++ b/checkpoint-3000/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: sallywww/Llama-7B +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.1.dev0 \ No newline at end of file diff --git a/checkpoint-3000/adapter_config.json b/checkpoint-3000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5c344d276888d0aeec2576a19010354f5a57d9c9 --- /dev/null +++ b/checkpoint-3000/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "sallywww/Llama-7B", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-3000/adapter_model.safetensors b/checkpoint-3000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4261a2d455b9ddb764a3951229c2e68faf5cff6c --- /dev/null +++ b/checkpoint-3000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed1867fa64146b7b1ce7f756adf3b65e3d5adcdd6664bd820c5c6e46b350d624 +size 16794200 diff --git a/checkpoint-3000/optimizer.pt b/checkpoint-3000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ccf48edfdba819137dd6d71a59b529415cd32ea --- /dev/null +++ b/checkpoint-3000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:102c476762ab67e211043125d773688e484fe06593bef2ddd08f32903d6f8ae7 +size 33662074 diff --git a/checkpoint-3000/rng_state.pth b/checkpoint-3000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b46ec14b45637dda7787284f50732ca86ac0ba7e --- /dev/null +++ b/checkpoint-3000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5be1ea5df0fc5af5b70e3dd590bb085c9796985015fca4adc95bc5206f3cc904 +size 14244 diff --git a/checkpoint-3000/scheduler.pt b/checkpoint-3000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..61f74ef52621c2c29af0ccff8273eacf52d332bd --- /dev/null +++ b/checkpoint-3000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd29ac10819a273a981a5d43f9590bffbc82cfcc94b53e85ed43cb1e875328c1 +size 1064 diff --git a/checkpoint-3000/trainer_state.json b/checkpoint-3000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..761d9ae4600728a8eaf80180dcda13ea1e731f20 --- /dev/null +++ b/checkpoint-3000/trainer_state.json @@ -0,0 +1,1071 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 16.836197825324447, + "eval_steps": 500, + "global_step": 3000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.11, + "grad_norm": 0.5909375548362732, + "learning_rate": 1.9932584269662923e-05, + "loss": 2.0237, + "step": 20 + }, + { + "epoch": 0.22, + "grad_norm": 0.5826025009155273, + "learning_rate": 1.9857677902621722e-05, + "loss": 1.9306, + "step": 40 + }, + { + "epoch": 0.34, + "grad_norm": 0.5491089820861816, + "learning_rate": 1.9782771535580525e-05, + "loss": 1.7959, + "step": 60 + }, + { + "epoch": 0.45, + "grad_norm": 1.362810730934143, + "learning_rate": 1.970786516853933e-05, + "loss": 1.6599, + "step": 80 + }, + { + "epoch": 0.56, + "grad_norm": 1.4427486658096313, + "learning_rate": 1.963295880149813e-05, + "loss": 1.5685, + "step": 100 + }, + { + "epoch": 0.67, + "grad_norm": 0.9993659257888794, + "learning_rate": 1.956179775280899e-05, + "loss": 1.4621, + "step": 120 + }, + { + "epoch": 0.79, + "grad_norm": 1.614562749862671, + "learning_rate": 1.9486891385767793e-05, + "loss": 1.31, + "step": 140 + }, + { + "epoch": 0.9, + "grad_norm": 1.1975798606872559, + "learning_rate": 1.9411985018726593e-05, + "loss": 1.2322, + "step": 160 + }, + { + "epoch": 1.01, + "grad_norm": 0.7684128880500793, + "learning_rate": 1.9337078651685396e-05, + "loss": 1.1361, + "step": 180 + }, + { + "epoch": 1.12, + "grad_norm": 0.9336960911750793, + "learning_rate": 1.9262172284644195e-05, + "loss": 1.0797, + "step": 200 + }, + { + "epoch": 1.23, + "grad_norm": 0.8471770882606506, + "learning_rate": 1.9187265917603e-05, + "loss": 1.0368, + "step": 220 + }, + { + "epoch": 1.35, + "grad_norm": 1.111340045928955, + "learning_rate": 1.9112359550561798e-05, + "loss": 0.9738, + "step": 240 + }, + { + "epoch": 1.46, + "grad_norm": 0.8093781471252441, + "learning_rate": 1.90374531835206e-05, + "loss": 0.9494, + "step": 260 + }, + { + "epoch": 1.57, + "grad_norm": 0.8438062071800232, + "learning_rate": 1.89625468164794e-05, + "loss": 0.9276, + "step": 280 + }, + { + "epoch": 1.68, + "grad_norm": 0.9896701574325562, + "learning_rate": 1.8887640449438204e-05, + "loss": 0.8656, + "step": 300 + }, + { + "epoch": 1.8, + "grad_norm": 0.8278244137763977, + "learning_rate": 1.8812734082397007e-05, + "loss": 0.8431, + "step": 320 + }, + { + "epoch": 1.91, + 
"grad_norm": 0.931291937828064, + "learning_rate": 1.8737827715355807e-05, + "loss": 0.7945, + "step": 340 + }, + { + "epoch": 2.02, + "grad_norm": 1.21769380569458, + "learning_rate": 1.866292134831461e-05, + "loss": 0.7647, + "step": 360 + }, + { + "epoch": 2.13, + "grad_norm": 3.5183286666870117, + "learning_rate": 1.858801498127341e-05, + "loss": 0.7497, + "step": 380 + }, + { + "epoch": 2.24, + "grad_norm": 1.1153030395507812, + "learning_rate": 1.8513108614232212e-05, + "loss": 0.7507, + "step": 400 + }, + { + "epoch": 2.36, + "grad_norm": 1.0140526294708252, + "learning_rate": 1.8438202247191012e-05, + "loss": 0.7415, + "step": 420 + }, + { + "epoch": 2.47, + "grad_norm": 1.4395232200622559, + "learning_rate": 1.8363295880149815e-05, + "loss": 0.6947, + "step": 440 + }, + { + "epoch": 2.58, + "grad_norm": 1.4253089427947998, + "learning_rate": 1.8288389513108615e-05, + "loss": 0.7429, + "step": 460 + }, + { + "epoch": 2.69, + "grad_norm": 1.3152351379394531, + "learning_rate": 1.8213483146067418e-05, + "loss": 0.7363, + "step": 480 + }, + { + "epoch": 2.81, + "grad_norm": 2.5935957431793213, + "learning_rate": 1.8138576779026217e-05, + "loss": 0.6486, + "step": 500 + }, + { + "epoch": 2.92, + "grad_norm": 3.929158926010132, + "learning_rate": 1.806367041198502e-05, + "loss": 0.6395, + "step": 520 + }, + { + "epoch": 3.03, + "grad_norm": 1.7316572666168213, + "learning_rate": 1.7988764044943823e-05, + "loss": 0.664, + "step": 540 + }, + { + "epoch": 3.14, + "grad_norm": 1.3388841152191162, + "learning_rate": 1.7913857677902623e-05, + "loss": 0.6469, + "step": 560 + }, + { + "epoch": 3.25, + "grad_norm": 1.5258549451828003, + "learning_rate": 1.7838951310861426e-05, + "loss": 0.6662, + "step": 580 + }, + { + "epoch": 3.37, + "grad_norm": 1.5486094951629639, + "learning_rate": 1.7764044943820226e-05, + "loss": 0.566, + "step": 600 + }, + { + "epoch": 3.48, + "grad_norm": 1.5657902956008911, + "learning_rate": 1.768913857677903e-05, + "loss": 0.6166, + "step": 620 + }, + { + "epoch": 3.59, + "grad_norm": 1.5971391201019287, + "learning_rate": 1.761423220973783e-05, + "loss": 0.5973, + "step": 640 + }, + { + "epoch": 3.7, + "grad_norm": 1.333030343055725, + "learning_rate": 1.753932584269663e-05, + "loss": 0.6117, + "step": 660 + }, + { + "epoch": 3.82, + "grad_norm": 1.4425445795059204, + "learning_rate": 1.746441947565543e-05, + "loss": 0.5702, + "step": 680 + }, + { + "epoch": 3.93, + "grad_norm": 1.4773032665252686, + "learning_rate": 1.7389513108614234e-05, + "loss": 0.5465, + "step": 700 + }, + { + "epoch": 4.04, + "grad_norm": 1.3328267335891724, + "learning_rate": 1.7314606741573034e-05, + "loss": 0.5379, + "step": 720 + }, + { + "epoch": 4.15, + "grad_norm": 1.6961455345153809, + "learning_rate": 1.7239700374531837e-05, + "loss": 0.5492, + "step": 740 + }, + { + "epoch": 4.27, + "grad_norm": 1.4636189937591553, + "learning_rate": 1.7164794007490637e-05, + "loss": 0.547, + "step": 760 + }, + { + "epoch": 4.38, + "grad_norm": 2.1686649322509766, + "learning_rate": 1.708988764044944e-05, + "loss": 0.5424, + "step": 780 + }, + { + "epoch": 4.49, + "grad_norm": 1.219388723373413, + "learning_rate": 1.7014981273408243e-05, + "loss": 0.5373, + "step": 800 + }, + { + "epoch": 4.6, + "grad_norm": 1.5566452741622925, + "learning_rate": 1.6940074906367042e-05, + "loss": 0.4944, + "step": 820 + }, + { + "epoch": 4.71, + "grad_norm": 1.598917841911316, + "learning_rate": 1.6865168539325845e-05, + "loss": 0.5036, + "step": 840 + }, + { + "epoch": 4.83, + "grad_norm": 1.5281039476394653, + 
"learning_rate": 1.6790262172284645e-05, + "loss": 0.5215, + "step": 860 + }, + { + "epoch": 4.94, + "grad_norm": 1.7123130559921265, + "learning_rate": 1.6715355805243448e-05, + "loss": 0.5362, + "step": 880 + }, + { + "epoch": 5.05, + "grad_norm": 1.543447732925415, + "learning_rate": 1.6640449438202248e-05, + "loss": 0.5379, + "step": 900 + }, + { + "epoch": 5.16, + "grad_norm": 2.4190192222595215, + "learning_rate": 1.656554307116105e-05, + "loss": 0.4921, + "step": 920 + }, + { + "epoch": 5.28, + "grad_norm": 2.190906047821045, + "learning_rate": 1.649063670411985e-05, + "loss": 0.4652, + "step": 940 + }, + { + "epoch": 5.39, + "grad_norm": 2.113476514816284, + "learning_rate": 1.6415730337078653e-05, + "loss": 0.4914, + "step": 960 + }, + { + "epoch": 5.5, + "grad_norm": 1.8785656690597534, + "learning_rate": 1.6340823970037453e-05, + "loss": 0.5135, + "step": 980 + }, + { + "epoch": 5.61, + "grad_norm": 1.3745977878570557, + "learning_rate": 1.6265917602996256e-05, + "loss": 0.4697, + "step": 1000 + }, + { + "epoch": 5.72, + "grad_norm": 1.7874308824539185, + "learning_rate": 1.6191011235955056e-05, + "loss": 0.4625, + "step": 1020 + }, + { + "epoch": 5.84, + "grad_norm": 1.4448940753936768, + "learning_rate": 1.611610486891386e-05, + "loss": 0.4764, + "step": 1040 + }, + { + "epoch": 5.95, + "grad_norm": 2.278655767440796, + "learning_rate": 1.6041198501872662e-05, + "loss": 0.4221, + "step": 1060 + }, + { + "epoch": 6.06, + "grad_norm": 1.8602409362792969, + "learning_rate": 1.596629213483146e-05, + "loss": 0.4731, + "step": 1080 + }, + { + "epoch": 6.17, + "grad_norm": 1.884373426437378, + "learning_rate": 1.5891385767790265e-05, + "loss": 0.4241, + "step": 1100 + }, + { + "epoch": 6.29, + "grad_norm": 2.0259287357330322, + "learning_rate": 1.5816479400749064e-05, + "loss": 0.4368, + "step": 1120 + }, + { + "epoch": 6.4, + "grad_norm": 1.812462329864502, + "learning_rate": 1.5741573033707867e-05, + "loss": 0.442, + "step": 1140 + }, + { + "epoch": 6.51, + "grad_norm": 1.934327483177185, + "learning_rate": 1.5666666666666667e-05, + "loss": 0.4195, + "step": 1160 + }, + { + "epoch": 6.62, + "grad_norm": 1.6152955293655396, + "learning_rate": 1.559176029962547e-05, + "loss": 0.4374, + "step": 1180 + }, + { + "epoch": 6.73, + "grad_norm": 2.7782068252563477, + "learning_rate": 1.551685393258427e-05, + "loss": 0.4231, + "step": 1200 + }, + { + "epoch": 6.85, + "grad_norm": 2.372976303100586, + "learning_rate": 1.5441947565543073e-05, + "loss": 0.444, + "step": 1220 + }, + { + "epoch": 6.96, + "grad_norm": 2.171353816986084, + "learning_rate": 1.5367041198501872e-05, + "loss": 0.4389, + "step": 1240 + }, + { + "epoch": 7.07, + "grad_norm": 1.3093984127044678, + "learning_rate": 1.5292134831460675e-05, + "loss": 0.4301, + "step": 1260 + }, + { + "epoch": 7.18, + "grad_norm": 2.267932176589966, + "learning_rate": 1.5217228464419478e-05, + "loss": 0.4046, + "step": 1280 + }, + { + "epoch": 7.3, + "grad_norm": 1.5326164960861206, + "learning_rate": 1.514232209737828e-05, + "loss": 0.4068, + "step": 1300 + }, + { + "epoch": 7.41, + "grad_norm": 3.1525979042053223, + "learning_rate": 1.5067415730337081e-05, + "loss": 0.3847, + "step": 1320 + }, + { + "epoch": 7.52, + "grad_norm": 2.081890106201172, + "learning_rate": 1.4992509363295882e-05, + "loss": 0.4126, + "step": 1340 + }, + { + "epoch": 7.63, + "grad_norm": 2.5701358318328857, + "learning_rate": 1.4917602996254684e-05, + "loss": 0.4065, + "step": 1360 + }, + { + "epoch": 7.74, + "grad_norm": 1.4190051555633545, + "learning_rate": 
1.4842696629213485e-05, + "loss": 0.3979, + "step": 1380 + }, + { + "epoch": 7.86, + "grad_norm": 1.9085837602615356, + "learning_rate": 1.4767790262172286e-05, + "loss": 0.3894, + "step": 1400 + }, + { + "epoch": 7.97, + "grad_norm": 1.7573003768920898, + "learning_rate": 1.4692883895131088e-05, + "loss": 0.3751, + "step": 1420 + }, + { + "epoch": 8.08, + "grad_norm": 1.8974506855010986, + "learning_rate": 1.4617977528089889e-05, + "loss": 0.3936, + "step": 1440 + }, + { + "epoch": 8.19, + "grad_norm": 1.3843660354614258, + "learning_rate": 1.454307116104869e-05, + "loss": 0.3848, + "step": 1460 + }, + { + "epoch": 8.31, + "grad_norm": 1.525007724761963, + "learning_rate": 1.4468164794007492e-05, + "loss": 0.3552, + "step": 1480 + }, + { + "epoch": 8.42, + "grad_norm": 2.1665101051330566, + "learning_rate": 1.4393258426966291e-05, + "loss": 0.3547, + "step": 1500 + }, + { + "epoch": 8.53, + "grad_norm": 3.3614535331726074, + "learning_rate": 1.4318352059925096e-05, + "loss": 0.3771, + "step": 1520 + }, + { + "epoch": 8.64, + "grad_norm": 1.746299386024475, + "learning_rate": 1.4243445692883898e-05, + "loss": 0.396, + "step": 1540 + }, + { + "epoch": 8.75, + "grad_norm": 1.9144684076309204, + "learning_rate": 1.4168539325842699e-05, + "loss": 0.3748, + "step": 1560 + }, + { + "epoch": 8.87, + "grad_norm": 1.9617277383804321, + "learning_rate": 1.40936329588015e-05, + "loss": 0.3504, + "step": 1580 + }, + { + "epoch": 8.98, + "grad_norm": 2.69067645072937, + "learning_rate": 1.4018726591760302e-05, + "loss": 0.3477, + "step": 1600 + }, + { + "epoch": 9.09, + "grad_norm": 2.142008066177368, + "learning_rate": 1.3943820224719103e-05, + "loss": 0.3539, + "step": 1620 + }, + { + "epoch": 9.2, + "grad_norm": 1.7684266567230225, + "learning_rate": 1.3868913857677904e-05, + "loss": 0.3576, + "step": 1640 + }, + { + "epoch": 9.32, + "grad_norm": 1.4222275018692017, + "learning_rate": 1.3794007490636706e-05, + "loss": 0.3839, + "step": 1660 + }, + { + "epoch": 9.43, + "grad_norm": 2.0622501373291016, + "learning_rate": 1.3719101123595507e-05, + "loss": 0.3278, + "step": 1680 + }, + { + "epoch": 9.54, + "grad_norm": 1.639147400856018, + "learning_rate": 1.3644194756554308e-05, + "loss": 0.3374, + "step": 1700 + }, + { + "epoch": 9.65, + "grad_norm": 2.093045473098755, + "learning_rate": 1.356928838951311e-05, + "loss": 0.3535, + "step": 1720 + }, + { + "epoch": 9.76, + "grad_norm": 1.3492937088012695, + "learning_rate": 1.3494382022471911e-05, + "loss": 0.3105, + "step": 1740 + }, + { + "epoch": 9.88, + "grad_norm": 1.585205316543579, + "learning_rate": 1.3419475655430714e-05, + "loss": 0.3181, + "step": 1760 + }, + { + "epoch": 9.99, + "grad_norm": 2.8895344734191895, + "learning_rate": 1.3344569288389515e-05, + "loss": 0.3473, + "step": 1780 + }, + { + "epoch": 10.1, + "grad_norm": 1.7224748134613037, + "learning_rate": 1.3269662921348317e-05, + "loss": 0.3524, + "step": 1800 + }, + { + "epoch": 10.21, + "grad_norm": 2.1029868125915527, + "learning_rate": 1.3194756554307118e-05, + "loss": 0.3408, + "step": 1820 + }, + { + "epoch": 10.33, + "grad_norm": 2.434016227722168, + "learning_rate": 1.311985018726592e-05, + "loss": 0.3266, + "step": 1840 + }, + { + "epoch": 10.44, + "grad_norm": 1.953553318977356, + "learning_rate": 1.304494382022472e-05, + "loss": 0.2844, + "step": 1860 + }, + { + "epoch": 10.55, + "grad_norm": 2.5946218967437744, + "learning_rate": 1.2970037453183522e-05, + "loss": 0.3225, + "step": 1880 + }, + { + "epoch": 10.66, + "grad_norm": 2.5305733680725098, + "learning_rate": 
1.2895131086142323e-05, + "loss": 0.3183, + "step": 1900 + }, + { + "epoch": 10.78, + "grad_norm": 3.56726336479187, + "learning_rate": 1.2820224719101125e-05, + "loss": 0.2944, + "step": 1920 + }, + { + "epoch": 10.89, + "grad_norm": 1.9687740802764893, + "learning_rate": 1.2745318352059926e-05, + "loss": 0.3411, + "step": 1940 + }, + { + "epoch": 11.0, + "grad_norm": 1.6027730703353882, + "learning_rate": 1.2670411985018727e-05, + "loss": 0.2949, + "step": 1960 + }, + { + "epoch": 11.11, + "grad_norm": 1.8739397525787354, + "learning_rate": 1.2595505617977529e-05, + "loss": 0.2716, + "step": 1980 + }, + { + "epoch": 11.22, + "grad_norm": 1.6741198301315308, + "learning_rate": 1.2520599250936332e-05, + "loss": 0.3334, + "step": 2000 + }, + { + "epoch": 11.34, + "grad_norm": 1.950945496559143, + "learning_rate": 1.2445692883895133e-05, + "loss": 0.3291, + "step": 2020 + }, + { + "epoch": 11.45, + "grad_norm": 1.9362170696258545, + "learning_rate": 1.2370786516853935e-05, + "loss": 0.2716, + "step": 2040 + }, + { + "epoch": 11.56, + "grad_norm": 1.6201746463775635, + "learning_rate": 1.2295880149812736e-05, + "loss": 0.2893, + "step": 2060 + }, + { + "epoch": 11.67, + "grad_norm": 3.488088607788086, + "learning_rate": 1.2220973782771537e-05, + "loss": 0.3239, + "step": 2080 + }, + { + "epoch": 11.79, + "grad_norm": 2.4608683586120605, + "learning_rate": 1.2146067415730339e-05, + "loss": 0.271, + "step": 2100 + }, + { + "epoch": 11.9, + "grad_norm": 1.5321098566055298, + "learning_rate": 1.207116104868914e-05, + "loss": 0.2876, + "step": 2120 + }, + { + "epoch": 12.01, + "grad_norm": 1.8334771394729614, + "learning_rate": 1.1996254681647941e-05, + "loss": 0.3066, + "step": 2140 + }, + { + "epoch": 12.12, + "grad_norm": 1.9506254196166992, + "learning_rate": 1.1921348314606743e-05, + "loss": 0.3023, + "step": 2160 + }, + { + "epoch": 12.23, + "grad_norm": 2.9073598384857178, + "learning_rate": 1.1846441947565544e-05, + "loss": 0.3152, + "step": 2180 + }, + { + "epoch": 12.35, + "grad_norm": 1.6023261547088623, + "learning_rate": 1.1771535580524345e-05, + "loss": 0.248, + "step": 2200 + }, + { + "epoch": 12.46, + "grad_norm": 1.7954633235931396, + "learning_rate": 1.1696629213483147e-05, + "loss": 0.2666, + "step": 2220 + }, + { + "epoch": 12.57, + "grad_norm": 2.0331828594207764, + "learning_rate": 1.162172284644195e-05, + "loss": 0.2878, + "step": 2240 + }, + { + "epoch": 12.68, + "grad_norm": 1.656420350074768, + "learning_rate": 1.1546816479400751e-05, + "loss": 0.2805, + "step": 2260 + }, + { + "epoch": 12.8, + "grad_norm": 1.5245873928070068, + "learning_rate": 1.1471910112359552e-05, + "loss": 0.2792, + "step": 2280 + }, + { + "epoch": 12.91, + "grad_norm": 2.6713974475860596, + "learning_rate": 1.1397003745318354e-05, + "loss": 0.2841, + "step": 2300 + }, + { + "epoch": 13.02, + "grad_norm": 1.268479347229004, + "learning_rate": 1.1322097378277155e-05, + "loss": 0.2708, + "step": 2320 + }, + { + "epoch": 13.13, + "grad_norm": 2.2990434169769287, + "learning_rate": 1.1247191011235956e-05, + "loss": 0.2649, + "step": 2340 + }, + { + "epoch": 13.24, + "grad_norm": 2.351956367492676, + "learning_rate": 1.1172284644194758e-05, + "loss": 0.281, + "step": 2360 + }, + { + "epoch": 13.36, + "grad_norm": 1.796783208847046, + "learning_rate": 1.1097378277153559e-05, + "loss": 0.2725, + "step": 2380 + }, + { + "epoch": 13.47, + "grad_norm": 1.7035847902297974, + "learning_rate": 1.102247191011236e-05, + "loss": 0.2799, + "step": 2400 + }, + { + "epoch": 13.58, + "grad_norm": 2.0395431518554688, + 
"learning_rate": 1.0947565543071162e-05, + "loss": 0.239, + "step": 2420 + }, + { + "epoch": 13.69, + "grad_norm": 1.8008232116699219, + "learning_rate": 1.0872659176029963e-05, + "loss": 0.2553, + "step": 2440 + }, + { + "epoch": 13.81, + "grad_norm": 2.0559043884277344, + "learning_rate": 1.0797752808988765e-05, + "loss": 0.2464, + "step": 2460 + }, + { + "epoch": 13.92, + "grad_norm": 1.8673292398452759, + "learning_rate": 1.0722846441947568e-05, + "loss": 0.2699, + "step": 2480 + }, + { + "epoch": 14.03, + "grad_norm": 1.6819398403167725, + "learning_rate": 1.0647940074906369e-05, + "loss": 0.2566, + "step": 2500 + }, + { + "epoch": 14.14, + "grad_norm": 1.9703686237335205, + "learning_rate": 1.057303370786517e-05, + "loss": 0.2807, + "step": 2520 + }, + { + "epoch": 14.25, + "grad_norm": 2.028834819793701, + "learning_rate": 1.0498127340823972e-05, + "loss": 0.2392, + "step": 2540 + }, + { + "epoch": 14.37, + "grad_norm": 2.2455177307128906, + "learning_rate": 1.0423220973782773e-05, + "loss": 0.247, + "step": 2560 + }, + { + "epoch": 14.48, + "grad_norm": 1.8078291416168213, + "learning_rate": 1.0348314606741574e-05, + "loss": 0.2552, + "step": 2580 + }, + { + "epoch": 14.59, + "grad_norm": 2.166729211807251, + "learning_rate": 1.0273408239700376e-05, + "loss": 0.2466, + "step": 2600 + }, + { + "epoch": 14.7, + "grad_norm": 2.710556745529175, + "learning_rate": 1.0198501872659177e-05, + "loss": 0.2506, + "step": 2620 + }, + { + "epoch": 14.82, + "grad_norm": 2.1344659328460693, + "learning_rate": 1.0123595505617978e-05, + "loss": 0.2388, + "step": 2640 + }, + { + "epoch": 14.93, + "grad_norm": 1.595842719078064, + "learning_rate": 1.004868913857678e-05, + "loss": 0.2553, + "step": 2660 + }, + { + "epoch": 15.04, + "grad_norm": 1.5458731651306152, + "learning_rate": 9.973782771535581e-06, + "loss": 0.2478, + "step": 2680 + }, + { + "epoch": 15.15, + "grad_norm": 1.9514356851577759, + "learning_rate": 9.898876404494382e-06, + "loss": 0.234, + "step": 2700 + }, + { + "epoch": 15.26, + "grad_norm": 2.1551694869995117, + "learning_rate": 9.823970037453184e-06, + "loss": 0.251, + "step": 2720 + }, + { + "epoch": 15.38, + "grad_norm": 2.08258318901062, + "learning_rate": 9.749063670411985e-06, + "loss": 0.2511, + "step": 2740 + }, + { + "epoch": 15.49, + "grad_norm": 1.581690788269043, + "learning_rate": 9.674157303370786e-06, + "loss": 0.2185, + "step": 2760 + }, + { + "epoch": 15.6, + "grad_norm": 2.2121975421905518, + "learning_rate": 9.599250936329588e-06, + "loss": 0.2161, + "step": 2780 + }, + { + "epoch": 15.71, + "grad_norm": 1.5077215433120728, + "learning_rate": 9.52434456928839e-06, + "loss": 0.2308, + "step": 2800 + }, + { + "epoch": 15.83, + "grad_norm": 2.57951021194458, + "learning_rate": 9.449438202247192e-06, + "loss": 0.2299, + "step": 2820 + }, + { + "epoch": 15.94, + "grad_norm": 1.6634414196014404, + "learning_rate": 9.374531835205993e-06, + "loss": 0.2576, + "step": 2840 + }, + { + "epoch": 16.05, + "grad_norm": 1.9692113399505615, + "learning_rate": 9.299625468164795e-06, + "loss": 0.2395, + "step": 2860 + }, + { + "epoch": 16.16, + "grad_norm": 1.9327415227890015, + "learning_rate": 9.224719101123596e-06, + "loss": 0.241, + "step": 2880 + }, + { + "epoch": 16.27, + "grad_norm": 1.7675727605819702, + "learning_rate": 9.149812734082398e-06, + "loss": 0.2201, + "step": 2900 + }, + { + "epoch": 16.39, + "grad_norm": 1.9511345624923706, + "learning_rate": 9.074906367041199e-06, + "loss": 0.2171, + "step": 2920 + }, + { + "epoch": 16.5, + "grad_norm": 1.7937383651733398, + 
"learning_rate": 9e-06, + "loss": 0.2286, + "step": 2940 + }, + { + "epoch": 16.61, + "grad_norm": 1.79076087474823, + "learning_rate": 8.925093632958802e-06, + "loss": 0.2479, + "step": 2960 + }, + { + "epoch": 16.72, + "grad_norm": 2.4045145511627197, + "learning_rate": 8.850187265917603e-06, + "loss": 0.2153, + "step": 2980 + }, + { + "epoch": 16.84, + "grad_norm": 2.1934499740600586, + "learning_rate": 8.775280898876404e-06, + "loss": 0.2361, + "step": 3000 + } + ], + "logging_steps": 20, + "max_steps": 5340, + "num_input_tokens_seen": 0, + "num_train_epochs": 30, + "save_steps": 500, + "total_flos": 7.7973833613312e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3000/training_args.bin b/checkpoint-3000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7c33ce269deb9707d9fc7879dba5d8878e16b09a --- /dev/null +++ b/checkpoint-3000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f06c12b5819a4aa76501d6843eae3aabbe49b1a33a2903a44bc34146ab4a74b6 +size 4920 diff --git a/checkpoint-3500/README.md b/checkpoint-3500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..eb638a279dc16df665e7cd491c7ffb8571ba6cd7 --- /dev/null +++ b/checkpoint-3500/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: sallywww/Llama-7B +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.1.dev0 \ No newline at end of file diff --git a/checkpoint-3500/adapter_config.json b/checkpoint-3500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5c344d276888d0aeec2576a19010354f5a57d9c9 --- /dev/null +++ b/checkpoint-3500/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "sallywww/Llama-7B", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-3500/adapter_model.safetensors b/checkpoint-3500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fde586dcd4d15378d802607da6cecc493211f3cd --- /dev/null +++ b/checkpoint-3500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:579a43d26a6a19d21c576c5316a79d4f42e833e6891a97c9c1e99e0db8fa6631 +size 16794200 diff --git a/checkpoint-3500/optimizer.pt b/checkpoint-3500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f54872ec2fc24c7a6889f4b8156df0f4cce5fa5c --- /dev/null +++ b/checkpoint-3500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94ea1c695c77353d53125d18efe53a5c98033f75aa4bb43fd4661b3d9f1f7683 +size 33662074 diff --git a/checkpoint-3500/rng_state.pth b/checkpoint-3500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..eaf6ba9e91dd7dd2786ad0ffb93813087fad67d7 --- /dev/null +++ b/checkpoint-3500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36312752df00181a36fce8a132c5e65726626dfaa46115098e729752ec3f2007 +size 14244 diff --git a/checkpoint-3500/scheduler.pt b/checkpoint-3500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..94cf001d622fc1ed30bb537529d5bea415985d80 --- /dev/null +++ b/checkpoint-3500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34747f1b847b210032a267ca34d4f4848d59d492b90b28f1381ca49e0b518b5b +size 1064 diff --git a/checkpoint-3500/trainer_state.json b/checkpoint-3500/trainer_state.json new file mode 100644 index 
0000000000000000000000000000000000000000..84f128f6ec93dc80cd8d12e283c6081c9964a4fe --- /dev/null +++ b/checkpoint-3500/trainer_state.json @@ -0,0 +1,1246 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 19.642230796211855, + "eval_steps": 500, + "global_step": 3500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.11, + "grad_norm": 0.5909375548362732, + "learning_rate": 1.9932584269662923e-05, + "loss": 2.0237, + "step": 20 + }, + { + "epoch": 0.22, + "grad_norm": 0.5826025009155273, + "learning_rate": 1.9857677902621722e-05, + "loss": 1.9306, + "step": 40 + }, + { + "epoch": 0.34, + "grad_norm": 0.5491089820861816, + "learning_rate": 1.9782771535580525e-05, + "loss": 1.7959, + "step": 60 + }, + { + "epoch": 0.45, + "grad_norm": 1.362810730934143, + "learning_rate": 1.970786516853933e-05, + "loss": 1.6599, + "step": 80 + }, + { + "epoch": 0.56, + "grad_norm": 1.4427486658096313, + "learning_rate": 1.963295880149813e-05, + "loss": 1.5685, + "step": 100 + }, + { + "epoch": 0.67, + "grad_norm": 0.9993659257888794, + "learning_rate": 1.956179775280899e-05, + "loss": 1.4621, + "step": 120 + }, + { + "epoch": 0.79, + "grad_norm": 1.614562749862671, + "learning_rate": 1.9486891385767793e-05, + "loss": 1.31, + "step": 140 + }, + { + "epoch": 0.9, + "grad_norm": 1.1975798606872559, + "learning_rate": 1.9411985018726593e-05, + "loss": 1.2322, + "step": 160 + }, + { + "epoch": 1.01, + "grad_norm": 0.7684128880500793, + "learning_rate": 1.9337078651685396e-05, + "loss": 1.1361, + "step": 180 + }, + { + "epoch": 1.12, + "grad_norm": 0.9336960911750793, + "learning_rate": 1.9262172284644195e-05, + "loss": 1.0797, + "step": 200 + }, + { + "epoch": 1.23, + "grad_norm": 0.8471770882606506, + "learning_rate": 1.9187265917603e-05, + "loss": 1.0368, + "step": 220 + }, + { + "epoch": 1.35, + "grad_norm": 1.111340045928955, + "learning_rate": 1.9112359550561798e-05, + "loss": 0.9738, + "step": 240 + }, + { + "epoch": 1.46, + "grad_norm": 0.8093781471252441, + "learning_rate": 1.90374531835206e-05, + "loss": 0.9494, + "step": 260 + }, + { + "epoch": 1.57, + "grad_norm": 0.8438062071800232, + "learning_rate": 1.89625468164794e-05, + "loss": 0.9276, + "step": 280 + }, + { + "epoch": 1.68, + "grad_norm": 0.9896701574325562, + "learning_rate": 1.8887640449438204e-05, + "loss": 0.8656, + "step": 300 + }, + { + "epoch": 1.8, + "grad_norm": 0.8278244137763977, + "learning_rate": 1.8812734082397007e-05, + "loss": 0.8431, + "step": 320 + }, + { + "epoch": 1.91, + "grad_norm": 0.931291937828064, + "learning_rate": 1.8737827715355807e-05, + "loss": 0.7945, + "step": 340 + }, + { + "epoch": 2.02, + "grad_norm": 1.21769380569458, + "learning_rate": 1.866292134831461e-05, + "loss": 0.7647, + "step": 360 + }, + { + "epoch": 2.13, + "grad_norm": 3.5183286666870117, + "learning_rate": 1.858801498127341e-05, + "loss": 0.7497, + "step": 380 + }, + { + "epoch": 2.24, + "grad_norm": 1.1153030395507812, + "learning_rate": 1.8513108614232212e-05, + "loss": 0.7507, + "step": 400 + }, + { + "epoch": 2.36, + "grad_norm": 1.0140526294708252, + "learning_rate": 1.8438202247191012e-05, + "loss": 0.7415, + "step": 420 + }, + { + "epoch": 2.47, + "grad_norm": 1.4395232200622559, + "learning_rate": 1.8363295880149815e-05, + "loss": 0.6947, + "step": 440 + }, + { + "epoch": 2.58, + "grad_norm": 1.4253089427947998, + "learning_rate": 1.8288389513108615e-05, + "loss": 0.7429, + "step": 460 + }, + { + "epoch": 2.69, + "grad_norm": 
1.3152351379394531, + "learning_rate": 1.8213483146067418e-05, + "loss": 0.7363, + "step": 480 + }, + { + "epoch": 2.81, + "grad_norm": 2.5935957431793213, + "learning_rate": 1.8138576779026217e-05, + "loss": 0.6486, + "step": 500 + }, + { + "epoch": 2.92, + "grad_norm": 3.929158926010132, + "learning_rate": 1.806367041198502e-05, + "loss": 0.6395, + "step": 520 + }, + { + "epoch": 3.03, + "grad_norm": 1.7316572666168213, + "learning_rate": 1.7988764044943823e-05, + "loss": 0.664, + "step": 540 + }, + { + "epoch": 3.14, + "grad_norm": 1.3388841152191162, + "learning_rate": 1.7913857677902623e-05, + "loss": 0.6469, + "step": 560 + }, + { + "epoch": 3.25, + "grad_norm": 1.5258549451828003, + "learning_rate": 1.7838951310861426e-05, + "loss": 0.6662, + "step": 580 + }, + { + "epoch": 3.37, + "grad_norm": 1.5486094951629639, + "learning_rate": 1.7764044943820226e-05, + "loss": 0.566, + "step": 600 + }, + { + "epoch": 3.48, + "grad_norm": 1.5657902956008911, + "learning_rate": 1.768913857677903e-05, + "loss": 0.6166, + "step": 620 + }, + { + "epoch": 3.59, + "grad_norm": 1.5971391201019287, + "learning_rate": 1.761423220973783e-05, + "loss": 0.5973, + "step": 640 + }, + { + "epoch": 3.7, + "grad_norm": 1.333030343055725, + "learning_rate": 1.753932584269663e-05, + "loss": 0.6117, + "step": 660 + }, + { + "epoch": 3.82, + "grad_norm": 1.4425445795059204, + "learning_rate": 1.746441947565543e-05, + "loss": 0.5702, + "step": 680 + }, + { + "epoch": 3.93, + "grad_norm": 1.4773032665252686, + "learning_rate": 1.7389513108614234e-05, + "loss": 0.5465, + "step": 700 + }, + { + "epoch": 4.04, + "grad_norm": 1.3328267335891724, + "learning_rate": 1.7314606741573034e-05, + "loss": 0.5379, + "step": 720 + }, + { + "epoch": 4.15, + "grad_norm": 1.6961455345153809, + "learning_rate": 1.7239700374531837e-05, + "loss": 0.5492, + "step": 740 + }, + { + "epoch": 4.27, + "grad_norm": 1.4636189937591553, + "learning_rate": 1.7164794007490637e-05, + "loss": 0.547, + "step": 760 + }, + { + "epoch": 4.38, + "grad_norm": 2.1686649322509766, + "learning_rate": 1.708988764044944e-05, + "loss": 0.5424, + "step": 780 + }, + { + "epoch": 4.49, + "grad_norm": 1.219388723373413, + "learning_rate": 1.7014981273408243e-05, + "loss": 0.5373, + "step": 800 + }, + { + "epoch": 4.6, + "grad_norm": 1.5566452741622925, + "learning_rate": 1.6940074906367042e-05, + "loss": 0.4944, + "step": 820 + }, + { + "epoch": 4.71, + "grad_norm": 1.598917841911316, + "learning_rate": 1.6865168539325845e-05, + "loss": 0.5036, + "step": 840 + }, + { + "epoch": 4.83, + "grad_norm": 1.5281039476394653, + "learning_rate": 1.6790262172284645e-05, + "loss": 0.5215, + "step": 860 + }, + { + "epoch": 4.94, + "grad_norm": 1.7123130559921265, + "learning_rate": 1.6715355805243448e-05, + "loss": 0.5362, + "step": 880 + }, + { + "epoch": 5.05, + "grad_norm": 1.543447732925415, + "learning_rate": 1.6640449438202248e-05, + "loss": 0.5379, + "step": 900 + }, + { + "epoch": 5.16, + "grad_norm": 2.4190192222595215, + "learning_rate": 1.656554307116105e-05, + "loss": 0.4921, + "step": 920 + }, + { + "epoch": 5.28, + "grad_norm": 2.190906047821045, + "learning_rate": 1.649063670411985e-05, + "loss": 0.4652, + "step": 940 + }, + { + "epoch": 5.39, + "grad_norm": 2.113476514816284, + "learning_rate": 1.6415730337078653e-05, + "loss": 0.4914, + "step": 960 + }, + { + "epoch": 5.5, + "grad_norm": 1.8785656690597534, + "learning_rate": 1.6340823970037453e-05, + "loss": 0.5135, + "step": 980 + }, + { + "epoch": 5.61, + "grad_norm": 1.3745977878570557, + "learning_rate": 
1.6265917602996256e-05, + "loss": 0.4697, + "step": 1000 + }, + { + "epoch": 5.72, + "grad_norm": 1.7874308824539185, + "learning_rate": 1.6191011235955056e-05, + "loss": 0.4625, + "step": 1020 + }, + { + "epoch": 5.84, + "grad_norm": 1.4448940753936768, + "learning_rate": 1.611610486891386e-05, + "loss": 0.4764, + "step": 1040 + }, + { + "epoch": 5.95, + "grad_norm": 2.278655767440796, + "learning_rate": 1.6041198501872662e-05, + "loss": 0.4221, + "step": 1060 + }, + { + "epoch": 6.06, + "grad_norm": 1.8602409362792969, + "learning_rate": 1.596629213483146e-05, + "loss": 0.4731, + "step": 1080 + }, + { + "epoch": 6.17, + "grad_norm": 1.884373426437378, + "learning_rate": 1.5891385767790265e-05, + "loss": 0.4241, + "step": 1100 + }, + { + "epoch": 6.29, + "grad_norm": 2.0259287357330322, + "learning_rate": 1.5816479400749064e-05, + "loss": 0.4368, + "step": 1120 + }, + { + "epoch": 6.4, + "grad_norm": 1.812462329864502, + "learning_rate": 1.5741573033707867e-05, + "loss": 0.442, + "step": 1140 + }, + { + "epoch": 6.51, + "grad_norm": 1.934327483177185, + "learning_rate": 1.5666666666666667e-05, + "loss": 0.4195, + "step": 1160 + }, + { + "epoch": 6.62, + "grad_norm": 1.6152955293655396, + "learning_rate": 1.559176029962547e-05, + "loss": 0.4374, + "step": 1180 + }, + { + "epoch": 6.73, + "grad_norm": 2.7782068252563477, + "learning_rate": 1.551685393258427e-05, + "loss": 0.4231, + "step": 1200 + }, + { + "epoch": 6.85, + "grad_norm": 2.372976303100586, + "learning_rate": 1.5441947565543073e-05, + "loss": 0.444, + "step": 1220 + }, + { + "epoch": 6.96, + "grad_norm": 2.171353816986084, + "learning_rate": 1.5367041198501872e-05, + "loss": 0.4389, + "step": 1240 + }, + { + "epoch": 7.07, + "grad_norm": 1.3093984127044678, + "learning_rate": 1.5292134831460675e-05, + "loss": 0.4301, + "step": 1260 + }, + { + "epoch": 7.18, + "grad_norm": 2.267932176589966, + "learning_rate": 1.5217228464419478e-05, + "loss": 0.4046, + "step": 1280 + }, + { + "epoch": 7.3, + "grad_norm": 1.5326164960861206, + "learning_rate": 1.514232209737828e-05, + "loss": 0.4068, + "step": 1300 + }, + { + "epoch": 7.41, + "grad_norm": 3.1525979042053223, + "learning_rate": 1.5067415730337081e-05, + "loss": 0.3847, + "step": 1320 + }, + { + "epoch": 7.52, + "grad_norm": 2.081890106201172, + "learning_rate": 1.4992509363295882e-05, + "loss": 0.4126, + "step": 1340 + }, + { + "epoch": 7.63, + "grad_norm": 2.5701358318328857, + "learning_rate": 1.4917602996254684e-05, + "loss": 0.4065, + "step": 1360 + }, + { + "epoch": 7.74, + "grad_norm": 1.4190051555633545, + "learning_rate": 1.4842696629213485e-05, + "loss": 0.3979, + "step": 1380 + }, + { + "epoch": 7.86, + "grad_norm": 1.9085837602615356, + "learning_rate": 1.4767790262172286e-05, + "loss": 0.3894, + "step": 1400 + }, + { + "epoch": 7.97, + "grad_norm": 1.7573003768920898, + "learning_rate": 1.4692883895131088e-05, + "loss": 0.3751, + "step": 1420 + }, + { + "epoch": 8.08, + "grad_norm": 1.8974506855010986, + "learning_rate": 1.4617977528089889e-05, + "loss": 0.3936, + "step": 1440 + }, + { + "epoch": 8.19, + "grad_norm": 1.3843660354614258, + "learning_rate": 1.454307116104869e-05, + "loss": 0.3848, + "step": 1460 + }, + { + "epoch": 8.31, + "grad_norm": 1.525007724761963, + "learning_rate": 1.4468164794007492e-05, + "loss": 0.3552, + "step": 1480 + }, + { + "epoch": 8.42, + "grad_norm": 2.1665101051330566, + "learning_rate": 1.4393258426966291e-05, + "loss": 0.3547, + "step": 1500 + }, + { + "epoch": 8.53, + "grad_norm": 3.3614535331726074, + "learning_rate": 
1.4318352059925096e-05, + "loss": 0.3771, + "step": 1520 + }, + { + "epoch": 8.64, + "grad_norm": 1.746299386024475, + "learning_rate": 1.4243445692883898e-05, + "loss": 0.396, + "step": 1540 + }, + { + "epoch": 8.75, + "grad_norm": 1.9144684076309204, + "learning_rate": 1.4168539325842699e-05, + "loss": 0.3748, + "step": 1560 + }, + { + "epoch": 8.87, + "grad_norm": 1.9617277383804321, + "learning_rate": 1.40936329588015e-05, + "loss": 0.3504, + "step": 1580 + }, + { + "epoch": 8.98, + "grad_norm": 2.69067645072937, + "learning_rate": 1.4018726591760302e-05, + "loss": 0.3477, + "step": 1600 + }, + { + "epoch": 9.09, + "grad_norm": 2.142008066177368, + "learning_rate": 1.3943820224719103e-05, + "loss": 0.3539, + "step": 1620 + }, + { + "epoch": 9.2, + "grad_norm": 1.7684266567230225, + "learning_rate": 1.3868913857677904e-05, + "loss": 0.3576, + "step": 1640 + }, + { + "epoch": 9.32, + "grad_norm": 1.4222275018692017, + "learning_rate": 1.3794007490636706e-05, + "loss": 0.3839, + "step": 1660 + }, + { + "epoch": 9.43, + "grad_norm": 2.0622501373291016, + "learning_rate": 1.3719101123595507e-05, + "loss": 0.3278, + "step": 1680 + }, + { + "epoch": 9.54, + "grad_norm": 1.639147400856018, + "learning_rate": 1.3644194756554308e-05, + "loss": 0.3374, + "step": 1700 + }, + { + "epoch": 9.65, + "grad_norm": 2.093045473098755, + "learning_rate": 1.356928838951311e-05, + "loss": 0.3535, + "step": 1720 + }, + { + "epoch": 9.76, + "grad_norm": 1.3492937088012695, + "learning_rate": 1.3494382022471911e-05, + "loss": 0.3105, + "step": 1740 + }, + { + "epoch": 9.88, + "grad_norm": 1.585205316543579, + "learning_rate": 1.3419475655430714e-05, + "loss": 0.3181, + "step": 1760 + }, + { + "epoch": 9.99, + "grad_norm": 2.8895344734191895, + "learning_rate": 1.3344569288389515e-05, + "loss": 0.3473, + "step": 1780 + }, + { + "epoch": 10.1, + "grad_norm": 1.7224748134613037, + "learning_rate": 1.3269662921348317e-05, + "loss": 0.3524, + "step": 1800 + }, + { + "epoch": 10.21, + "grad_norm": 2.1029868125915527, + "learning_rate": 1.3194756554307118e-05, + "loss": 0.3408, + "step": 1820 + }, + { + "epoch": 10.33, + "grad_norm": 2.434016227722168, + "learning_rate": 1.311985018726592e-05, + "loss": 0.3266, + "step": 1840 + }, + { + "epoch": 10.44, + "grad_norm": 1.953553318977356, + "learning_rate": 1.304494382022472e-05, + "loss": 0.2844, + "step": 1860 + }, + { + "epoch": 10.55, + "grad_norm": 2.5946218967437744, + "learning_rate": 1.2970037453183522e-05, + "loss": 0.3225, + "step": 1880 + }, + { + "epoch": 10.66, + "grad_norm": 2.5305733680725098, + "learning_rate": 1.2895131086142323e-05, + "loss": 0.3183, + "step": 1900 + }, + { + "epoch": 10.78, + "grad_norm": 3.56726336479187, + "learning_rate": 1.2820224719101125e-05, + "loss": 0.2944, + "step": 1920 + }, + { + "epoch": 10.89, + "grad_norm": 1.9687740802764893, + "learning_rate": 1.2745318352059926e-05, + "loss": 0.3411, + "step": 1940 + }, + { + "epoch": 11.0, + "grad_norm": 1.6027730703353882, + "learning_rate": 1.2670411985018727e-05, + "loss": 0.2949, + "step": 1960 + }, + { + "epoch": 11.11, + "grad_norm": 1.8739397525787354, + "learning_rate": 1.2595505617977529e-05, + "loss": 0.2716, + "step": 1980 + }, + { + "epoch": 11.22, + "grad_norm": 1.6741198301315308, + "learning_rate": 1.2520599250936332e-05, + "loss": 0.3334, + "step": 2000 + }, + { + "epoch": 11.34, + "grad_norm": 1.950945496559143, + "learning_rate": 1.2445692883895133e-05, + "loss": 0.3291, + "step": 2020 + }, + { + "epoch": 11.45, + "grad_norm": 1.9362170696258545, + "learning_rate": 
1.2370786516853935e-05, + "loss": 0.2716, + "step": 2040 + }, + { + "epoch": 11.56, + "grad_norm": 1.6201746463775635, + "learning_rate": 1.2295880149812736e-05, + "loss": 0.2893, + "step": 2060 + }, + { + "epoch": 11.67, + "grad_norm": 3.488088607788086, + "learning_rate": 1.2220973782771537e-05, + "loss": 0.3239, + "step": 2080 + }, + { + "epoch": 11.79, + "grad_norm": 2.4608683586120605, + "learning_rate": 1.2146067415730339e-05, + "loss": 0.271, + "step": 2100 + }, + { + "epoch": 11.9, + "grad_norm": 1.5321098566055298, + "learning_rate": 1.207116104868914e-05, + "loss": 0.2876, + "step": 2120 + }, + { + "epoch": 12.01, + "grad_norm": 1.8334771394729614, + "learning_rate": 1.1996254681647941e-05, + "loss": 0.3066, + "step": 2140 + }, + { + "epoch": 12.12, + "grad_norm": 1.9506254196166992, + "learning_rate": 1.1921348314606743e-05, + "loss": 0.3023, + "step": 2160 + }, + { + "epoch": 12.23, + "grad_norm": 2.9073598384857178, + "learning_rate": 1.1846441947565544e-05, + "loss": 0.3152, + "step": 2180 + }, + { + "epoch": 12.35, + "grad_norm": 1.6023261547088623, + "learning_rate": 1.1771535580524345e-05, + "loss": 0.248, + "step": 2200 + }, + { + "epoch": 12.46, + "grad_norm": 1.7954633235931396, + "learning_rate": 1.1696629213483147e-05, + "loss": 0.2666, + "step": 2220 + }, + { + "epoch": 12.57, + "grad_norm": 2.0331828594207764, + "learning_rate": 1.162172284644195e-05, + "loss": 0.2878, + "step": 2240 + }, + { + "epoch": 12.68, + "grad_norm": 1.656420350074768, + "learning_rate": 1.1546816479400751e-05, + "loss": 0.2805, + "step": 2260 + }, + { + "epoch": 12.8, + "grad_norm": 1.5245873928070068, + "learning_rate": 1.1471910112359552e-05, + "loss": 0.2792, + "step": 2280 + }, + { + "epoch": 12.91, + "grad_norm": 2.6713974475860596, + "learning_rate": 1.1397003745318354e-05, + "loss": 0.2841, + "step": 2300 + }, + { + "epoch": 13.02, + "grad_norm": 1.268479347229004, + "learning_rate": 1.1322097378277155e-05, + "loss": 0.2708, + "step": 2320 + }, + { + "epoch": 13.13, + "grad_norm": 2.2990434169769287, + "learning_rate": 1.1247191011235956e-05, + "loss": 0.2649, + "step": 2340 + }, + { + "epoch": 13.24, + "grad_norm": 2.351956367492676, + "learning_rate": 1.1172284644194758e-05, + "loss": 0.281, + "step": 2360 + }, + { + "epoch": 13.36, + "grad_norm": 1.796783208847046, + "learning_rate": 1.1097378277153559e-05, + "loss": 0.2725, + "step": 2380 + }, + { + "epoch": 13.47, + "grad_norm": 1.7035847902297974, + "learning_rate": 1.102247191011236e-05, + "loss": 0.2799, + "step": 2400 + }, + { + "epoch": 13.58, + "grad_norm": 2.0395431518554688, + "learning_rate": 1.0947565543071162e-05, + "loss": 0.239, + "step": 2420 + }, + { + "epoch": 13.69, + "grad_norm": 1.8008232116699219, + "learning_rate": 1.0872659176029963e-05, + "loss": 0.2553, + "step": 2440 + }, + { + "epoch": 13.81, + "grad_norm": 2.0559043884277344, + "learning_rate": 1.0797752808988765e-05, + "loss": 0.2464, + "step": 2460 + }, + { + "epoch": 13.92, + "grad_norm": 1.8673292398452759, + "learning_rate": 1.0722846441947568e-05, + "loss": 0.2699, + "step": 2480 + }, + { + "epoch": 14.03, + "grad_norm": 1.6819398403167725, + "learning_rate": 1.0647940074906369e-05, + "loss": 0.2566, + "step": 2500 + }, + { + "epoch": 14.14, + "grad_norm": 1.9703686237335205, + "learning_rate": 1.057303370786517e-05, + "loss": 0.2807, + "step": 2520 + }, + { + "epoch": 14.25, + "grad_norm": 2.028834819793701, + "learning_rate": 1.0498127340823972e-05, + "loss": 0.2392, + "step": 2540 + }, + { + "epoch": 14.37, + "grad_norm": 2.2455177307128906, + 
"learning_rate": 1.0423220973782773e-05, + "loss": 0.247, + "step": 2560 + }, + { + "epoch": 14.48, + "grad_norm": 1.8078291416168213, + "learning_rate": 1.0348314606741574e-05, + "loss": 0.2552, + "step": 2580 + }, + { + "epoch": 14.59, + "grad_norm": 2.166729211807251, + "learning_rate": 1.0273408239700376e-05, + "loss": 0.2466, + "step": 2600 + }, + { + "epoch": 14.7, + "grad_norm": 2.710556745529175, + "learning_rate": 1.0198501872659177e-05, + "loss": 0.2506, + "step": 2620 + }, + { + "epoch": 14.82, + "grad_norm": 2.1344659328460693, + "learning_rate": 1.0123595505617978e-05, + "loss": 0.2388, + "step": 2640 + }, + { + "epoch": 14.93, + "grad_norm": 1.595842719078064, + "learning_rate": 1.004868913857678e-05, + "loss": 0.2553, + "step": 2660 + }, + { + "epoch": 15.04, + "grad_norm": 1.5458731651306152, + "learning_rate": 9.973782771535581e-06, + "loss": 0.2478, + "step": 2680 + }, + { + "epoch": 15.15, + "grad_norm": 1.9514356851577759, + "learning_rate": 9.898876404494382e-06, + "loss": 0.234, + "step": 2700 + }, + { + "epoch": 15.26, + "grad_norm": 2.1551694869995117, + "learning_rate": 9.823970037453184e-06, + "loss": 0.251, + "step": 2720 + }, + { + "epoch": 15.38, + "grad_norm": 2.08258318901062, + "learning_rate": 9.749063670411985e-06, + "loss": 0.2511, + "step": 2740 + }, + { + "epoch": 15.49, + "grad_norm": 1.581690788269043, + "learning_rate": 9.674157303370786e-06, + "loss": 0.2185, + "step": 2760 + }, + { + "epoch": 15.6, + "grad_norm": 2.2121975421905518, + "learning_rate": 9.599250936329588e-06, + "loss": 0.2161, + "step": 2780 + }, + { + "epoch": 15.71, + "grad_norm": 1.5077215433120728, + "learning_rate": 9.52434456928839e-06, + "loss": 0.2308, + "step": 2800 + }, + { + "epoch": 15.83, + "grad_norm": 2.57951021194458, + "learning_rate": 9.449438202247192e-06, + "loss": 0.2299, + "step": 2820 + }, + { + "epoch": 15.94, + "grad_norm": 1.6634414196014404, + "learning_rate": 9.374531835205993e-06, + "loss": 0.2576, + "step": 2840 + }, + { + "epoch": 16.05, + "grad_norm": 1.9692113399505615, + "learning_rate": 9.299625468164795e-06, + "loss": 0.2395, + "step": 2860 + }, + { + "epoch": 16.16, + "grad_norm": 1.9327415227890015, + "learning_rate": 9.224719101123596e-06, + "loss": 0.241, + "step": 2880 + }, + { + "epoch": 16.27, + "grad_norm": 1.7675727605819702, + "learning_rate": 9.149812734082398e-06, + "loss": 0.2201, + "step": 2900 + }, + { + "epoch": 16.39, + "grad_norm": 1.9511345624923706, + "learning_rate": 9.074906367041199e-06, + "loss": 0.2171, + "step": 2920 + }, + { + "epoch": 16.5, + "grad_norm": 1.7937383651733398, + "learning_rate": 9e-06, + "loss": 0.2286, + "step": 2940 + }, + { + "epoch": 16.61, + "grad_norm": 1.79076087474823, + "learning_rate": 8.925093632958802e-06, + "loss": 0.2479, + "step": 2960 + }, + { + "epoch": 16.72, + "grad_norm": 2.4045145511627197, + "learning_rate": 8.850187265917603e-06, + "loss": 0.2153, + "step": 2980 + }, + { + "epoch": 16.84, + "grad_norm": 2.1934499740600586, + "learning_rate": 8.775280898876404e-06, + "loss": 0.2361, + "step": 3000 + }, + { + "epoch": 16.95, + "grad_norm": 1.923170804977417, + "learning_rate": 8.700374531835206e-06, + "loss": 0.2146, + "step": 3020 + }, + { + "epoch": 17.06, + "grad_norm": 2.1610753536224365, + "learning_rate": 8.625468164794009e-06, + "loss": 0.2281, + "step": 3040 + }, + { + "epoch": 17.17, + "grad_norm": 2.1105706691741943, + "learning_rate": 8.55056179775281e-06, + "loss": 0.2403, + "step": 3060 + }, + { + "epoch": 17.29, + "grad_norm": 1.979177474975586, + "learning_rate": 
8.475655430711611e-06, + "loss": 0.1734, + "step": 3080 + }, + { + "epoch": 17.4, + "grad_norm": 2.040055274963379, + "learning_rate": 8.400749063670413e-06, + "loss": 0.2393, + "step": 3100 + }, + { + "epoch": 17.51, + "grad_norm": 1.8687106370925903, + "learning_rate": 8.325842696629214e-06, + "loss": 0.2346, + "step": 3120 + }, + { + "epoch": 17.62, + "grad_norm": 1.7447230815887451, + "learning_rate": 8.250936329588015e-06, + "loss": 0.2279, + "step": 3140 + }, + { + "epoch": 17.73, + "grad_norm": 2.9035825729370117, + "learning_rate": 8.176029962546818e-06, + "loss": 0.2049, + "step": 3160 + }, + { + "epoch": 17.85, + "grad_norm": 2.1024608612060547, + "learning_rate": 8.101123595505618e-06, + "loss": 0.1962, + "step": 3180 + }, + { + "epoch": 17.96, + "grad_norm": 2.7913131713867188, + "learning_rate": 8.02621722846442e-06, + "loss": 0.2081, + "step": 3200 + }, + { + "epoch": 18.07, + "grad_norm": 2.0668814182281494, + "learning_rate": 7.95131086142322e-06, + "loss": 0.2304, + "step": 3220 + }, + { + "epoch": 18.18, + "grad_norm": 1.7872204780578613, + "learning_rate": 7.876404494382022e-06, + "loss": 0.1804, + "step": 3240 + }, + { + "epoch": 18.3, + "grad_norm": 2.0718905925750732, + "learning_rate": 7.801498127340823e-06, + "loss": 0.2232, + "step": 3260 + }, + { + "epoch": 18.41, + "grad_norm": 3.835952043533325, + "learning_rate": 7.726591760299626e-06, + "loss": 0.2171, + "step": 3280 + }, + { + "epoch": 18.52, + "grad_norm": 1.5925731658935547, + "learning_rate": 7.651685393258428e-06, + "loss": 0.1999, + "step": 3300 + }, + { + "epoch": 18.63, + "grad_norm": 2.434159994125366, + "learning_rate": 7.576779026217229e-06, + "loss": 0.1876, + "step": 3320 + }, + { + "epoch": 18.74, + "grad_norm": 2.3486499786376953, + "learning_rate": 7.5018726591760305e-06, + "loss": 0.21, + "step": 3340 + }, + { + "epoch": 18.86, + "grad_norm": 1.4824186563491821, + "learning_rate": 7.426966292134832e-06, + "loss": 0.2239, + "step": 3360 + }, + { + "epoch": 18.97, + "grad_norm": 2.062422275543213, + "learning_rate": 7.352059925093633e-06, + "loss": 0.22, + "step": 3380 + }, + { + "epoch": 19.08, + "grad_norm": 2.0563416481018066, + "learning_rate": 7.277153558052435e-06, + "loss": 0.1945, + "step": 3400 + }, + { + "epoch": 19.19, + "grad_norm": 1.6936135292053223, + "learning_rate": 7.202247191011237e-06, + "loss": 0.217, + "step": 3420 + }, + { + "epoch": 19.31, + "grad_norm": 1.9931917190551758, + "learning_rate": 7.127340823970038e-06, + "loss": 0.2127, + "step": 3440 + }, + { + "epoch": 19.42, + "grad_norm": 1.5989198684692383, + "learning_rate": 7.0524344569288395e-06, + "loss": 0.1849, + "step": 3460 + }, + { + "epoch": 19.53, + "grad_norm": 2.0073723793029785, + "learning_rate": 6.977528089887641e-06, + "loss": 0.1805, + "step": 3480 + }, + { + "epoch": 19.64, + "grad_norm": 1.9756735563278198, + "learning_rate": 6.902621722846442e-06, + "loss": 0.1963, + "step": 3500 + } + ], + "logging_steps": 20, + "max_steps": 5340, + "num_input_tokens_seen": 0, + "num_train_epochs": 30, + "save_steps": 500, + "total_flos": 9.0969472548864e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3500/training_args.bin b/checkpoint-3500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7c33ce269deb9707d9fc7879dba5d8878e16b09a --- /dev/null +++ b/checkpoint-3500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f06c12b5819a4aa76501d6843eae3aabbe49b1a33a2903a44bc34146ab4a74b6 +size 
4920 diff --git a/checkpoint-4000/README.md b/checkpoint-4000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..eb638a279dc16df665e7cd491c7ffb8571ba6cd7 --- /dev/null +++ b/checkpoint-4000/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: sallywww/Llama-7B +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.1.dev0 \ No newline at end of file diff --git a/checkpoint-4000/adapter_config.json b/checkpoint-4000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5c344d276888d0aeec2576a19010354f5a57d9c9 --- /dev/null +++ b/checkpoint-4000/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "sallywww/Llama-7B", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-4000/adapter_model.safetensors b/checkpoint-4000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e42c0b486cd987d9383ea68805cbb57fa6e8997c --- /dev/null +++ b/checkpoint-4000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0e9989cd91eb1fa0691a7666da95391ddf03c67eb9c3864cfdf00241040fd17 +size 16794200 diff --git a/checkpoint-4000/optimizer.pt b/checkpoint-4000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..75fb2c01f484648dd21602a2ffa6d4e76b52092c --- /dev/null +++ b/checkpoint-4000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58ad30a7b86f4bc8c9e04ecd1ac8e7ec6dd6cd2bad2577f170f2434ea8ea5663 +size 33662074 diff --git a/checkpoint-4000/rng_state.pth b/checkpoint-4000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..394b2604783af393d12537e11e83fec3a9c32491 --- /dev/null +++ b/checkpoint-4000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19bb7967849c4d72c63cbf604e6d317ad1e9be261793935e5e0fde0e18dc34c8 +size 14244 diff --git a/checkpoint-4000/scheduler.pt b/checkpoint-4000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f61cb18db2f3e6308e0611db1c9f226577fb3779 --- /dev/null +++ b/checkpoint-4000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a8f9a84d39a815b20c31c2af1f8699ecab5bb59dcc801c392fccbe40621173b +size 1064 diff --git a/checkpoint-4000/trainer_state.json b/checkpoint-4000/trainer_state.json new file mode 100644 index 
0000000000000000000000000000000000000000..dcb81540e337a0371ce4462d28a90372af660701 --- /dev/null +++ b/checkpoint-4000/trainer_state.json @@ -0,0 +1,1421 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 22.448263767099263, + "eval_steps": 500, + "global_step": 4000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.11, + "grad_norm": 0.5909375548362732, + "learning_rate": 1.9932584269662923e-05, + "loss": 2.0237, + "step": 20 + }, + { + "epoch": 0.22, + "grad_norm": 0.5826025009155273, + "learning_rate": 1.9857677902621722e-05, + "loss": 1.9306, + "step": 40 + }, + { + "epoch": 0.34, + "grad_norm": 0.5491089820861816, + "learning_rate": 1.9782771535580525e-05, + "loss": 1.7959, + "step": 60 + }, + { + "epoch": 0.45, + "grad_norm": 1.362810730934143, + "learning_rate": 1.970786516853933e-05, + "loss": 1.6599, + "step": 80 + }, + { + "epoch": 0.56, + "grad_norm": 1.4427486658096313, + "learning_rate": 1.963295880149813e-05, + "loss": 1.5685, + "step": 100 + }, + { + "epoch": 0.67, + "grad_norm": 0.9993659257888794, + "learning_rate": 1.956179775280899e-05, + "loss": 1.4621, + "step": 120 + }, + { + "epoch": 0.79, + "grad_norm": 1.614562749862671, + "learning_rate": 1.9486891385767793e-05, + "loss": 1.31, + "step": 140 + }, + { + "epoch": 0.9, + "grad_norm": 1.1975798606872559, + "learning_rate": 1.9411985018726593e-05, + "loss": 1.2322, + "step": 160 + }, + { + "epoch": 1.01, + "grad_norm": 0.7684128880500793, + "learning_rate": 1.9337078651685396e-05, + "loss": 1.1361, + "step": 180 + }, + { + "epoch": 1.12, + "grad_norm": 0.9336960911750793, + "learning_rate": 1.9262172284644195e-05, + "loss": 1.0797, + "step": 200 + }, + { + "epoch": 1.23, + "grad_norm": 0.8471770882606506, + "learning_rate": 1.9187265917603e-05, + "loss": 1.0368, + "step": 220 + }, + { + "epoch": 1.35, + "grad_norm": 1.111340045928955, + "learning_rate": 1.9112359550561798e-05, + "loss": 0.9738, + "step": 240 + }, + { + "epoch": 1.46, + "grad_norm": 0.8093781471252441, + "learning_rate": 1.90374531835206e-05, + "loss": 0.9494, + "step": 260 + }, + { + "epoch": 1.57, + "grad_norm": 0.8438062071800232, + "learning_rate": 1.89625468164794e-05, + "loss": 0.9276, + "step": 280 + }, + { + "epoch": 1.68, + "grad_norm": 0.9896701574325562, + "learning_rate": 1.8887640449438204e-05, + "loss": 0.8656, + "step": 300 + }, + { + "epoch": 1.8, + "grad_norm": 0.8278244137763977, + "learning_rate": 1.8812734082397007e-05, + "loss": 0.8431, + "step": 320 + }, + { + "epoch": 1.91, + "grad_norm": 0.931291937828064, + "learning_rate": 1.8737827715355807e-05, + "loss": 0.7945, + "step": 340 + }, + { + "epoch": 2.02, + "grad_norm": 1.21769380569458, + "learning_rate": 1.866292134831461e-05, + "loss": 0.7647, + "step": 360 + }, + { + "epoch": 2.13, + "grad_norm": 3.5183286666870117, + "learning_rate": 1.858801498127341e-05, + "loss": 0.7497, + "step": 380 + }, + { + "epoch": 2.24, + "grad_norm": 1.1153030395507812, + "learning_rate": 1.8513108614232212e-05, + "loss": 0.7507, + "step": 400 + }, + { + "epoch": 2.36, + "grad_norm": 1.0140526294708252, + "learning_rate": 1.8438202247191012e-05, + "loss": 0.7415, + "step": 420 + }, + { + "epoch": 2.47, + "grad_norm": 1.4395232200622559, + "learning_rate": 1.8363295880149815e-05, + "loss": 0.6947, + "step": 440 + }, + { + "epoch": 2.58, + "grad_norm": 1.4253089427947998, + "learning_rate": 1.8288389513108615e-05, + "loss": 0.7429, + "step": 460 + }, + { + "epoch": 2.69, + "grad_norm": 
1.3152351379394531, + "learning_rate": 1.8213483146067418e-05, + "loss": 0.7363, + "step": 480 + }, + { + "epoch": 2.81, + "grad_norm": 2.5935957431793213, + "learning_rate": 1.8138576779026217e-05, + "loss": 0.6486, + "step": 500 + }, + { + "epoch": 2.92, + "grad_norm": 3.929158926010132, + "learning_rate": 1.806367041198502e-05, + "loss": 0.6395, + "step": 520 + }, + { + "epoch": 3.03, + "grad_norm": 1.7316572666168213, + "learning_rate": 1.7988764044943823e-05, + "loss": 0.664, + "step": 540 + }, + { + "epoch": 3.14, + "grad_norm": 1.3388841152191162, + "learning_rate": 1.7913857677902623e-05, + "loss": 0.6469, + "step": 560 + }, + { + "epoch": 3.25, + "grad_norm": 1.5258549451828003, + "learning_rate": 1.7838951310861426e-05, + "loss": 0.6662, + "step": 580 + }, + { + "epoch": 3.37, + "grad_norm": 1.5486094951629639, + "learning_rate": 1.7764044943820226e-05, + "loss": 0.566, + "step": 600 + }, + { + "epoch": 3.48, + "grad_norm": 1.5657902956008911, + "learning_rate": 1.768913857677903e-05, + "loss": 0.6166, + "step": 620 + }, + { + "epoch": 3.59, + "grad_norm": 1.5971391201019287, + "learning_rate": 1.761423220973783e-05, + "loss": 0.5973, + "step": 640 + }, + { + "epoch": 3.7, + "grad_norm": 1.333030343055725, + "learning_rate": 1.753932584269663e-05, + "loss": 0.6117, + "step": 660 + }, + { + "epoch": 3.82, + "grad_norm": 1.4425445795059204, + "learning_rate": 1.746441947565543e-05, + "loss": 0.5702, + "step": 680 + }, + { + "epoch": 3.93, + "grad_norm": 1.4773032665252686, + "learning_rate": 1.7389513108614234e-05, + "loss": 0.5465, + "step": 700 + }, + { + "epoch": 4.04, + "grad_norm": 1.3328267335891724, + "learning_rate": 1.7314606741573034e-05, + "loss": 0.5379, + "step": 720 + }, + { + "epoch": 4.15, + "grad_norm": 1.6961455345153809, + "learning_rate": 1.7239700374531837e-05, + "loss": 0.5492, + "step": 740 + }, + { + "epoch": 4.27, + "grad_norm": 1.4636189937591553, + "learning_rate": 1.7164794007490637e-05, + "loss": 0.547, + "step": 760 + }, + { + "epoch": 4.38, + "grad_norm": 2.1686649322509766, + "learning_rate": 1.708988764044944e-05, + "loss": 0.5424, + "step": 780 + }, + { + "epoch": 4.49, + "grad_norm": 1.219388723373413, + "learning_rate": 1.7014981273408243e-05, + "loss": 0.5373, + "step": 800 + }, + { + "epoch": 4.6, + "grad_norm": 1.5566452741622925, + "learning_rate": 1.6940074906367042e-05, + "loss": 0.4944, + "step": 820 + }, + { + "epoch": 4.71, + "grad_norm": 1.598917841911316, + "learning_rate": 1.6865168539325845e-05, + "loss": 0.5036, + "step": 840 + }, + { + "epoch": 4.83, + "grad_norm": 1.5281039476394653, + "learning_rate": 1.6790262172284645e-05, + "loss": 0.5215, + "step": 860 + }, + { + "epoch": 4.94, + "grad_norm": 1.7123130559921265, + "learning_rate": 1.6715355805243448e-05, + "loss": 0.5362, + "step": 880 + }, + { + "epoch": 5.05, + "grad_norm": 1.543447732925415, + "learning_rate": 1.6640449438202248e-05, + "loss": 0.5379, + "step": 900 + }, + { + "epoch": 5.16, + "grad_norm": 2.4190192222595215, + "learning_rate": 1.656554307116105e-05, + "loss": 0.4921, + "step": 920 + }, + { + "epoch": 5.28, + "grad_norm": 2.190906047821045, + "learning_rate": 1.649063670411985e-05, + "loss": 0.4652, + "step": 940 + }, + { + "epoch": 5.39, + "grad_norm": 2.113476514816284, + "learning_rate": 1.6415730337078653e-05, + "loss": 0.4914, + "step": 960 + }, + { + "epoch": 5.5, + "grad_norm": 1.8785656690597534, + "learning_rate": 1.6340823970037453e-05, + "loss": 0.5135, + "step": 980 + }, + { + "epoch": 5.61, + "grad_norm": 1.3745977878570557, + "learning_rate": 
1.6265917602996256e-05, + "loss": 0.4697, + "step": 1000 + }, + { + "epoch": 5.72, + "grad_norm": 1.7874308824539185, + "learning_rate": 1.6191011235955056e-05, + "loss": 0.4625, + "step": 1020 + }, + { + "epoch": 5.84, + "grad_norm": 1.4448940753936768, + "learning_rate": 1.611610486891386e-05, + "loss": 0.4764, + "step": 1040 + }, + { + "epoch": 5.95, + "grad_norm": 2.278655767440796, + "learning_rate": 1.6041198501872662e-05, + "loss": 0.4221, + "step": 1060 + }, + { + "epoch": 6.06, + "grad_norm": 1.8602409362792969, + "learning_rate": 1.596629213483146e-05, + "loss": 0.4731, + "step": 1080 + }, + { + "epoch": 6.17, + "grad_norm": 1.884373426437378, + "learning_rate": 1.5891385767790265e-05, + "loss": 0.4241, + "step": 1100 + }, + { + "epoch": 6.29, + "grad_norm": 2.0259287357330322, + "learning_rate": 1.5816479400749064e-05, + "loss": 0.4368, + "step": 1120 + }, + { + "epoch": 6.4, + "grad_norm": 1.812462329864502, + "learning_rate": 1.5741573033707867e-05, + "loss": 0.442, + "step": 1140 + }, + { + "epoch": 6.51, + "grad_norm": 1.934327483177185, + "learning_rate": 1.5666666666666667e-05, + "loss": 0.4195, + "step": 1160 + }, + { + "epoch": 6.62, + "grad_norm": 1.6152955293655396, + "learning_rate": 1.559176029962547e-05, + "loss": 0.4374, + "step": 1180 + }, + { + "epoch": 6.73, + "grad_norm": 2.7782068252563477, + "learning_rate": 1.551685393258427e-05, + "loss": 0.4231, + "step": 1200 + }, + { + "epoch": 6.85, + "grad_norm": 2.372976303100586, + "learning_rate": 1.5441947565543073e-05, + "loss": 0.444, + "step": 1220 + }, + { + "epoch": 6.96, + "grad_norm": 2.171353816986084, + "learning_rate": 1.5367041198501872e-05, + "loss": 0.4389, + "step": 1240 + }, + { + "epoch": 7.07, + "grad_norm": 1.3093984127044678, + "learning_rate": 1.5292134831460675e-05, + "loss": 0.4301, + "step": 1260 + }, + { + "epoch": 7.18, + "grad_norm": 2.267932176589966, + "learning_rate": 1.5217228464419478e-05, + "loss": 0.4046, + "step": 1280 + }, + { + "epoch": 7.3, + "grad_norm": 1.5326164960861206, + "learning_rate": 1.514232209737828e-05, + "loss": 0.4068, + "step": 1300 + }, + { + "epoch": 7.41, + "grad_norm": 3.1525979042053223, + "learning_rate": 1.5067415730337081e-05, + "loss": 0.3847, + "step": 1320 + }, + { + "epoch": 7.52, + "grad_norm": 2.081890106201172, + "learning_rate": 1.4992509363295882e-05, + "loss": 0.4126, + "step": 1340 + }, + { + "epoch": 7.63, + "grad_norm": 2.5701358318328857, + "learning_rate": 1.4917602996254684e-05, + "loss": 0.4065, + "step": 1360 + }, + { + "epoch": 7.74, + "grad_norm": 1.4190051555633545, + "learning_rate": 1.4842696629213485e-05, + "loss": 0.3979, + "step": 1380 + }, + { + "epoch": 7.86, + "grad_norm": 1.9085837602615356, + "learning_rate": 1.4767790262172286e-05, + "loss": 0.3894, + "step": 1400 + }, + { + "epoch": 7.97, + "grad_norm": 1.7573003768920898, + "learning_rate": 1.4692883895131088e-05, + "loss": 0.3751, + "step": 1420 + }, + { + "epoch": 8.08, + "grad_norm": 1.8974506855010986, + "learning_rate": 1.4617977528089889e-05, + "loss": 0.3936, + "step": 1440 + }, + { + "epoch": 8.19, + "grad_norm": 1.3843660354614258, + "learning_rate": 1.454307116104869e-05, + "loss": 0.3848, + "step": 1460 + }, + { + "epoch": 8.31, + "grad_norm": 1.525007724761963, + "learning_rate": 1.4468164794007492e-05, + "loss": 0.3552, + "step": 1480 + }, + { + "epoch": 8.42, + "grad_norm": 2.1665101051330566, + "learning_rate": 1.4393258426966291e-05, + "loss": 0.3547, + "step": 1500 + }, + { + "epoch": 8.53, + "grad_norm": 3.3614535331726074, + "learning_rate": 
1.4318352059925096e-05, + "loss": 0.3771, + "step": 1520 + }, + { + "epoch": 8.64, + "grad_norm": 1.746299386024475, + "learning_rate": 1.4243445692883898e-05, + "loss": 0.396, + "step": 1540 + }, + { + "epoch": 8.75, + "grad_norm": 1.9144684076309204, + "learning_rate": 1.4168539325842699e-05, + "loss": 0.3748, + "step": 1560 + }, + { + "epoch": 8.87, + "grad_norm": 1.9617277383804321, + "learning_rate": 1.40936329588015e-05, + "loss": 0.3504, + "step": 1580 + }, + { + "epoch": 8.98, + "grad_norm": 2.69067645072937, + "learning_rate": 1.4018726591760302e-05, + "loss": 0.3477, + "step": 1600 + }, + { + "epoch": 9.09, + "grad_norm": 2.142008066177368, + "learning_rate": 1.3943820224719103e-05, + "loss": 0.3539, + "step": 1620 + }, + { + "epoch": 9.2, + "grad_norm": 1.7684266567230225, + "learning_rate": 1.3868913857677904e-05, + "loss": 0.3576, + "step": 1640 + }, + { + "epoch": 9.32, + "grad_norm": 1.4222275018692017, + "learning_rate": 1.3794007490636706e-05, + "loss": 0.3839, + "step": 1660 + }, + { + "epoch": 9.43, + "grad_norm": 2.0622501373291016, + "learning_rate": 1.3719101123595507e-05, + "loss": 0.3278, + "step": 1680 + }, + { + "epoch": 9.54, + "grad_norm": 1.639147400856018, + "learning_rate": 1.3644194756554308e-05, + "loss": 0.3374, + "step": 1700 + }, + { + "epoch": 9.65, + "grad_norm": 2.093045473098755, + "learning_rate": 1.356928838951311e-05, + "loss": 0.3535, + "step": 1720 + }, + { + "epoch": 9.76, + "grad_norm": 1.3492937088012695, + "learning_rate": 1.3494382022471911e-05, + "loss": 0.3105, + "step": 1740 + }, + { + "epoch": 9.88, + "grad_norm": 1.585205316543579, + "learning_rate": 1.3419475655430714e-05, + "loss": 0.3181, + "step": 1760 + }, + { + "epoch": 9.99, + "grad_norm": 2.8895344734191895, + "learning_rate": 1.3344569288389515e-05, + "loss": 0.3473, + "step": 1780 + }, + { + "epoch": 10.1, + "grad_norm": 1.7224748134613037, + "learning_rate": 1.3269662921348317e-05, + "loss": 0.3524, + "step": 1800 + }, + { + "epoch": 10.21, + "grad_norm": 2.1029868125915527, + "learning_rate": 1.3194756554307118e-05, + "loss": 0.3408, + "step": 1820 + }, + { + "epoch": 10.33, + "grad_norm": 2.434016227722168, + "learning_rate": 1.311985018726592e-05, + "loss": 0.3266, + "step": 1840 + }, + { + "epoch": 10.44, + "grad_norm": 1.953553318977356, + "learning_rate": 1.304494382022472e-05, + "loss": 0.2844, + "step": 1860 + }, + { + "epoch": 10.55, + "grad_norm": 2.5946218967437744, + "learning_rate": 1.2970037453183522e-05, + "loss": 0.3225, + "step": 1880 + }, + { + "epoch": 10.66, + "grad_norm": 2.5305733680725098, + "learning_rate": 1.2895131086142323e-05, + "loss": 0.3183, + "step": 1900 + }, + { + "epoch": 10.78, + "grad_norm": 3.56726336479187, + "learning_rate": 1.2820224719101125e-05, + "loss": 0.2944, + "step": 1920 + }, + { + "epoch": 10.89, + "grad_norm": 1.9687740802764893, + "learning_rate": 1.2745318352059926e-05, + "loss": 0.3411, + "step": 1940 + }, + { + "epoch": 11.0, + "grad_norm": 1.6027730703353882, + "learning_rate": 1.2670411985018727e-05, + "loss": 0.2949, + "step": 1960 + }, + { + "epoch": 11.11, + "grad_norm": 1.8739397525787354, + "learning_rate": 1.2595505617977529e-05, + "loss": 0.2716, + "step": 1980 + }, + { + "epoch": 11.22, + "grad_norm": 1.6741198301315308, + "learning_rate": 1.2520599250936332e-05, + "loss": 0.3334, + "step": 2000 + }, + { + "epoch": 11.34, + "grad_norm": 1.950945496559143, + "learning_rate": 1.2445692883895133e-05, + "loss": 0.3291, + "step": 2020 + }, + { + "epoch": 11.45, + "grad_norm": 1.9362170696258545, + "learning_rate": 
1.2370786516853935e-05, + "loss": 0.2716, + "step": 2040 + }, + { + "epoch": 11.56, + "grad_norm": 1.6201746463775635, + "learning_rate": 1.2295880149812736e-05, + "loss": 0.2893, + "step": 2060 + }, + { + "epoch": 11.67, + "grad_norm": 3.488088607788086, + "learning_rate": 1.2220973782771537e-05, + "loss": 0.3239, + "step": 2080 + }, + { + "epoch": 11.79, + "grad_norm": 2.4608683586120605, + "learning_rate": 1.2146067415730339e-05, + "loss": 0.271, + "step": 2100 + }, + { + "epoch": 11.9, + "grad_norm": 1.5321098566055298, + "learning_rate": 1.207116104868914e-05, + "loss": 0.2876, + "step": 2120 + }, + { + "epoch": 12.01, + "grad_norm": 1.8334771394729614, + "learning_rate": 1.1996254681647941e-05, + "loss": 0.3066, + "step": 2140 + }, + { + "epoch": 12.12, + "grad_norm": 1.9506254196166992, + "learning_rate": 1.1921348314606743e-05, + "loss": 0.3023, + "step": 2160 + }, + { + "epoch": 12.23, + "grad_norm": 2.9073598384857178, + "learning_rate": 1.1846441947565544e-05, + "loss": 0.3152, + "step": 2180 + }, + { + "epoch": 12.35, + "grad_norm": 1.6023261547088623, + "learning_rate": 1.1771535580524345e-05, + "loss": 0.248, + "step": 2200 + }, + { + "epoch": 12.46, + "grad_norm": 1.7954633235931396, + "learning_rate": 1.1696629213483147e-05, + "loss": 0.2666, + "step": 2220 + }, + { + "epoch": 12.57, + "grad_norm": 2.0331828594207764, + "learning_rate": 1.162172284644195e-05, + "loss": 0.2878, + "step": 2240 + }, + { + "epoch": 12.68, + "grad_norm": 1.656420350074768, + "learning_rate": 1.1546816479400751e-05, + "loss": 0.2805, + "step": 2260 + }, + { + "epoch": 12.8, + "grad_norm": 1.5245873928070068, + "learning_rate": 1.1471910112359552e-05, + "loss": 0.2792, + "step": 2280 + }, + { + "epoch": 12.91, + "grad_norm": 2.6713974475860596, + "learning_rate": 1.1397003745318354e-05, + "loss": 0.2841, + "step": 2300 + }, + { + "epoch": 13.02, + "grad_norm": 1.268479347229004, + "learning_rate": 1.1322097378277155e-05, + "loss": 0.2708, + "step": 2320 + }, + { + "epoch": 13.13, + "grad_norm": 2.2990434169769287, + "learning_rate": 1.1247191011235956e-05, + "loss": 0.2649, + "step": 2340 + }, + { + "epoch": 13.24, + "grad_norm": 2.351956367492676, + "learning_rate": 1.1172284644194758e-05, + "loss": 0.281, + "step": 2360 + }, + { + "epoch": 13.36, + "grad_norm": 1.796783208847046, + "learning_rate": 1.1097378277153559e-05, + "loss": 0.2725, + "step": 2380 + }, + { + "epoch": 13.47, + "grad_norm": 1.7035847902297974, + "learning_rate": 1.102247191011236e-05, + "loss": 0.2799, + "step": 2400 + }, + { + "epoch": 13.58, + "grad_norm": 2.0395431518554688, + "learning_rate": 1.0947565543071162e-05, + "loss": 0.239, + "step": 2420 + }, + { + "epoch": 13.69, + "grad_norm": 1.8008232116699219, + "learning_rate": 1.0872659176029963e-05, + "loss": 0.2553, + "step": 2440 + }, + { + "epoch": 13.81, + "grad_norm": 2.0559043884277344, + "learning_rate": 1.0797752808988765e-05, + "loss": 0.2464, + "step": 2460 + }, + { + "epoch": 13.92, + "grad_norm": 1.8673292398452759, + "learning_rate": 1.0722846441947568e-05, + "loss": 0.2699, + "step": 2480 + }, + { + "epoch": 14.03, + "grad_norm": 1.6819398403167725, + "learning_rate": 1.0647940074906369e-05, + "loss": 0.2566, + "step": 2500 + }, + { + "epoch": 14.14, + "grad_norm": 1.9703686237335205, + "learning_rate": 1.057303370786517e-05, + "loss": 0.2807, + "step": 2520 + }, + { + "epoch": 14.25, + "grad_norm": 2.028834819793701, + "learning_rate": 1.0498127340823972e-05, + "loss": 0.2392, + "step": 2540 + }, + { + "epoch": 14.37, + "grad_norm": 2.2455177307128906, + 
"learning_rate": 1.0423220973782773e-05, + "loss": 0.247, + "step": 2560 + }, + { + "epoch": 14.48, + "grad_norm": 1.8078291416168213, + "learning_rate": 1.0348314606741574e-05, + "loss": 0.2552, + "step": 2580 + }, + { + "epoch": 14.59, + "grad_norm": 2.166729211807251, + "learning_rate": 1.0273408239700376e-05, + "loss": 0.2466, + "step": 2600 + }, + { + "epoch": 14.7, + "grad_norm": 2.710556745529175, + "learning_rate": 1.0198501872659177e-05, + "loss": 0.2506, + "step": 2620 + }, + { + "epoch": 14.82, + "grad_norm": 2.1344659328460693, + "learning_rate": 1.0123595505617978e-05, + "loss": 0.2388, + "step": 2640 + }, + { + "epoch": 14.93, + "grad_norm": 1.595842719078064, + "learning_rate": 1.004868913857678e-05, + "loss": 0.2553, + "step": 2660 + }, + { + "epoch": 15.04, + "grad_norm": 1.5458731651306152, + "learning_rate": 9.973782771535581e-06, + "loss": 0.2478, + "step": 2680 + }, + { + "epoch": 15.15, + "grad_norm": 1.9514356851577759, + "learning_rate": 9.898876404494382e-06, + "loss": 0.234, + "step": 2700 + }, + { + "epoch": 15.26, + "grad_norm": 2.1551694869995117, + "learning_rate": 9.823970037453184e-06, + "loss": 0.251, + "step": 2720 + }, + { + "epoch": 15.38, + "grad_norm": 2.08258318901062, + "learning_rate": 9.749063670411985e-06, + "loss": 0.2511, + "step": 2740 + }, + { + "epoch": 15.49, + "grad_norm": 1.581690788269043, + "learning_rate": 9.674157303370786e-06, + "loss": 0.2185, + "step": 2760 + }, + { + "epoch": 15.6, + "grad_norm": 2.2121975421905518, + "learning_rate": 9.599250936329588e-06, + "loss": 0.2161, + "step": 2780 + }, + { + "epoch": 15.71, + "grad_norm": 1.5077215433120728, + "learning_rate": 9.52434456928839e-06, + "loss": 0.2308, + "step": 2800 + }, + { + "epoch": 15.83, + "grad_norm": 2.57951021194458, + "learning_rate": 9.449438202247192e-06, + "loss": 0.2299, + "step": 2820 + }, + { + "epoch": 15.94, + "grad_norm": 1.6634414196014404, + "learning_rate": 9.374531835205993e-06, + "loss": 0.2576, + "step": 2840 + }, + { + "epoch": 16.05, + "grad_norm": 1.9692113399505615, + "learning_rate": 9.299625468164795e-06, + "loss": 0.2395, + "step": 2860 + }, + { + "epoch": 16.16, + "grad_norm": 1.9327415227890015, + "learning_rate": 9.224719101123596e-06, + "loss": 0.241, + "step": 2880 + }, + { + "epoch": 16.27, + "grad_norm": 1.7675727605819702, + "learning_rate": 9.149812734082398e-06, + "loss": 0.2201, + "step": 2900 + }, + { + "epoch": 16.39, + "grad_norm": 1.9511345624923706, + "learning_rate": 9.074906367041199e-06, + "loss": 0.2171, + "step": 2920 + }, + { + "epoch": 16.5, + "grad_norm": 1.7937383651733398, + "learning_rate": 9e-06, + "loss": 0.2286, + "step": 2940 + }, + { + "epoch": 16.61, + "grad_norm": 1.79076087474823, + "learning_rate": 8.925093632958802e-06, + "loss": 0.2479, + "step": 2960 + }, + { + "epoch": 16.72, + "grad_norm": 2.4045145511627197, + "learning_rate": 8.850187265917603e-06, + "loss": 0.2153, + "step": 2980 + }, + { + "epoch": 16.84, + "grad_norm": 2.1934499740600586, + "learning_rate": 8.775280898876404e-06, + "loss": 0.2361, + "step": 3000 + }, + { + "epoch": 16.95, + "grad_norm": 1.923170804977417, + "learning_rate": 8.700374531835206e-06, + "loss": 0.2146, + "step": 3020 + }, + { + "epoch": 17.06, + "grad_norm": 2.1610753536224365, + "learning_rate": 8.625468164794009e-06, + "loss": 0.2281, + "step": 3040 + }, + { + "epoch": 17.17, + "grad_norm": 2.1105706691741943, + "learning_rate": 8.55056179775281e-06, + "loss": 0.2403, + "step": 3060 + }, + { + "epoch": 17.29, + "grad_norm": 1.979177474975586, + "learning_rate": 
8.475655430711611e-06, + "loss": 0.1734, + "step": 3080 + }, + { + "epoch": 17.4, + "grad_norm": 2.040055274963379, + "learning_rate": 8.400749063670413e-06, + "loss": 0.2393, + "step": 3100 + }, + { + "epoch": 17.51, + "grad_norm": 1.8687106370925903, + "learning_rate": 8.325842696629214e-06, + "loss": 0.2346, + "step": 3120 + }, + { + "epoch": 17.62, + "grad_norm": 1.7447230815887451, + "learning_rate": 8.250936329588015e-06, + "loss": 0.2279, + "step": 3140 + }, + { + "epoch": 17.73, + "grad_norm": 2.9035825729370117, + "learning_rate": 8.176029962546818e-06, + "loss": 0.2049, + "step": 3160 + }, + { + "epoch": 17.85, + "grad_norm": 2.1024608612060547, + "learning_rate": 8.101123595505618e-06, + "loss": 0.1962, + "step": 3180 + }, + { + "epoch": 17.96, + "grad_norm": 2.7913131713867188, + "learning_rate": 8.02621722846442e-06, + "loss": 0.2081, + "step": 3200 + }, + { + "epoch": 18.07, + "grad_norm": 2.0668814182281494, + "learning_rate": 7.95131086142322e-06, + "loss": 0.2304, + "step": 3220 + }, + { + "epoch": 18.18, + "grad_norm": 1.7872204780578613, + "learning_rate": 7.876404494382022e-06, + "loss": 0.1804, + "step": 3240 + }, + { + "epoch": 18.3, + "grad_norm": 2.0718905925750732, + "learning_rate": 7.801498127340823e-06, + "loss": 0.2232, + "step": 3260 + }, + { + "epoch": 18.41, + "grad_norm": 3.835952043533325, + "learning_rate": 7.726591760299626e-06, + "loss": 0.2171, + "step": 3280 + }, + { + "epoch": 18.52, + "grad_norm": 1.5925731658935547, + "learning_rate": 7.651685393258428e-06, + "loss": 0.1999, + "step": 3300 + }, + { + "epoch": 18.63, + "grad_norm": 2.434159994125366, + "learning_rate": 7.576779026217229e-06, + "loss": 0.1876, + "step": 3320 + }, + { + "epoch": 18.74, + "grad_norm": 2.3486499786376953, + "learning_rate": 7.5018726591760305e-06, + "loss": 0.21, + "step": 3340 + }, + { + "epoch": 18.86, + "grad_norm": 1.4824186563491821, + "learning_rate": 7.426966292134832e-06, + "loss": 0.2239, + "step": 3360 + }, + { + "epoch": 18.97, + "grad_norm": 2.062422275543213, + "learning_rate": 7.352059925093633e-06, + "loss": 0.22, + "step": 3380 + }, + { + "epoch": 19.08, + "grad_norm": 2.0563416481018066, + "learning_rate": 7.277153558052435e-06, + "loss": 0.1945, + "step": 3400 + }, + { + "epoch": 19.19, + "grad_norm": 1.6936135292053223, + "learning_rate": 7.202247191011237e-06, + "loss": 0.217, + "step": 3420 + }, + { + "epoch": 19.31, + "grad_norm": 1.9931917190551758, + "learning_rate": 7.127340823970038e-06, + "loss": 0.2127, + "step": 3440 + }, + { + "epoch": 19.42, + "grad_norm": 1.5989198684692383, + "learning_rate": 7.0524344569288395e-06, + "loss": 0.1849, + "step": 3460 + }, + { + "epoch": 19.53, + "grad_norm": 2.0073723793029785, + "learning_rate": 6.977528089887641e-06, + "loss": 0.1805, + "step": 3480 + }, + { + "epoch": 19.64, + "grad_norm": 1.9756735563278198, + "learning_rate": 6.902621722846442e-06, + "loss": 0.1963, + "step": 3500 + }, + { + "epoch": 19.75, + "grad_norm": 1.5112028121948242, + "learning_rate": 6.827715355805244e-06, + "loss": 0.2008, + "step": 3520 + }, + { + "epoch": 19.87, + "grad_norm": 2.2792975902557373, + "learning_rate": 6.752808988764046e-06, + "loss": 0.2106, + "step": 3540 + }, + { + "epoch": 19.98, + "grad_norm": 2.768470048904419, + "learning_rate": 6.677902621722847e-06, + "loss": 0.2066, + "step": 3560 + }, + { + "epoch": 20.09, + "grad_norm": 1.6916066408157349, + "learning_rate": 6.602996254681648e-06, + "loss": 0.1912, + "step": 3580 + }, + { + "epoch": 20.2, + "grad_norm": 1.7649778127670288, + "learning_rate": 
6.52808988764045e-06, + "loss": 0.2027, + "step": 3600 + }, + { + "epoch": 20.32, + "grad_norm": 1.9743694067001343, + "learning_rate": 6.453183520599251e-06, + "loss": 0.2208, + "step": 3620 + }, + { + "epoch": 20.43, + "grad_norm": 1.827344298362732, + "learning_rate": 6.378277153558053e-06, + "loss": 0.1757, + "step": 3640 + }, + { + "epoch": 20.54, + "grad_norm": 2.7847957611083984, + "learning_rate": 6.303370786516855e-06, + "loss": 0.1931, + "step": 3660 + }, + { + "epoch": 20.65, + "grad_norm": 1.8572605848312378, + "learning_rate": 6.228464419475656e-06, + "loss": 0.1902, + "step": 3680 + }, + { + "epoch": 20.76, + "grad_norm": 1.5343818664550781, + "learning_rate": 6.153558052434457e-06, + "loss": 0.1916, + "step": 3700 + }, + { + "epoch": 20.88, + "grad_norm": 1.703688383102417, + "learning_rate": 6.078651685393259e-06, + "loss": 0.1897, + "step": 3720 + }, + { + "epoch": 20.99, + "grad_norm": 2.5442187786102295, + "learning_rate": 6.00374531835206e-06, + "loss": 0.1859, + "step": 3740 + }, + { + "epoch": 21.1, + "grad_norm": 2.0333402156829834, + "learning_rate": 5.928838951310862e-06, + "loss": 0.1632, + "step": 3760 + }, + { + "epoch": 21.21, + "grad_norm": 2.107227087020874, + "learning_rate": 5.8539325842696635e-06, + "loss": 0.2031, + "step": 3780 + }, + { + "epoch": 21.33, + "grad_norm": 2.0351223945617676, + "learning_rate": 5.779026217228465e-06, + "loss": 0.1759, + "step": 3800 + }, + { + "epoch": 21.44, + "grad_norm": 2.1328284740448, + "learning_rate": 5.704119850187266e-06, + "loss": 0.1853, + "step": 3820 + }, + { + "epoch": 21.55, + "grad_norm": 2.0145580768585205, + "learning_rate": 5.629213483146068e-06, + "loss": 0.1919, + "step": 3840 + }, + { + "epoch": 21.66, + "grad_norm": 1.8794372081756592, + "learning_rate": 5.554307116104869e-06, + "loss": 0.1958, + "step": 3860 + }, + { + "epoch": 21.77, + "grad_norm": 1.8487616777420044, + "learning_rate": 5.479400749063671e-06, + "loss": 0.207, + "step": 3880 + }, + { + "epoch": 21.89, + "grad_norm": 2.080965042114258, + "learning_rate": 5.4044943820224725e-06, + "loss": 0.1715, + "step": 3900 + }, + { + "epoch": 22.0, + "grad_norm": 2.0303232669830322, + "learning_rate": 5.329588014981274e-06, + "loss": 0.1873, + "step": 3920 + }, + { + "epoch": 22.11, + "grad_norm": 2.1078438758850098, + "learning_rate": 5.254681647940075e-06, + "loss": 0.1869, + "step": 3940 + }, + { + "epoch": 22.22, + "grad_norm": 1.8502501249313354, + "learning_rate": 5.1797752808988765e-06, + "loss": 0.1983, + "step": 3960 + }, + { + "epoch": 22.34, + "grad_norm": 2.209162950515747, + "learning_rate": 5.104868913857678e-06, + "loss": 0.185, + "step": 3980 + }, + { + "epoch": 22.45, + "grad_norm": 1.9525928497314453, + "learning_rate": 5.02996254681648e-06, + "loss": 0.1943, + "step": 4000 + } + ], + "logging_steps": 20, + "max_steps": 5340, + "num_input_tokens_seen": 0, + "num_train_epochs": 30, + "save_steps": 500, + "total_flos": 1.03965111484416e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4000/training_args.bin b/checkpoint-4000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7c33ce269deb9707d9fc7879dba5d8878e16b09a --- /dev/null +++ b/checkpoint-4000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f06c12b5819a4aa76501d6843eae3aabbe49b1a33a2903a44bc34146ab4a74b6 +size 4920 diff --git a/checkpoint-4500/README.md b/checkpoint-4500/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..eb638a279dc16df665e7cd491c7ffb8571ba6cd7 --- /dev/null +++ b/checkpoint-4500/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: sallywww/Llama-7B +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.1.dev0 \ No newline at end of file diff --git a/checkpoint-4500/adapter_config.json b/checkpoint-4500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5c344d276888d0aeec2576a19010354f5a57d9c9 --- /dev/null +++ b/checkpoint-4500/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "sallywww/Llama-7B", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-4500/adapter_model.safetensors b/checkpoint-4500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..adf997d95b5551d397b4be691063510f179df1b5 --- /dev/null +++ b/checkpoint-4500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4682358f8f3e059f2df07a7de36eb8c30febb9acbfa32c4833b50c708bbba0dc +size 16794200 diff --git a/checkpoint-4500/optimizer.pt b/checkpoint-4500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0d63ec1927b65bccc0588ad1da1a65cca495c5d --- /dev/null +++ b/checkpoint-4500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83498e378d8b6ff0b19454e4e1ab3fe69d2b73cf7ca64e9942bc1b7e6a7526b4 +size 33662074 diff --git a/checkpoint-4500/rng_state.pth b/checkpoint-4500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..44cc81e1137248da4430a89fcdff2098a54fb72b --- /dev/null +++ b/checkpoint-4500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67273e96104b1717ac8ddaf0dffaac65e3efe560dafeca4d15d22097de99d575 +size 14244 diff --git a/checkpoint-4500/scheduler.pt b/checkpoint-4500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cef32bbb592b5678a4c3cd9e966f2488653fd085 --- /dev/null +++ b/checkpoint-4500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10155ded04413c5ab02c30ce7bcef09d70271ae55ca2ed07741c665afd0c60d1 +size 1064 diff --git a/checkpoint-4500/trainer_state.json b/checkpoint-4500/trainer_state.json new file mode 100644 index 
0000000000000000000000000000000000000000..b4f71ab99cef4ff026a7ec3cfcde7256db284076 --- /dev/null +++ b/checkpoint-4500/trainer_state.json @@ -0,0 +1,1596 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 25.25429673798667, + "eval_steps": 500, + "global_step": 4500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.11, + "grad_norm": 0.5909375548362732, + "learning_rate": 1.9932584269662923e-05, + "loss": 2.0237, + "step": 20 + }, + { + "epoch": 0.22, + "grad_norm": 0.5826025009155273, + "learning_rate": 1.9857677902621722e-05, + "loss": 1.9306, + "step": 40 + }, + { + "epoch": 0.34, + "grad_norm": 0.5491089820861816, + "learning_rate": 1.9782771535580525e-05, + "loss": 1.7959, + "step": 60 + }, + { + "epoch": 0.45, + "grad_norm": 1.362810730934143, + "learning_rate": 1.970786516853933e-05, + "loss": 1.6599, + "step": 80 + }, + { + "epoch": 0.56, + "grad_norm": 1.4427486658096313, + "learning_rate": 1.963295880149813e-05, + "loss": 1.5685, + "step": 100 + }, + { + "epoch": 0.67, + "grad_norm": 0.9993659257888794, + "learning_rate": 1.956179775280899e-05, + "loss": 1.4621, + "step": 120 + }, + { + "epoch": 0.79, + "grad_norm": 1.614562749862671, + "learning_rate": 1.9486891385767793e-05, + "loss": 1.31, + "step": 140 + }, + { + "epoch": 0.9, + "grad_norm": 1.1975798606872559, + "learning_rate": 1.9411985018726593e-05, + "loss": 1.2322, + "step": 160 + }, + { + "epoch": 1.01, + "grad_norm": 0.7684128880500793, + "learning_rate": 1.9337078651685396e-05, + "loss": 1.1361, + "step": 180 + }, + { + "epoch": 1.12, + "grad_norm": 0.9336960911750793, + "learning_rate": 1.9262172284644195e-05, + "loss": 1.0797, + "step": 200 + }, + { + "epoch": 1.23, + "grad_norm": 0.8471770882606506, + "learning_rate": 1.9187265917603e-05, + "loss": 1.0368, + "step": 220 + }, + { + "epoch": 1.35, + "grad_norm": 1.111340045928955, + "learning_rate": 1.9112359550561798e-05, + "loss": 0.9738, + "step": 240 + }, + { + "epoch": 1.46, + "grad_norm": 0.8093781471252441, + "learning_rate": 1.90374531835206e-05, + "loss": 0.9494, + "step": 260 + }, + { + "epoch": 1.57, + "grad_norm": 0.8438062071800232, + "learning_rate": 1.89625468164794e-05, + "loss": 0.9276, + "step": 280 + }, + { + "epoch": 1.68, + "grad_norm": 0.9896701574325562, + "learning_rate": 1.8887640449438204e-05, + "loss": 0.8656, + "step": 300 + }, + { + "epoch": 1.8, + "grad_norm": 0.8278244137763977, + "learning_rate": 1.8812734082397007e-05, + "loss": 0.8431, + "step": 320 + }, + { + "epoch": 1.91, + "grad_norm": 0.931291937828064, + "learning_rate": 1.8737827715355807e-05, + "loss": 0.7945, + "step": 340 + }, + { + "epoch": 2.02, + "grad_norm": 1.21769380569458, + "learning_rate": 1.866292134831461e-05, + "loss": 0.7647, + "step": 360 + }, + { + "epoch": 2.13, + "grad_norm": 3.5183286666870117, + "learning_rate": 1.858801498127341e-05, + "loss": 0.7497, + "step": 380 + }, + { + "epoch": 2.24, + "grad_norm": 1.1153030395507812, + "learning_rate": 1.8513108614232212e-05, + "loss": 0.7507, + "step": 400 + }, + { + "epoch": 2.36, + "grad_norm": 1.0140526294708252, + "learning_rate": 1.8438202247191012e-05, + "loss": 0.7415, + "step": 420 + }, + { + "epoch": 2.47, + "grad_norm": 1.4395232200622559, + "learning_rate": 1.8363295880149815e-05, + "loss": 0.6947, + "step": 440 + }, + { + "epoch": 2.58, + "grad_norm": 1.4253089427947998, + "learning_rate": 1.8288389513108615e-05, + "loss": 0.7429, + "step": 460 + }, + { + "epoch": 2.69, + "grad_norm": 
1.3152351379394531, + "learning_rate": 1.8213483146067418e-05, + "loss": 0.7363, + "step": 480 + }, + { + "epoch": 2.81, + "grad_norm": 2.5935957431793213, + "learning_rate": 1.8138576779026217e-05, + "loss": 0.6486, + "step": 500 + }, + { + "epoch": 2.92, + "grad_norm": 3.929158926010132, + "learning_rate": 1.806367041198502e-05, + "loss": 0.6395, + "step": 520 + }, + { + "epoch": 3.03, + "grad_norm": 1.7316572666168213, + "learning_rate": 1.7988764044943823e-05, + "loss": 0.664, + "step": 540 + }, + { + "epoch": 3.14, + "grad_norm": 1.3388841152191162, + "learning_rate": 1.7913857677902623e-05, + "loss": 0.6469, + "step": 560 + }, + { + "epoch": 3.25, + "grad_norm": 1.5258549451828003, + "learning_rate": 1.7838951310861426e-05, + "loss": 0.6662, + "step": 580 + }, + { + "epoch": 3.37, + "grad_norm": 1.5486094951629639, + "learning_rate": 1.7764044943820226e-05, + "loss": 0.566, + "step": 600 + }, + { + "epoch": 3.48, + "grad_norm": 1.5657902956008911, + "learning_rate": 1.768913857677903e-05, + "loss": 0.6166, + "step": 620 + }, + { + "epoch": 3.59, + "grad_norm": 1.5971391201019287, + "learning_rate": 1.761423220973783e-05, + "loss": 0.5973, + "step": 640 + }, + { + "epoch": 3.7, + "grad_norm": 1.333030343055725, + "learning_rate": 1.753932584269663e-05, + "loss": 0.6117, + "step": 660 + }, + { + "epoch": 3.82, + "grad_norm": 1.4425445795059204, + "learning_rate": 1.746441947565543e-05, + "loss": 0.5702, + "step": 680 + }, + { + "epoch": 3.93, + "grad_norm": 1.4773032665252686, + "learning_rate": 1.7389513108614234e-05, + "loss": 0.5465, + "step": 700 + }, + { + "epoch": 4.04, + "grad_norm": 1.3328267335891724, + "learning_rate": 1.7314606741573034e-05, + "loss": 0.5379, + "step": 720 + }, + { + "epoch": 4.15, + "grad_norm": 1.6961455345153809, + "learning_rate": 1.7239700374531837e-05, + "loss": 0.5492, + "step": 740 + }, + { + "epoch": 4.27, + "grad_norm": 1.4636189937591553, + "learning_rate": 1.7164794007490637e-05, + "loss": 0.547, + "step": 760 + }, + { + "epoch": 4.38, + "grad_norm": 2.1686649322509766, + "learning_rate": 1.708988764044944e-05, + "loss": 0.5424, + "step": 780 + }, + { + "epoch": 4.49, + "grad_norm": 1.219388723373413, + "learning_rate": 1.7014981273408243e-05, + "loss": 0.5373, + "step": 800 + }, + { + "epoch": 4.6, + "grad_norm": 1.5566452741622925, + "learning_rate": 1.6940074906367042e-05, + "loss": 0.4944, + "step": 820 + }, + { + "epoch": 4.71, + "grad_norm": 1.598917841911316, + "learning_rate": 1.6865168539325845e-05, + "loss": 0.5036, + "step": 840 + }, + { + "epoch": 4.83, + "grad_norm": 1.5281039476394653, + "learning_rate": 1.6790262172284645e-05, + "loss": 0.5215, + "step": 860 + }, + { + "epoch": 4.94, + "grad_norm": 1.7123130559921265, + "learning_rate": 1.6715355805243448e-05, + "loss": 0.5362, + "step": 880 + }, + { + "epoch": 5.05, + "grad_norm": 1.543447732925415, + "learning_rate": 1.6640449438202248e-05, + "loss": 0.5379, + "step": 900 + }, + { + "epoch": 5.16, + "grad_norm": 2.4190192222595215, + "learning_rate": 1.656554307116105e-05, + "loss": 0.4921, + "step": 920 + }, + { + "epoch": 5.28, + "grad_norm": 2.190906047821045, + "learning_rate": 1.649063670411985e-05, + "loss": 0.4652, + "step": 940 + }, + { + "epoch": 5.39, + "grad_norm": 2.113476514816284, + "learning_rate": 1.6415730337078653e-05, + "loss": 0.4914, + "step": 960 + }, + { + "epoch": 5.5, + "grad_norm": 1.8785656690597534, + "learning_rate": 1.6340823970037453e-05, + "loss": 0.5135, + "step": 980 + }, + { + "epoch": 5.61, + "grad_norm": 1.3745977878570557, + "learning_rate": 
1.6265917602996256e-05, + "loss": 0.4697, + "step": 1000 + }, + { + "epoch": 5.72, + "grad_norm": 1.7874308824539185, + "learning_rate": 1.6191011235955056e-05, + "loss": 0.4625, + "step": 1020 + }, + { + "epoch": 5.84, + "grad_norm": 1.4448940753936768, + "learning_rate": 1.611610486891386e-05, + "loss": 0.4764, + "step": 1040 + }, + { + "epoch": 5.95, + "grad_norm": 2.278655767440796, + "learning_rate": 1.6041198501872662e-05, + "loss": 0.4221, + "step": 1060 + }, + { + "epoch": 6.06, + "grad_norm": 1.8602409362792969, + "learning_rate": 1.596629213483146e-05, + "loss": 0.4731, + "step": 1080 + }, + { + "epoch": 6.17, + "grad_norm": 1.884373426437378, + "learning_rate": 1.5891385767790265e-05, + "loss": 0.4241, + "step": 1100 + }, + { + "epoch": 6.29, + "grad_norm": 2.0259287357330322, + "learning_rate": 1.5816479400749064e-05, + "loss": 0.4368, + "step": 1120 + }, + { + "epoch": 6.4, + "grad_norm": 1.812462329864502, + "learning_rate": 1.5741573033707867e-05, + "loss": 0.442, + "step": 1140 + }, + { + "epoch": 6.51, + "grad_norm": 1.934327483177185, + "learning_rate": 1.5666666666666667e-05, + "loss": 0.4195, + "step": 1160 + }, + { + "epoch": 6.62, + "grad_norm": 1.6152955293655396, + "learning_rate": 1.559176029962547e-05, + "loss": 0.4374, + "step": 1180 + }, + { + "epoch": 6.73, + "grad_norm": 2.7782068252563477, + "learning_rate": 1.551685393258427e-05, + "loss": 0.4231, + "step": 1200 + }, + { + "epoch": 6.85, + "grad_norm": 2.372976303100586, + "learning_rate": 1.5441947565543073e-05, + "loss": 0.444, + "step": 1220 + }, + { + "epoch": 6.96, + "grad_norm": 2.171353816986084, + "learning_rate": 1.5367041198501872e-05, + "loss": 0.4389, + "step": 1240 + }, + { + "epoch": 7.07, + "grad_norm": 1.3093984127044678, + "learning_rate": 1.5292134831460675e-05, + "loss": 0.4301, + "step": 1260 + }, + { + "epoch": 7.18, + "grad_norm": 2.267932176589966, + "learning_rate": 1.5217228464419478e-05, + "loss": 0.4046, + "step": 1280 + }, + { + "epoch": 7.3, + "grad_norm": 1.5326164960861206, + "learning_rate": 1.514232209737828e-05, + "loss": 0.4068, + "step": 1300 + }, + { + "epoch": 7.41, + "grad_norm": 3.1525979042053223, + "learning_rate": 1.5067415730337081e-05, + "loss": 0.3847, + "step": 1320 + }, + { + "epoch": 7.52, + "grad_norm": 2.081890106201172, + "learning_rate": 1.4992509363295882e-05, + "loss": 0.4126, + "step": 1340 + }, + { + "epoch": 7.63, + "grad_norm": 2.5701358318328857, + "learning_rate": 1.4917602996254684e-05, + "loss": 0.4065, + "step": 1360 + }, + { + "epoch": 7.74, + "grad_norm": 1.4190051555633545, + "learning_rate": 1.4842696629213485e-05, + "loss": 0.3979, + "step": 1380 + }, + { + "epoch": 7.86, + "grad_norm": 1.9085837602615356, + "learning_rate": 1.4767790262172286e-05, + "loss": 0.3894, + "step": 1400 + }, + { + "epoch": 7.97, + "grad_norm": 1.7573003768920898, + "learning_rate": 1.4692883895131088e-05, + "loss": 0.3751, + "step": 1420 + }, + { + "epoch": 8.08, + "grad_norm": 1.8974506855010986, + "learning_rate": 1.4617977528089889e-05, + "loss": 0.3936, + "step": 1440 + }, + { + "epoch": 8.19, + "grad_norm": 1.3843660354614258, + "learning_rate": 1.454307116104869e-05, + "loss": 0.3848, + "step": 1460 + }, + { + "epoch": 8.31, + "grad_norm": 1.525007724761963, + "learning_rate": 1.4468164794007492e-05, + "loss": 0.3552, + "step": 1480 + }, + { + "epoch": 8.42, + "grad_norm": 2.1665101051330566, + "learning_rate": 1.4393258426966291e-05, + "loss": 0.3547, + "step": 1500 + }, + { + "epoch": 8.53, + "grad_norm": 3.3614535331726074, + "learning_rate": 
1.4318352059925096e-05, + "loss": 0.3771, + "step": 1520 + }, + { + "epoch": 8.64, + "grad_norm": 1.746299386024475, + "learning_rate": 1.4243445692883898e-05, + "loss": 0.396, + "step": 1540 + }, + { + "epoch": 8.75, + "grad_norm": 1.9144684076309204, + "learning_rate": 1.4168539325842699e-05, + "loss": 0.3748, + "step": 1560 + }, + { + "epoch": 8.87, + "grad_norm": 1.9617277383804321, + "learning_rate": 1.40936329588015e-05, + "loss": 0.3504, + "step": 1580 + }, + { + "epoch": 8.98, + "grad_norm": 2.69067645072937, + "learning_rate": 1.4018726591760302e-05, + "loss": 0.3477, + "step": 1600 + }, + { + "epoch": 9.09, + "grad_norm": 2.142008066177368, + "learning_rate": 1.3943820224719103e-05, + "loss": 0.3539, + "step": 1620 + }, + { + "epoch": 9.2, + "grad_norm": 1.7684266567230225, + "learning_rate": 1.3868913857677904e-05, + "loss": 0.3576, + "step": 1640 + }, + { + "epoch": 9.32, + "grad_norm": 1.4222275018692017, + "learning_rate": 1.3794007490636706e-05, + "loss": 0.3839, + "step": 1660 + }, + { + "epoch": 9.43, + "grad_norm": 2.0622501373291016, + "learning_rate": 1.3719101123595507e-05, + "loss": 0.3278, + "step": 1680 + }, + { + "epoch": 9.54, + "grad_norm": 1.639147400856018, + "learning_rate": 1.3644194756554308e-05, + "loss": 0.3374, + "step": 1700 + }, + { + "epoch": 9.65, + "grad_norm": 2.093045473098755, + "learning_rate": 1.356928838951311e-05, + "loss": 0.3535, + "step": 1720 + }, + { + "epoch": 9.76, + "grad_norm": 1.3492937088012695, + "learning_rate": 1.3494382022471911e-05, + "loss": 0.3105, + "step": 1740 + }, + { + "epoch": 9.88, + "grad_norm": 1.585205316543579, + "learning_rate": 1.3419475655430714e-05, + "loss": 0.3181, + "step": 1760 + }, + { + "epoch": 9.99, + "grad_norm": 2.8895344734191895, + "learning_rate": 1.3344569288389515e-05, + "loss": 0.3473, + "step": 1780 + }, + { + "epoch": 10.1, + "grad_norm": 1.7224748134613037, + "learning_rate": 1.3269662921348317e-05, + "loss": 0.3524, + "step": 1800 + }, + { + "epoch": 10.21, + "grad_norm": 2.1029868125915527, + "learning_rate": 1.3194756554307118e-05, + "loss": 0.3408, + "step": 1820 + }, + { + "epoch": 10.33, + "grad_norm": 2.434016227722168, + "learning_rate": 1.311985018726592e-05, + "loss": 0.3266, + "step": 1840 + }, + { + "epoch": 10.44, + "grad_norm": 1.953553318977356, + "learning_rate": 1.304494382022472e-05, + "loss": 0.2844, + "step": 1860 + }, + { + "epoch": 10.55, + "grad_norm": 2.5946218967437744, + "learning_rate": 1.2970037453183522e-05, + "loss": 0.3225, + "step": 1880 + }, + { + "epoch": 10.66, + "grad_norm": 2.5305733680725098, + "learning_rate": 1.2895131086142323e-05, + "loss": 0.3183, + "step": 1900 + }, + { + "epoch": 10.78, + "grad_norm": 3.56726336479187, + "learning_rate": 1.2820224719101125e-05, + "loss": 0.2944, + "step": 1920 + }, + { + "epoch": 10.89, + "grad_norm": 1.9687740802764893, + "learning_rate": 1.2745318352059926e-05, + "loss": 0.3411, + "step": 1940 + }, + { + "epoch": 11.0, + "grad_norm": 1.6027730703353882, + "learning_rate": 1.2670411985018727e-05, + "loss": 0.2949, + "step": 1960 + }, + { + "epoch": 11.11, + "grad_norm": 1.8739397525787354, + "learning_rate": 1.2595505617977529e-05, + "loss": 0.2716, + "step": 1980 + }, + { + "epoch": 11.22, + "grad_norm": 1.6741198301315308, + "learning_rate": 1.2520599250936332e-05, + "loss": 0.3334, + "step": 2000 + }, + { + "epoch": 11.34, + "grad_norm": 1.950945496559143, + "learning_rate": 1.2445692883895133e-05, + "loss": 0.3291, + "step": 2020 + }, + { + "epoch": 11.45, + "grad_norm": 1.9362170696258545, + "learning_rate": 
1.2370786516853935e-05, + "loss": 0.2716, + "step": 2040 + }, + { + "epoch": 11.56, + "grad_norm": 1.6201746463775635, + "learning_rate": 1.2295880149812736e-05, + "loss": 0.2893, + "step": 2060 + }, + { + "epoch": 11.67, + "grad_norm": 3.488088607788086, + "learning_rate": 1.2220973782771537e-05, + "loss": 0.3239, + "step": 2080 + }, + { + "epoch": 11.79, + "grad_norm": 2.4608683586120605, + "learning_rate": 1.2146067415730339e-05, + "loss": 0.271, + "step": 2100 + }, + { + "epoch": 11.9, + "grad_norm": 1.5321098566055298, + "learning_rate": 1.207116104868914e-05, + "loss": 0.2876, + "step": 2120 + }, + { + "epoch": 12.01, + "grad_norm": 1.8334771394729614, + "learning_rate": 1.1996254681647941e-05, + "loss": 0.3066, + "step": 2140 + }, + { + "epoch": 12.12, + "grad_norm": 1.9506254196166992, + "learning_rate": 1.1921348314606743e-05, + "loss": 0.3023, + "step": 2160 + }, + { + "epoch": 12.23, + "grad_norm": 2.9073598384857178, + "learning_rate": 1.1846441947565544e-05, + "loss": 0.3152, + "step": 2180 + }, + { + "epoch": 12.35, + "grad_norm": 1.6023261547088623, + "learning_rate": 1.1771535580524345e-05, + "loss": 0.248, + "step": 2200 + }, + { + "epoch": 12.46, + "grad_norm": 1.7954633235931396, + "learning_rate": 1.1696629213483147e-05, + "loss": 0.2666, + "step": 2220 + }, + { + "epoch": 12.57, + "grad_norm": 2.0331828594207764, + "learning_rate": 1.162172284644195e-05, + "loss": 0.2878, + "step": 2240 + }, + { + "epoch": 12.68, + "grad_norm": 1.656420350074768, + "learning_rate": 1.1546816479400751e-05, + "loss": 0.2805, + "step": 2260 + }, + { + "epoch": 12.8, + "grad_norm": 1.5245873928070068, + "learning_rate": 1.1471910112359552e-05, + "loss": 0.2792, + "step": 2280 + }, + { + "epoch": 12.91, + "grad_norm": 2.6713974475860596, + "learning_rate": 1.1397003745318354e-05, + "loss": 0.2841, + "step": 2300 + }, + { + "epoch": 13.02, + "grad_norm": 1.268479347229004, + "learning_rate": 1.1322097378277155e-05, + "loss": 0.2708, + "step": 2320 + }, + { + "epoch": 13.13, + "grad_norm": 2.2990434169769287, + "learning_rate": 1.1247191011235956e-05, + "loss": 0.2649, + "step": 2340 + }, + { + "epoch": 13.24, + "grad_norm": 2.351956367492676, + "learning_rate": 1.1172284644194758e-05, + "loss": 0.281, + "step": 2360 + }, + { + "epoch": 13.36, + "grad_norm": 1.796783208847046, + "learning_rate": 1.1097378277153559e-05, + "loss": 0.2725, + "step": 2380 + }, + { + "epoch": 13.47, + "grad_norm": 1.7035847902297974, + "learning_rate": 1.102247191011236e-05, + "loss": 0.2799, + "step": 2400 + }, + { + "epoch": 13.58, + "grad_norm": 2.0395431518554688, + "learning_rate": 1.0947565543071162e-05, + "loss": 0.239, + "step": 2420 + }, + { + "epoch": 13.69, + "grad_norm": 1.8008232116699219, + "learning_rate": 1.0872659176029963e-05, + "loss": 0.2553, + "step": 2440 + }, + { + "epoch": 13.81, + "grad_norm": 2.0559043884277344, + "learning_rate": 1.0797752808988765e-05, + "loss": 0.2464, + "step": 2460 + }, + { + "epoch": 13.92, + "grad_norm": 1.8673292398452759, + "learning_rate": 1.0722846441947568e-05, + "loss": 0.2699, + "step": 2480 + }, + { + "epoch": 14.03, + "grad_norm": 1.6819398403167725, + "learning_rate": 1.0647940074906369e-05, + "loss": 0.2566, + "step": 2500 + }, + { + "epoch": 14.14, + "grad_norm": 1.9703686237335205, + "learning_rate": 1.057303370786517e-05, + "loss": 0.2807, + "step": 2520 + }, + { + "epoch": 14.25, + "grad_norm": 2.028834819793701, + "learning_rate": 1.0498127340823972e-05, + "loss": 0.2392, + "step": 2540 + }, + { + "epoch": 14.37, + "grad_norm": 2.2455177307128906, + 
"learning_rate": 1.0423220973782773e-05, + "loss": 0.247, + "step": 2560 + }, + { + "epoch": 14.48, + "grad_norm": 1.8078291416168213, + "learning_rate": 1.0348314606741574e-05, + "loss": 0.2552, + "step": 2580 + }, + { + "epoch": 14.59, + "grad_norm": 2.166729211807251, + "learning_rate": 1.0273408239700376e-05, + "loss": 0.2466, + "step": 2600 + }, + { + "epoch": 14.7, + "grad_norm": 2.710556745529175, + "learning_rate": 1.0198501872659177e-05, + "loss": 0.2506, + "step": 2620 + }, + { + "epoch": 14.82, + "grad_norm": 2.1344659328460693, + "learning_rate": 1.0123595505617978e-05, + "loss": 0.2388, + "step": 2640 + }, + { + "epoch": 14.93, + "grad_norm": 1.595842719078064, + "learning_rate": 1.004868913857678e-05, + "loss": 0.2553, + "step": 2660 + }, + { + "epoch": 15.04, + "grad_norm": 1.5458731651306152, + "learning_rate": 9.973782771535581e-06, + "loss": 0.2478, + "step": 2680 + }, + { + "epoch": 15.15, + "grad_norm": 1.9514356851577759, + "learning_rate": 9.898876404494382e-06, + "loss": 0.234, + "step": 2700 + }, + { + "epoch": 15.26, + "grad_norm": 2.1551694869995117, + "learning_rate": 9.823970037453184e-06, + "loss": 0.251, + "step": 2720 + }, + { + "epoch": 15.38, + "grad_norm": 2.08258318901062, + "learning_rate": 9.749063670411985e-06, + "loss": 0.2511, + "step": 2740 + }, + { + "epoch": 15.49, + "grad_norm": 1.581690788269043, + "learning_rate": 9.674157303370786e-06, + "loss": 0.2185, + "step": 2760 + }, + { + "epoch": 15.6, + "grad_norm": 2.2121975421905518, + "learning_rate": 9.599250936329588e-06, + "loss": 0.2161, + "step": 2780 + }, + { + "epoch": 15.71, + "grad_norm": 1.5077215433120728, + "learning_rate": 9.52434456928839e-06, + "loss": 0.2308, + "step": 2800 + }, + { + "epoch": 15.83, + "grad_norm": 2.57951021194458, + "learning_rate": 9.449438202247192e-06, + "loss": 0.2299, + "step": 2820 + }, + { + "epoch": 15.94, + "grad_norm": 1.6634414196014404, + "learning_rate": 9.374531835205993e-06, + "loss": 0.2576, + "step": 2840 + }, + { + "epoch": 16.05, + "grad_norm": 1.9692113399505615, + "learning_rate": 9.299625468164795e-06, + "loss": 0.2395, + "step": 2860 + }, + { + "epoch": 16.16, + "grad_norm": 1.9327415227890015, + "learning_rate": 9.224719101123596e-06, + "loss": 0.241, + "step": 2880 + }, + { + "epoch": 16.27, + "grad_norm": 1.7675727605819702, + "learning_rate": 9.149812734082398e-06, + "loss": 0.2201, + "step": 2900 + }, + { + "epoch": 16.39, + "grad_norm": 1.9511345624923706, + "learning_rate": 9.074906367041199e-06, + "loss": 0.2171, + "step": 2920 + }, + { + "epoch": 16.5, + "grad_norm": 1.7937383651733398, + "learning_rate": 9e-06, + "loss": 0.2286, + "step": 2940 + }, + { + "epoch": 16.61, + "grad_norm": 1.79076087474823, + "learning_rate": 8.925093632958802e-06, + "loss": 0.2479, + "step": 2960 + }, + { + "epoch": 16.72, + "grad_norm": 2.4045145511627197, + "learning_rate": 8.850187265917603e-06, + "loss": 0.2153, + "step": 2980 + }, + { + "epoch": 16.84, + "grad_norm": 2.1934499740600586, + "learning_rate": 8.775280898876404e-06, + "loss": 0.2361, + "step": 3000 + }, + { + "epoch": 16.95, + "grad_norm": 1.923170804977417, + "learning_rate": 8.700374531835206e-06, + "loss": 0.2146, + "step": 3020 + }, + { + "epoch": 17.06, + "grad_norm": 2.1610753536224365, + "learning_rate": 8.625468164794009e-06, + "loss": 0.2281, + "step": 3040 + }, + { + "epoch": 17.17, + "grad_norm": 2.1105706691741943, + "learning_rate": 8.55056179775281e-06, + "loss": 0.2403, + "step": 3060 + }, + { + "epoch": 17.29, + "grad_norm": 1.979177474975586, + "learning_rate": 
8.475655430711611e-06, + "loss": 0.1734, + "step": 3080 + }, + { + "epoch": 17.4, + "grad_norm": 2.040055274963379, + "learning_rate": 8.400749063670413e-06, + "loss": 0.2393, + "step": 3100 + }, + { + "epoch": 17.51, + "grad_norm": 1.8687106370925903, + "learning_rate": 8.325842696629214e-06, + "loss": 0.2346, + "step": 3120 + }, + { + "epoch": 17.62, + "grad_norm": 1.7447230815887451, + "learning_rate": 8.250936329588015e-06, + "loss": 0.2279, + "step": 3140 + }, + { + "epoch": 17.73, + "grad_norm": 2.9035825729370117, + "learning_rate": 8.176029962546818e-06, + "loss": 0.2049, + "step": 3160 + }, + { + "epoch": 17.85, + "grad_norm": 2.1024608612060547, + "learning_rate": 8.101123595505618e-06, + "loss": 0.1962, + "step": 3180 + }, + { + "epoch": 17.96, + "grad_norm": 2.7913131713867188, + "learning_rate": 8.02621722846442e-06, + "loss": 0.2081, + "step": 3200 + }, + { + "epoch": 18.07, + "grad_norm": 2.0668814182281494, + "learning_rate": 7.95131086142322e-06, + "loss": 0.2304, + "step": 3220 + }, + { + "epoch": 18.18, + "grad_norm": 1.7872204780578613, + "learning_rate": 7.876404494382022e-06, + "loss": 0.1804, + "step": 3240 + }, + { + "epoch": 18.3, + "grad_norm": 2.0718905925750732, + "learning_rate": 7.801498127340823e-06, + "loss": 0.2232, + "step": 3260 + }, + { + "epoch": 18.41, + "grad_norm": 3.835952043533325, + "learning_rate": 7.726591760299626e-06, + "loss": 0.2171, + "step": 3280 + }, + { + "epoch": 18.52, + "grad_norm": 1.5925731658935547, + "learning_rate": 7.651685393258428e-06, + "loss": 0.1999, + "step": 3300 + }, + { + "epoch": 18.63, + "grad_norm": 2.434159994125366, + "learning_rate": 7.576779026217229e-06, + "loss": 0.1876, + "step": 3320 + }, + { + "epoch": 18.74, + "grad_norm": 2.3486499786376953, + "learning_rate": 7.5018726591760305e-06, + "loss": 0.21, + "step": 3340 + }, + { + "epoch": 18.86, + "grad_norm": 1.4824186563491821, + "learning_rate": 7.426966292134832e-06, + "loss": 0.2239, + "step": 3360 + }, + { + "epoch": 18.97, + "grad_norm": 2.062422275543213, + "learning_rate": 7.352059925093633e-06, + "loss": 0.22, + "step": 3380 + }, + { + "epoch": 19.08, + "grad_norm": 2.0563416481018066, + "learning_rate": 7.277153558052435e-06, + "loss": 0.1945, + "step": 3400 + }, + { + "epoch": 19.19, + "grad_norm": 1.6936135292053223, + "learning_rate": 7.202247191011237e-06, + "loss": 0.217, + "step": 3420 + }, + { + "epoch": 19.31, + "grad_norm": 1.9931917190551758, + "learning_rate": 7.127340823970038e-06, + "loss": 0.2127, + "step": 3440 + }, + { + "epoch": 19.42, + "grad_norm": 1.5989198684692383, + "learning_rate": 7.0524344569288395e-06, + "loss": 0.1849, + "step": 3460 + }, + { + "epoch": 19.53, + "grad_norm": 2.0073723793029785, + "learning_rate": 6.977528089887641e-06, + "loss": 0.1805, + "step": 3480 + }, + { + "epoch": 19.64, + "grad_norm": 1.9756735563278198, + "learning_rate": 6.902621722846442e-06, + "loss": 0.1963, + "step": 3500 + }, + { + "epoch": 19.75, + "grad_norm": 1.5112028121948242, + "learning_rate": 6.827715355805244e-06, + "loss": 0.2008, + "step": 3520 + }, + { + "epoch": 19.87, + "grad_norm": 2.2792975902557373, + "learning_rate": 6.752808988764046e-06, + "loss": 0.2106, + "step": 3540 + }, + { + "epoch": 19.98, + "grad_norm": 2.768470048904419, + "learning_rate": 6.677902621722847e-06, + "loss": 0.2066, + "step": 3560 + }, + { + "epoch": 20.09, + "grad_norm": 1.6916066408157349, + "learning_rate": 6.602996254681648e-06, + "loss": 0.1912, + "step": 3580 + }, + { + "epoch": 20.2, + "grad_norm": 1.7649778127670288, + "learning_rate": 
6.52808988764045e-06, + "loss": 0.2027, + "step": 3600 + }, + { + "epoch": 20.32, + "grad_norm": 1.9743694067001343, + "learning_rate": 6.453183520599251e-06, + "loss": 0.2208, + "step": 3620 + }, + { + "epoch": 20.43, + "grad_norm": 1.827344298362732, + "learning_rate": 6.378277153558053e-06, + "loss": 0.1757, + "step": 3640 + }, + { + "epoch": 20.54, + "grad_norm": 2.7847957611083984, + "learning_rate": 6.303370786516855e-06, + "loss": 0.1931, + "step": 3660 + }, + { + "epoch": 20.65, + "grad_norm": 1.8572605848312378, + "learning_rate": 6.228464419475656e-06, + "loss": 0.1902, + "step": 3680 + }, + { + "epoch": 20.76, + "grad_norm": 1.5343818664550781, + "learning_rate": 6.153558052434457e-06, + "loss": 0.1916, + "step": 3700 + }, + { + "epoch": 20.88, + "grad_norm": 1.703688383102417, + "learning_rate": 6.078651685393259e-06, + "loss": 0.1897, + "step": 3720 + }, + { + "epoch": 20.99, + "grad_norm": 2.5442187786102295, + "learning_rate": 6.00374531835206e-06, + "loss": 0.1859, + "step": 3740 + }, + { + "epoch": 21.1, + "grad_norm": 2.0333402156829834, + "learning_rate": 5.928838951310862e-06, + "loss": 0.1632, + "step": 3760 + }, + { + "epoch": 21.21, + "grad_norm": 2.107227087020874, + "learning_rate": 5.8539325842696635e-06, + "loss": 0.2031, + "step": 3780 + }, + { + "epoch": 21.33, + "grad_norm": 2.0351223945617676, + "learning_rate": 5.779026217228465e-06, + "loss": 0.1759, + "step": 3800 + }, + { + "epoch": 21.44, + "grad_norm": 2.1328284740448, + "learning_rate": 5.704119850187266e-06, + "loss": 0.1853, + "step": 3820 + }, + { + "epoch": 21.55, + "grad_norm": 2.0145580768585205, + "learning_rate": 5.629213483146068e-06, + "loss": 0.1919, + "step": 3840 + }, + { + "epoch": 21.66, + "grad_norm": 1.8794372081756592, + "learning_rate": 5.554307116104869e-06, + "loss": 0.1958, + "step": 3860 + }, + { + "epoch": 21.77, + "grad_norm": 1.8487616777420044, + "learning_rate": 5.479400749063671e-06, + "loss": 0.207, + "step": 3880 + }, + { + "epoch": 21.89, + "grad_norm": 2.080965042114258, + "learning_rate": 5.4044943820224725e-06, + "loss": 0.1715, + "step": 3900 + }, + { + "epoch": 22.0, + "grad_norm": 2.0303232669830322, + "learning_rate": 5.329588014981274e-06, + "loss": 0.1873, + "step": 3920 + }, + { + "epoch": 22.11, + "grad_norm": 2.1078438758850098, + "learning_rate": 5.254681647940075e-06, + "loss": 0.1869, + "step": 3940 + }, + { + "epoch": 22.22, + "grad_norm": 1.8502501249313354, + "learning_rate": 5.1797752808988765e-06, + "loss": 0.1983, + "step": 3960 + }, + { + "epoch": 22.34, + "grad_norm": 2.209162950515747, + "learning_rate": 5.104868913857678e-06, + "loss": 0.185, + "step": 3980 + }, + { + "epoch": 22.45, + "grad_norm": 1.9525928497314453, + "learning_rate": 5.02996254681648e-06, + "loss": 0.1943, + "step": 4000 + }, + { + "epoch": 22.56, + "grad_norm": 3.0655415058135986, + "learning_rate": 4.955056179775281e-06, + "loss": 0.1738, + "step": 4020 + }, + { + "epoch": 22.67, + "grad_norm": 1.4932396411895752, + "learning_rate": 4.880149812734083e-06, + "loss": 0.1775, + "step": 4040 + }, + { + "epoch": 22.78, + "grad_norm": 2.076929807662964, + "learning_rate": 4.805243445692884e-06, + "loss": 0.1834, + "step": 4060 + }, + { + "epoch": 22.9, + "grad_norm": 1.8913358449935913, + "learning_rate": 4.7303370786516854e-06, + "loss": 0.1601, + "step": 4080 + }, + { + "epoch": 23.01, + "grad_norm": 1.9011529684066772, + "learning_rate": 4.655430711610488e-06, + "loss": 0.1812, + "step": 4100 + }, + { + "epoch": 23.12, + "grad_norm": 1.6414529085159302, + "learning_rate": 
4.580524344569289e-06, + "loss": 0.1716, + "step": 4120 + }, + { + "epoch": 23.23, + "grad_norm": 3.7525336742401123, + "learning_rate": 4.50561797752809e-06, + "loss": 0.1795, + "step": 4140 + }, + { + "epoch": 23.35, + "grad_norm": 1.912279486656189, + "learning_rate": 4.430711610486892e-06, + "loss": 0.1906, + "step": 4160 + }, + { + "epoch": 23.46, + "grad_norm": 1.9044945240020752, + "learning_rate": 4.355805243445693e-06, + "loss": 0.1614, + "step": 4180 + }, + { + "epoch": 23.57, + "grad_norm": 2.0123000144958496, + "learning_rate": 4.280898876404494e-06, + "loss": 0.1852, + "step": 4200 + }, + { + "epoch": 23.68, + "grad_norm": 1.8307185173034668, + "learning_rate": 4.2059925093632965e-06, + "loss": 0.1785, + "step": 4220 + }, + { + "epoch": 23.8, + "grad_norm": 3.2872562408447266, + "learning_rate": 4.131086142322098e-06, + "loss": 0.17, + "step": 4240 + }, + { + "epoch": 23.91, + "grad_norm": 1.623343825340271, + "learning_rate": 4.056179775280899e-06, + "loss": 0.1774, + "step": 4260 + }, + { + "epoch": 24.02, + "grad_norm": 1.9367610216140747, + "learning_rate": 3.981273408239701e-06, + "loss": 0.1809, + "step": 4280 + }, + { + "epoch": 24.13, + "grad_norm": 2.030416250228882, + "learning_rate": 3.906367041198502e-06, + "loss": 0.177, + "step": 4300 + }, + { + "epoch": 24.24, + "grad_norm": 2.2556800842285156, + "learning_rate": 3.831460674157303e-06, + "loss": 0.1846, + "step": 4320 + }, + { + "epoch": 24.36, + "grad_norm": 2.8906972408294678, + "learning_rate": 3.7565543071161055e-06, + "loss": 0.1837, + "step": 4340 + }, + { + "epoch": 24.47, + "grad_norm": 1.7102136611938477, + "learning_rate": 3.681647940074907e-06, + "loss": 0.1684, + "step": 4360 + }, + { + "epoch": 24.58, + "grad_norm": 1.753233551979065, + "learning_rate": 3.606741573033708e-06, + "loss": 0.1667, + "step": 4380 + }, + { + "epoch": 24.69, + "grad_norm": 1.9974724054336548, + "learning_rate": 3.53183520599251e-06, + "loss": 0.1732, + "step": 4400 + }, + { + "epoch": 24.81, + "grad_norm": 1.5714670419692993, + "learning_rate": 3.4569288389513113e-06, + "loss": 0.1703, + "step": 4420 + }, + { + "epoch": 24.92, + "grad_norm": 2.6254587173461914, + "learning_rate": 3.3820224719101126e-06, + "loss": 0.1603, + "step": 4440 + }, + { + "epoch": 25.03, + "grad_norm": 2.4333715438842773, + "learning_rate": 3.3071161048689144e-06, + "loss": 0.1669, + "step": 4460 + }, + { + "epoch": 25.14, + "grad_norm": 2.230980157852173, + "learning_rate": 3.2322097378277157e-06, + "loss": 0.1655, + "step": 4480 + }, + { + "epoch": 25.25, + "grad_norm": 1.845352292060852, + "learning_rate": 3.157303370786517e-06, + "loss": 0.1942, + "step": 4500 + } + ], + "logging_steps": 20, + "max_steps": 5340, + "num_input_tokens_seen": 0, + "num_train_epochs": 30, + "save_steps": 500, + "total_flos": 1.16960750419968e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4500/training_args.bin b/checkpoint-4500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7c33ce269deb9707d9fc7879dba5d8878e16b09a --- /dev/null +++ b/checkpoint-4500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f06c12b5819a4aa76501d6843eae3aabbe49b1a33a2903a44bc34146ab4a74b6 +size 4920 diff --git a/checkpoint-500/README.md b/checkpoint-500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..eb638a279dc16df665e7cd491c7ffb8571ba6cd7 --- /dev/null +++ b/checkpoint-500/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft 
+base_model: sallywww/Llama-7B +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.1.dev0 \ No newline at end of file diff --git a/checkpoint-500/adapter_config.json b/checkpoint-500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5c344d276888d0aeec2576a19010354f5a57d9c9 --- /dev/null +++ b/checkpoint-500/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "sallywww/Llama-7B", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-500/adapter_model.safetensors b/checkpoint-500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..71cb9c44d2509feb5b1444af5430bc78d8b77982 --- /dev/null +++ b/checkpoint-500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f599cbd8e5418f6b5249e3394bbcd74548940a5c18e341889da9675b3fd5dbe +size 16794200 diff --git a/checkpoint-500/optimizer.pt b/checkpoint-500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a25a9ab2ba1f341ebc9465783b8d88da45e02adc --- /dev/null +++ b/checkpoint-500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f28ffeb5f73ec34e7d2d6dc8e4c8139efff1bcc8e4bad38eeb6431b4db1ad66 +size 33662074 diff --git a/checkpoint-500/rng_state.pth b/checkpoint-500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4c1c55cc859759c919070f3341431df182979ebc --- /dev/null +++ b/checkpoint-500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fd0df77a022fbd90cb5fd2ccc8756cdb01bc22b99face134b422d313348ac5b +size 14244 diff --git a/checkpoint-500/scheduler.pt b/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..338bf1e83b6b3c7e099f79f6b9567d129a48fa8d --- /dev/null +++ b/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f82e86e4196dc00b5636a417ab4d417518e16c61c7dd133e3267e267f2aab4f +size 1064 diff --git a/checkpoint-500/trainer_state.json b/checkpoint-500/trainer_state.json new file mode 100644 index 
0000000000000000000000000000000000000000..2c584e4610dfd598bc08d33364cbd28ad63f95c3 --- /dev/null +++ b/checkpoint-500/trainer_state.json @@ -0,0 +1,196 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.806032970887408, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.11, + "grad_norm": 0.5909375548362732, + "learning_rate": 1.9932584269662923e-05, + "loss": 2.0237, + "step": 20 + }, + { + "epoch": 0.22, + "grad_norm": 0.5826025009155273, + "learning_rate": 1.9857677902621722e-05, + "loss": 1.9306, + "step": 40 + }, + { + "epoch": 0.34, + "grad_norm": 0.5491089820861816, + "learning_rate": 1.9782771535580525e-05, + "loss": 1.7959, + "step": 60 + }, + { + "epoch": 0.45, + "grad_norm": 1.362810730934143, + "learning_rate": 1.970786516853933e-05, + "loss": 1.6599, + "step": 80 + }, + { + "epoch": 0.56, + "grad_norm": 1.4427486658096313, + "learning_rate": 1.963295880149813e-05, + "loss": 1.5685, + "step": 100 + }, + { + "epoch": 0.67, + "grad_norm": 0.9993659257888794, + "learning_rate": 1.956179775280899e-05, + "loss": 1.4621, + "step": 120 + }, + { + "epoch": 0.79, + "grad_norm": 1.614562749862671, + "learning_rate": 1.9486891385767793e-05, + "loss": 1.31, + "step": 140 + }, + { + "epoch": 0.9, + "grad_norm": 1.1975798606872559, + "learning_rate": 1.9411985018726593e-05, + "loss": 1.2322, + "step": 160 + }, + { + "epoch": 1.01, + "grad_norm": 0.7684128880500793, + "learning_rate": 1.9337078651685396e-05, + "loss": 1.1361, + "step": 180 + }, + { + "epoch": 1.12, + "grad_norm": 0.9336960911750793, + "learning_rate": 1.9262172284644195e-05, + "loss": 1.0797, + "step": 200 + }, + { + "epoch": 1.23, + "grad_norm": 0.8471770882606506, + "learning_rate": 1.9187265917603e-05, + "loss": 1.0368, + "step": 220 + }, + { + "epoch": 1.35, + "grad_norm": 1.111340045928955, + "learning_rate": 1.9112359550561798e-05, + "loss": 0.9738, + "step": 240 + }, + { + "epoch": 1.46, + "grad_norm": 0.8093781471252441, + "learning_rate": 1.90374531835206e-05, + "loss": 0.9494, + "step": 260 + }, + { + "epoch": 1.57, + "grad_norm": 0.8438062071800232, + "learning_rate": 1.89625468164794e-05, + "loss": 0.9276, + "step": 280 + }, + { + "epoch": 1.68, + "grad_norm": 0.9896701574325562, + "learning_rate": 1.8887640449438204e-05, + "loss": 0.8656, + "step": 300 + }, + { + "epoch": 1.8, + "grad_norm": 0.8278244137763977, + "learning_rate": 1.8812734082397007e-05, + "loss": 0.8431, + "step": 320 + }, + { + "epoch": 1.91, + "grad_norm": 0.931291937828064, + "learning_rate": 1.8737827715355807e-05, + "loss": 0.7945, + "step": 340 + }, + { + "epoch": 2.02, + "grad_norm": 1.21769380569458, + "learning_rate": 1.866292134831461e-05, + "loss": 0.7647, + "step": 360 + }, + { + "epoch": 2.13, + "grad_norm": 3.5183286666870117, + "learning_rate": 1.858801498127341e-05, + "loss": 0.7497, + "step": 380 + }, + { + "epoch": 2.24, + "grad_norm": 1.1153030395507812, + "learning_rate": 1.8513108614232212e-05, + "loss": 0.7507, + "step": 400 + }, + { + "epoch": 2.36, + "grad_norm": 1.0140526294708252, + "learning_rate": 1.8438202247191012e-05, + "loss": 0.7415, + "step": 420 + }, + { + "epoch": 2.47, + "grad_norm": 1.4395232200622559, + "learning_rate": 1.8363295880149815e-05, + "loss": 0.6947, + "step": 440 + }, + { + "epoch": 2.58, + "grad_norm": 1.4253089427947998, + "learning_rate": 1.8288389513108615e-05, + "loss": 0.7429, + "step": 460 + }, + { + "epoch": 2.69, + "grad_norm": 
1.3152351379394531, + "learning_rate": 1.8213483146067418e-05, + "loss": 0.7363, + "step": 480 + }, + { + "epoch": 2.81, + "grad_norm": 2.5935957431793213, + "learning_rate": 1.8138576779026217e-05, + "loss": 0.6486, + "step": 500 + } + ], + "logging_steps": 20, + "max_steps": 5340, + "num_input_tokens_seen": 0, + "num_train_epochs": 30, + "save_steps": 500, + "total_flos": 1.2995638935552e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-500/training_args.bin b/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7c33ce269deb9707d9fc7879dba5d8878e16b09a --- /dev/null +++ b/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f06c12b5819a4aa76501d6843eae3aabbe49b1a33a2903a44bc34146ab4a74b6 +size 4920 diff --git a/checkpoint-5000/README.md b/checkpoint-5000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..eb638a279dc16df665e7cd491c7ffb8571ba6cd7 --- /dev/null +++ b/checkpoint-5000/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: sallywww/Llama-7B +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.1.dev0 \ No newline at end of file diff --git a/checkpoint-5000/adapter_config.json b/checkpoint-5000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5c344d276888d0aeec2576a19010354f5a57d9c9 --- /dev/null +++ b/checkpoint-5000/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "sallywww/Llama-7B", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-5000/adapter_model.safetensors b/checkpoint-5000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a27a6f4f4c6aaa31302537212a27ea0323beb397 --- /dev/null +++ b/checkpoint-5000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df96320995045a8c5cdc856784399965e6ba3c305f219490e78de6d753a1b47f +size 16794200 diff --git a/checkpoint-5000/optimizer.pt b/checkpoint-5000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9bb33d6eae8bd4246b1c7ef72c6ec31019008c3 --- /dev/null +++ b/checkpoint-5000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bd6f97052cad17f108342d983cdf5396ec54014953d8aff0b97d559c50bfd21 +size 33662074 diff --git a/checkpoint-5000/rng_state.pth b/checkpoint-5000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..647e3e40e3ddc6d7ce18014b3f5b0822b8d4eff8 --- /dev/null +++ b/checkpoint-5000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:346ae105c779f584d5a277c6ed3b5d9bcb025fd39495fdda257255c2b7e21021 +size 14244 diff --git a/checkpoint-5000/scheduler.pt b/checkpoint-5000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2480cb906df2c1578064d66d381176a8ded1653a --- /dev/null +++ b/checkpoint-5000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eda1c5de431f8e37cb194431b437d55338ef2ab1bd0f92407ad10131b63c6eba +size 1064 diff --git a/checkpoint-5000/trainer_state.json b/checkpoint-5000/trainer_state.json new file mode 100644 index 
0000000000000000000000000000000000000000..a79e1159d8590cd30e758f718e25f97c0c167ac9 --- /dev/null +++ b/checkpoint-5000/trainer_state.json @@ -0,0 +1,1771 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 28.06032970887408, + "eval_steps": 500, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.11, + "grad_norm": 0.5909375548362732, + "learning_rate": 1.9932584269662923e-05, + "loss": 2.0237, + "step": 20 + }, + { + "epoch": 0.22, + "grad_norm": 0.5826025009155273, + "learning_rate": 1.9857677902621722e-05, + "loss": 1.9306, + "step": 40 + }, + { + "epoch": 0.34, + "grad_norm": 0.5491089820861816, + "learning_rate": 1.9782771535580525e-05, + "loss": 1.7959, + "step": 60 + }, + { + "epoch": 0.45, + "grad_norm": 1.362810730934143, + "learning_rate": 1.970786516853933e-05, + "loss": 1.6599, + "step": 80 + }, + { + "epoch": 0.56, + "grad_norm": 1.4427486658096313, + "learning_rate": 1.963295880149813e-05, + "loss": 1.5685, + "step": 100 + }, + { + "epoch": 0.67, + "grad_norm": 0.9993659257888794, + "learning_rate": 1.956179775280899e-05, + "loss": 1.4621, + "step": 120 + }, + { + "epoch": 0.79, + "grad_norm": 1.614562749862671, + "learning_rate": 1.9486891385767793e-05, + "loss": 1.31, + "step": 140 + }, + { + "epoch": 0.9, + "grad_norm": 1.1975798606872559, + "learning_rate": 1.9411985018726593e-05, + "loss": 1.2322, + "step": 160 + }, + { + "epoch": 1.01, + "grad_norm": 0.7684128880500793, + "learning_rate": 1.9337078651685396e-05, + "loss": 1.1361, + "step": 180 + }, + { + "epoch": 1.12, + "grad_norm": 0.9336960911750793, + "learning_rate": 1.9262172284644195e-05, + "loss": 1.0797, + "step": 200 + }, + { + "epoch": 1.23, + "grad_norm": 0.8471770882606506, + "learning_rate": 1.9187265917603e-05, + "loss": 1.0368, + "step": 220 + }, + { + "epoch": 1.35, + "grad_norm": 1.111340045928955, + "learning_rate": 1.9112359550561798e-05, + "loss": 0.9738, + "step": 240 + }, + { + "epoch": 1.46, + "grad_norm": 0.8093781471252441, + "learning_rate": 1.90374531835206e-05, + "loss": 0.9494, + "step": 260 + }, + { + "epoch": 1.57, + "grad_norm": 0.8438062071800232, + "learning_rate": 1.89625468164794e-05, + "loss": 0.9276, + "step": 280 + }, + { + "epoch": 1.68, + "grad_norm": 0.9896701574325562, + "learning_rate": 1.8887640449438204e-05, + "loss": 0.8656, + "step": 300 + }, + { + "epoch": 1.8, + "grad_norm": 0.8278244137763977, + "learning_rate": 1.8812734082397007e-05, + "loss": 0.8431, + "step": 320 + }, + { + "epoch": 1.91, + "grad_norm": 0.931291937828064, + "learning_rate": 1.8737827715355807e-05, + "loss": 0.7945, + "step": 340 + }, + { + "epoch": 2.02, + "grad_norm": 1.21769380569458, + "learning_rate": 1.866292134831461e-05, + "loss": 0.7647, + "step": 360 + }, + { + "epoch": 2.13, + "grad_norm": 3.5183286666870117, + "learning_rate": 1.858801498127341e-05, + "loss": 0.7497, + "step": 380 + }, + { + "epoch": 2.24, + "grad_norm": 1.1153030395507812, + "learning_rate": 1.8513108614232212e-05, + "loss": 0.7507, + "step": 400 + }, + { + "epoch": 2.36, + "grad_norm": 1.0140526294708252, + "learning_rate": 1.8438202247191012e-05, + "loss": 0.7415, + "step": 420 + }, + { + "epoch": 2.47, + "grad_norm": 1.4395232200622559, + "learning_rate": 1.8363295880149815e-05, + "loss": 0.6947, + "step": 440 + }, + { + "epoch": 2.58, + "grad_norm": 1.4253089427947998, + "learning_rate": 1.8288389513108615e-05, + "loss": 0.7429, + "step": 460 + }, + { + "epoch": 2.69, + "grad_norm": 
1.3152351379394531, + "learning_rate": 1.8213483146067418e-05, + "loss": 0.7363, + "step": 480 + }, + { + "epoch": 2.81, + "grad_norm": 2.5935957431793213, + "learning_rate": 1.8138576779026217e-05, + "loss": 0.6486, + "step": 500 + }, + { + "epoch": 2.92, + "grad_norm": 3.929158926010132, + "learning_rate": 1.806367041198502e-05, + "loss": 0.6395, + "step": 520 + }, + { + "epoch": 3.03, + "grad_norm": 1.7316572666168213, + "learning_rate": 1.7988764044943823e-05, + "loss": 0.664, + "step": 540 + }, + { + "epoch": 3.14, + "grad_norm": 1.3388841152191162, + "learning_rate": 1.7913857677902623e-05, + "loss": 0.6469, + "step": 560 + }, + { + "epoch": 3.25, + "grad_norm": 1.5258549451828003, + "learning_rate": 1.7838951310861426e-05, + "loss": 0.6662, + "step": 580 + }, + { + "epoch": 3.37, + "grad_norm": 1.5486094951629639, + "learning_rate": 1.7764044943820226e-05, + "loss": 0.566, + "step": 600 + }, + { + "epoch": 3.48, + "grad_norm": 1.5657902956008911, + "learning_rate": 1.768913857677903e-05, + "loss": 0.6166, + "step": 620 + }, + { + "epoch": 3.59, + "grad_norm": 1.5971391201019287, + "learning_rate": 1.761423220973783e-05, + "loss": 0.5973, + "step": 640 + }, + { + "epoch": 3.7, + "grad_norm": 1.333030343055725, + "learning_rate": 1.753932584269663e-05, + "loss": 0.6117, + "step": 660 + }, + { + "epoch": 3.82, + "grad_norm": 1.4425445795059204, + "learning_rate": 1.746441947565543e-05, + "loss": 0.5702, + "step": 680 + }, + { + "epoch": 3.93, + "grad_norm": 1.4773032665252686, + "learning_rate": 1.7389513108614234e-05, + "loss": 0.5465, + "step": 700 + }, + { + "epoch": 4.04, + "grad_norm": 1.3328267335891724, + "learning_rate": 1.7314606741573034e-05, + "loss": 0.5379, + "step": 720 + }, + { + "epoch": 4.15, + "grad_norm": 1.6961455345153809, + "learning_rate": 1.7239700374531837e-05, + "loss": 0.5492, + "step": 740 + }, + { + "epoch": 4.27, + "grad_norm": 1.4636189937591553, + "learning_rate": 1.7164794007490637e-05, + "loss": 0.547, + "step": 760 + }, + { + "epoch": 4.38, + "grad_norm": 2.1686649322509766, + "learning_rate": 1.708988764044944e-05, + "loss": 0.5424, + "step": 780 + }, + { + "epoch": 4.49, + "grad_norm": 1.219388723373413, + "learning_rate": 1.7014981273408243e-05, + "loss": 0.5373, + "step": 800 + }, + { + "epoch": 4.6, + "grad_norm": 1.5566452741622925, + "learning_rate": 1.6940074906367042e-05, + "loss": 0.4944, + "step": 820 + }, + { + "epoch": 4.71, + "grad_norm": 1.598917841911316, + "learning_rate": 1.6865168539325845e-05, + "loss": 0.5036, + "step": 840 + }, + { + "epoch": 4.83, + "grad_norm": 1.5281039476394653, + "learning_rate": 1.6790262172284645e-05, + "loss": 0.5215, + "step": 860 + }, + { + "epoch": 4.94, + "grad_norm": 1.7123130559921265, + "learning_rate": 1.6715355805243448e-05, + "loss": 0.5362, + "step": 880 + }, + { + "epoch": 5.05, + "grad_norm": 1.543447732925415, + "learning_rate": 1.6640449438202248e-05, + "loss": 0.5379, + "step": 900 + }, + { + "epoch": 5.16, + "grad_norm": 2.4190192222595215, + "learning_rate": 1.656554307116105e-05, + "loss": 0.4921, + "step": 920 + }, + { + "epoch": 5.28, + "grad_norm": 2.190906047821045, + "learning_rate": 1.649063670411985e-05, + "loss": 0.4652, + "step": 940 + }, + { + "epoch": 5.39, + "grad_norm": 2.113476514816284, + "learning_rate": 1.6415730337078653e-05, + "loss": 0.4914, + "step": 960 + }, + { + "epoch": 5.5, + "grad_norm": 1.8785656690597534, + "learning_rate": 1.6340823970037453e-05, + "loss": 0.5135, + "step": 980 + }, + { + "epoch": 5.61, + "grad_norm": 1.3745977878570557, + "learning_rate": 
1.6265917602996256e-05, + "loss": 0.4697, + "step": 1000 + }, + { + "epoch": 5.72, + "grad_norm": 1.7874308824539185, + "learning_rate": 1.6191011235955056e-05, + "loss": 0.4625, + "step": 1020 + }, + { + "epoch": 5.84, + "grad_norm": 1.4448940753936768, + "learning_rate": 1.611610486891386e-05, + "loss": 0.4764, + "step": 1040 + }, + { + "epoch": 5.95, + "grad_norm": 2.278655767440796, + "learning_rate": 1.6041198501872662e-05, + "loss": 0.4221, + "step": 1060 + }, + { + "epoch": 6.06, + "grad_norm": 1.8602409362792969, + "learning_rate": 1.596629213483146e-05, + "loss": 0.4731, + "step": 1080 + }, + { + "epoch": 6.17, + "grad_norm": 1.884373426437378, + "learning_rate": 1.5891385767790265e-05, + "loss": 0.4241, + "step": 1100 + }, + { + "epoch": 6.29, + "grad_norm": 2.0259287357330322, + "learning_rate": 1.5816479400749064e-05, + "loss": 0.4368, + "step": 1120 + }, + { + "epoch": 6.4, + "grad_norm": 1.812462329864502, + "learning_rate": 1.5741573033707867e-05, + "loss": 0.442, + "step": 1140 + }, + { + "epoch": 6.51, + "grad_norm": 1.934327483177185, + "learning_rate": 1.5666666666666667e-05, + "loss": 0.4195, + "step": 1160 + }, + { + "epoch": 6.62, + "grad_norm": 1.6152955293655396, + "learning_rate": 1.559176029962547e-05, + "loss": 0.4374, + "step": 1180 + }, + { + "epoch": 6.73, + "grad_norm": 2.7782068252563477, + "learning_rate": 1.551685393258427e-05, + "loss": 0.4231, + "step": 1200 + }, + { + "epoch": 6.85, + "grad_norm": 2.372976303100586, + "learning_rate": 1.5441947565543073e-05, + "loss": 0.444, + "step": 1220 + }, + { + "epoch": 6.96, + "grad_norm": 2.171353816986084, + "learning_rate": 1.5367041198501872e-05, + "loss": 0.4389, + "step": 1240 + }, + { + "epoch": 7.07, + "grad_norm": 1.3093984127044678, + "learning_rate": 1.5292134831460675e-05, + "loss": 0.4301, + "step": 1260 + }, + { + "epoch": 7.18, + "grad_norm": 2.267932176589966, + "learning_rate": 1.5217228464419478e-05, + "loss": 0.4046, + "step": 1280 + }, + { + "epoch": 7.3, + "grad_norm": 1.5326164960861206, + "learning_rate": 1.514232209737828e-05, + "loss": 0.4068, + "step": 1300 + }, + { + "epoch": 7.41, + "grad_norm": 3.1525979042053223, + "learning_rate": 1.5067415730337081e-05, + "loss": 0.3847, + "step": 1320 + }, + { + "epoch": 7.52, + "grad_norm": 2.081890106201172, + "learning_rate": 1.4992509363295882e-05, + "loss": 0.4126, + "step": 1340 + }, + { + "epoch": 7.63, + "grad_norm": 2.5701358318328857, + "learning_rate": 1.4917602996254684e-05, + "loss": 0.4065, + "step": 1360 + }, + { + "epoch": 7.74, + "grad_norm": 1.4190051555633545, + "learning_rate": 1.4842696629213485e-05, + "loss": 0.3979, + "step": 1380 + }, + { + "epoch": 7.86, + "grad_norm": 1.9085837602615356, + "learning_rate": 1.4767790262172286e-05, + "loss": 0.3894, + "step": 1400 + }, + { + "epoch": 7.97, + "grad_norm": 1.7573003768920898, + "learning_rate": 1.4692883895131088e-05, + "loss": 0.3751, + "step": 1420 + }, + { + "epoch": 8.08, + "grad_norm": 1.8974506855010986, + "learning_rate": 1.4617977528089889e-05, + "loss": 0.3936, + "step": 1440 + }, + { + "epoch": 8.19, + "grad_norm": 1.3843660354614258, + "learning_rate": 1.454307116104869e-05, + "loss": 0.3848, + "step": 1460 + }, + { + "epoch": 8.31, + "grad_norm": 1.525007724761963, + "learning_rate": 1.4468164794007492e-05, + "loss": 0.3552, + "step": 1480 + }, + { + "epoch": 8.42, + "grad_norm": 2.1665101051330566, + "learning_rate": 1.4393258426966291e-05, + "loss": 0.3547, + "step": 1500 + }, + { + "epoch": 8.53, + "grad_norm": 3.3614535331726074, + "learning_rate": 
1.4318352059925096e-05, + "loss": 0.3771, + "step": 1520 + }, + { + "epoch": 8.64, + "grad_norm": 1.746299386024475, + "learning_rate": 1.4243445692883898e-05, + "loss": 0.396, + "step": 1540 + }, + { + "epoch": 8.75, + "grad_norm": 1.9144684076309204, + "learning_rate": 1.4168539325842699e-05, + "loss": 0.3748, + "step": 1560 + }, + { + "epoch": 8.87, + "grad_norm": 1.9617277383804321, + "learning_rate": 1.40936329588015e-05, + "loss": 0.3504, + "step": 1580 + }, + { + "epoch": 8.98, + "grad_norm": 2.69067645072937, + "learning_rate": 1.4018726591760302e-05, + "loss": 0.3477, + "step": 1600 + }, + { + "epoch": 9.09, + "grad_norm": 2.142008066177368, + "learning_rate": 1.3943820224719103e-05, + "loss": 0.3539, + "step": 1620 + }, + { + "epoch": 9.2, + "grad_norm": 1.7684266567230225, + "learning_rate": 1.3868913857677904e-05, + "loss": 0.3576, + "step": 1640 + }, + { + "epoch": 9.32, + "grad_norm": 1.4222275018692017, + "learning_rate": 1.3794007490636706e-05, + "loss": 0.3839, + "step": 1660 + }, + { + "epoch": 9.43, + "grad_norm": 2.0622501373291016, + "learning_rate": 1.3719101123595507e-05, + "loss": 0.3278, + "step": 1680 + }, + { + "epoch": 9.54, + "grad_norm": 1.639147400856018, + "learning_rate": 1.3644194756554308e-05, + "loss": 0.3374, + "step": 1700 + }, + { + "epoch": 9.65, + "grad_norm": 2.093045473098755, + "learning_rate": 1.356928838951311e-05, + "loss": 0.3535, + "step": 1720 + }, + { + "epoch": 9.76, + "grad_norm": 1.3492937088012695, + "learning_rate": 1.3494382022471911e-05, + "loss": 0.3105, + "step": 1740 + }, + { + "epoch": 9.88, + "grad_norm": 1.585205316543579, + "learning_rate": 1.3419475655430714e-05, + "loss": 0.3181, + "step": 1760 + }, + { + "epoch": 9.99, + "grad_norm": 2.8895344734191895, + "learning_rate": 1.3344569288389515e-05, + "loss": 0.3473, + "step": 1780 + }, + { + "epoch": 10.1, + "grad_norm": 1.7224748134613037, + "learning_rate": 1.3269662921348317e-05, + "loss": 0.3524, + "step": 1800 + }, + { + "epoch": 10.21, + "grad_norm": 2.1029868125915527, + "learning_rate": 1.3194756554307118e-05, + "loss": 0.3408, + "step": 1820 + }, + { + "epoch": 10.33, + "grad_norm": 2.434016227722168, + "learning_rate": 1.311985018726592e-05, + "loss": 0.3266, + "step": 1840 + }, + { + "epoch": 10.44, + "grad_norm": 1.953553318977356, + "learning_rate": 1.304494382022472e-05, + "loss": 0.2844, + "step": 1860 + }, + { + "epoch": 10.55, + "grad_norm": 2.5946218967437744, + "learning_rate": 1.2970037453183522e-05, + "loss": 0.3225, + "step": 1880 + }, + { + "epoch": 10.66, + "grad_norm": 2.5305733680725098, + "learning_rate": 1.2895131086142323e-05, + "loss": 0.3183, + "step": 1900 + }, + { + "epoch": 10.78, + "grad_norm": 3.56726336479187, + "learning_rate": 1.2820224719101125e-05, + "loss": 0.2944, + "step": 1920 + }, + { + "epoch": 10.89, + "grad_norm": 1.9687740802764893, + "learning_rate": 1.2745318352059926e-05, + "loss": 0.3411, + "step": 1940 + }, + { + "epoch": 11.0, + "grad_norm": 1.6027730703353882, + "learning_rate": 1.2670411985018727e-05, + "loss": 0.2949, + "step": 1960 + }, + { + "epoch": 11.11, + "grad_norm": 1.8739397525787354, + "learning_rate": 1.2595505617977529e-05, + "loss": 0.2716, + "step": 1980 + }, + { + "epoch": 11.22, + "grad_norm": 1.6741198301315308, + "learning_rate": 1.2520599250936332e-05, + "loss": 0.3334, + "step": 2000 + }, + { + "epoch": 11.34, + "grad_norm": 1.950945496559143, + "learning_rate": 1.2445692883895133e-05, + "loss": 0.3291, + "step": 2020 + }, + { + "epoch": 11.45, + "grad_norm": 1.9362170696258545, + "learning_rate": 
1.2370786516853935e-05, + "loss": 0.2716, + "step": 2040 + }, + { + "epoch": 11.56, + "grad_norm": 1.6201746463775635, + "learning_rate": 1.2295880149812736e-05, + "loss": 0.2893, + "step": 2060 + }, + { + "epoch": 11.67, + "grad_norm": 3.488088607788086, + "learning_rate": 1.2220973782771537e-05, + "loss": 0.3239, + "step": 2080 + }, + { + "epoch": 11.79, + "grad_norm": 2.4608683586120605, + "learning_rate": 1.2146067415730339e-05, + "loss": 0.271, + "step": 2100 + }, + { + "epoch": 11.9, + "grad_norm": 1.5321098566055298, + "learning_rate": 1.207116104868914e-05, + "loss": 0.2876, + "step": 2120 + }, + { + "epoch": 12.01, + "grad_norm": 1.8334771394729614, + "learning_rate": 1.1996254681647941e-05, + "loss": 0.3066, + "step": 2140 + }, + { + "epoch": 12.12, + "grad_norm": 1.9506254196166992, + "learning_rate": 1.1921348314606743e-05, + "loss": 0.3023, + "step": 2160 + }, + { + "epoch": 12.23, + "grad_norm": 2.9073598384857178, + "learning_rate": 1.1846441947565544e-05, + "loss": 0.3152, + "step": 2180 + }, + { + "epoch": 12.35, + "grad_norm": 1.6023261547088623, + "learning_rate": 1.1771535580524345e-05, + "loss": 0.248, + "step": 2200 + }, + { + "epoch": 12.46, + "grad_norm": 1.7954633235931396, + "learning_rate": 1.1696629213483147e-05, + "loss": 0.2666, + "step": 2220 + }, + { + "epoch": 12.57, + "grad_norm": 2.0331828594207764, + "learning_rate": 1.162172284644195e-05, + "loss": 0.2878, + "step": 2240 + }, + { + "epoch": 12.68, + "grad_norm": 1.656420350074768, + "learning_rate": 1.1546816479400751e-05, + "loss": 0.2805, + "step": 2260 + }, + { + "epoch": 12.8, + "grad_norm": 1.5245873928070068, + "learning_rate": 1.1471910112359552e-05, + "loss": 0.2792, + "step": 2280 + }, + { + "epoch": 12.91, + "grad_norm": 2.6713974475860596, + "learning_rate": 1.1397003745318354e-05, + "loss": 0.2841, + "step": 2300 + }, + { + "epoch": 13.02, + "grad_norm": 1.268479347229004, + "learning_rate": 1.1322097378277155e-05, + "loss": 0.2708, + "step": 2320 + }, + { + "epoch": 13.13, + "grad_norm": 2.2990434169769287, + "learning_rate": 1.1247191011235956e-05, + "loss": 0.2649, + "step": 2340 + }, + { + "epoch": 13.24, + "grad_norm": 2.351956367492676, + "learning_rate": 1.1172284644194758e-05, + "loss": 0.281, + "step": 2360 + }, + { + "epoch": 13.36, + "grad_norm": 1.796783208847046, + "learning_rate": 1.1097378277153559e-05, + "loss": 0.2725, + "step": 2380 + }, + { + "epoch": 13.47, + "grad_norm": 1.7035847902297974, + "learning_rate": 1.102247191011236e-05, + "loss": 0.2799, + "step": 2400 + }, + { + "epoch": 13.58, + "grad_norm": 2.0395431518554688, + "learning_rate": 1.0947565543071162e-05, + "loss": 0.239, + "step": 2420 + }, + { + "epoch": 13.69, + "grad_norm": 1.8008232116699219, + "learning_rate": 1.0872659176029963e-05, + "loss": 0.2553, + "step": 2440 + }, + { + "epoch": 13.81, + "grad_norm": 2.0559043884277344, + "learning_rate": 1.0797752808988765e-05, + "loss": 0.2464, + "step": 2460 + }, + { + "epoch": 13.92, + "grad_norm": 1.8673292398452759, + "learning_rate": 1.0722846441947568e-05, + "loss": 0.2699, + "step": 2480 + }, + { + "epoch": 14.03, + "grad_norm": 1.6819398403167725, + "learning_rate": 1.0647940074906369e-05, + "loss": 0.2566, + "step": 2500 + }, + { + "epoch": 14.14, + "grad_norm": 1.9703686237335205, + "learning_rate": 1.057303370786517e-05, + "loss": 0.2807, + "step": 2520 + }, + { + "epoch": 14.25, + "grad_norm": 2.028834819793701, + "learning_rate": 1.0498127340823972e-05, + "loss": 0.2392, + "step": 2540 + }, + { + "epoch": 14.37, + "grad_norm": 2.2455177307128906, + 
"learning_rate": 1.0423220973782773e-05, + "loss": 0.247, + "step": 2560 + }, + { + "epoch": 14.48, + "grad_norm": 1.8078291416168213, + "learning_rate": 1.0348314606741574e-05, + "loss": 0.2552, + "step": 2580 + }, + { + "epoch": 14.59, + "grad_norm": 2.166729211807251, + "learning_rate": 1.0273408239700376e-05, + "loss": 0.2466, + "step": 2600 + }, + { + "epoch": 14.7, + "grad_norm": 2.710556745529175, + "learning_rate": 1.0198501872659177e-05, + "loss": 0.2506, + "step": 2620 + }, + { + "epoch": 14.82, + "grad_norm": 2.1344659328460693, + "learning_rate": 1.0123595505617978e-05, + "loss": 0.2388, + "step": 2640 + }, + { + "epoch": 14.93, + "grad_norm": 1.595842719078064, + "learning_rate": 1.004868913857678e-05, + "loss": 0.2553, + "step": 2660 + }, + { + "epoch": 15.04, + "grad_norm": 1.5458731651306152, + "learning_rate": 9.973782771535581e-06, + "loss": 0.2478, + "step": 2680 + }, + { + "epoch": 15.15, + "grad_norm": 1.9514356851577759, + "learning_rate": 9.898876404494382e-06, + "loss": 0.234, + "step": 2700 + }, + { + "epoch": 15.26, + "grad_norm": 2.1551694869995117, + "learning_rate": 9.823970037453184e-06, + "loss": 0.251, + "step": 2720 + }, + { + "epoch": 15.38, + "grad_norm": 2.08258318901062, + "learning_rate": 9.749063670411985e-06, + "loss": 0.2511, + "step": 2740 + }, + { + "epoch": 15.49, + "grad_norm": 1.581690788269043, + "learning_rate": 9.674157303370786e-06, + "loss": 0.2185, + "step": 2760 + }, + { + "epoch": 15.6, + "grad_norm": 2.2121975421905518, + "learning_rate": 9.599250936329588e-06, + "loss": 0.2161, + "step": 2780 + }, + { + "epoch": 15.71, + "grad_norm": 1.5077215433120728, + "learning_rate": 9.52434456928839e-06, + "loss": 0.2308, + "step": 2800 + }, + { + "epoch": 15.83, + "grad_norm": 2.57951021194458, + "learning_rate": 9.449438202247192e-06, + "loss": 0.2299, + "step": 2820 + }, + { + "epoch": 15.94, + "grad_norm": 1.6634414196014404, + "learning_rate": 9.374531835205993e-06, + "loss": 0.2576, + "step": 2840 + }, + { + "epoch": 16.05, + "grad_norm": 1.9692113399505615, + "learning_rate": 9.299625468164795e-06, + "loss": 0.2395, + "step": 2860 + }, + { + "epoch": 16.16, + "grad_norm": 1.9327415227890015, + "learning_rate": 9.224719101123596e-06, + "loss": 0.241, + "step": 2880 + }, + { + "epoch": 16.27, + "grad_norm": 1.7675727605819702, + "learning_rate": 9.149812734082398e-06, + "loss": 0.2201, + "step": 2900 + }, + { + "epoch": 16.39, + "grad_norm": 1.9511345624923706, + "learning_rate": 9.074906367041199e-06, + "loss": 0.2171, + "step": 2920 + }, + { + "epoch": 16.5, + "grad_norm": 1.7937383651733398, + "learning_rate": 9e-06, + "loss": 0.2286, + "step": 2940 + }, + { + "epoch": 16.61, + "grad_norm": 1.79076087474823, + "learning_rate": 8.925093632958802e-06, + "loss": 0.2479, + "step": 2960 + }, + { + "epoch": 16.72, + "grad_norm": 2.4045145511627197, + "learning_rate": 8.850187265917603e-06, + "loss": 0.2153, + "step": 2980 + }, + { + "epoch": 16.84, + "grad_norm": 2.1934499740600586, + "learning_rate": 8.775280898876404e-06, + "loss": 0.2361, + "step": 3000 + }, + { + "epoch": 16.95, + "grad_norm": 1.923170804977417, + "learning_rate": 8.700374531835206e-06, + "loss": 0.2146, + "step": 3020 + }, + { + "epoch": 17.06, + "grad_norm": 2.1610753536224365, + "learning_rate": 8.625468164794009e-06, + "loss": 0.2281, + "step": 3040 + }, + { + "epoch": 17.17, + "grad_norm": 2.1105706691741943, + "learning_rate": 8.55056179775281e-06, + "loss": 0.2403, + "step": 3060 + }, + { + "epoch": 17.29, + "grad_norm": 1.979177474975586, + "learning_rate": 
8.475655430711611e-06, + "loss": 0.1734, + "step": 3080 + }, + { + "epoch": 17.4, + "grad_norm": 2.040055274963379, + "learning_rate": 8.400749063670413e-06, + "loss": 0.2393, + "step": 3100 + }, + { + "epoch": 17.51, + "grad_norm": 1.8687106370925903, + "learning_rate": 8.325842696629214e-06, + "loss": 0.2346, + "step": 3120 + }, + { + "epoch": 17.62, + "grad_norm": 1.7447230815887451, + "learning_rate": 8.250936329588015e-06, + "loss": 0.2279, + "step": 3140 + }, + { + "epoch": 17.73, + "grad_norm": 2.9035825729370117, + "learning_rate": 8.176029962546818e-06, + "loss": 0.2049, + "step": 3160 + }, + { + "epoch": 17.85, + "grad_norm": 2.1024608612060547, + "learning_rate": 8.101123595505618e-06, + "loss": 0.1962, + "step": 3180 + }, + { + "epoch": 17.96, + "grad_norm": 2.7913131713867188, + "learning_rate": 8.02621722846442e-06, + "loss": 0.2081, + "step": 3200 + }, + { + "epoch": 18.07, + "grad_norm": 2.0668814182281494, + "learning_rate": 7.95131086142322e-06, + "loss": 0.2304, + "step": 3220 + }, + { + "epoch": 18.18, + "grad_norm": 1.7872204780578613, + "learning_rate": 7.876404494382022e-06, + "loss": 0.1804, + "step": 3240 + }, + { + "epoch": 18.3, + "grad_norm": 2.0718905925750732, + "learning_rate": 7.801498127340823e-06, + "loss": 0.2232, + "step": 3260 + }, + { + "epoch": 18.41, + "grad_norm": 3.835952043533325, + "learning_rate": 7.726591760299626e-06, + "loss": 0.2171, + "step": 3280 + }, + { + "epoch": 18.52, + "grad_norm": 1.5925731658935547, + "learning_rate": 7.651685393258428e-06, + "loss": 0.1999, + "step": 3300 + }, + { + "epoch": 18.63, + "grad_norm": 2.434159994125366, + "learning_rate": 7.576779026217229e-06, + "loss": 0.1876, + "step": 3320 + }, + { + "epoch": 18.74, + "grad_norm": 2.3486499786376953, + "learning_rate": 7.5018726591760305e-06, + "loss": 0.21, + "step": 3340 + }, + { + "epoch": 18.86, + "grad_norm": 1.4824186563491821, + "learning_rate": 7.426966292134832e-06, + "loss": 0.2239, + "step": 3360 + }, + { + "epoch": 18.97, + "grad_norm": 2.062422275543213, + "learning_rate": 7.352059925093633e-06, + "loss": 0.22, + "step": 3380 + }, + { + "epoch": 19.08, + "grad_norm": 2.0563416481018066, + "learning_rate": 7.277153558052435e-06, + "loss": 0.1945, + "step": 3400 + }, + { + "epoch": 19.19, + "grad_norm": 1.6936135292053223, + "learning_rate": 7.202247191011237e-06, + "loss": 0.217, + "step": 3420 + }, + { + "epoch": 19.31, + "grad_norm": 1.9931917190551758, + "learning_rate": 7.127340823970038e-06, + "loss": 0.2127, + "step": 3440 + }, + { + "epoch": 19.42, + "grad_norm": 1.5989198684692383, + "learning_rate": 7.0524344569288395e-06, + "loss": 0.1849, + "step": 3460 + }, + { + "epoch": 19.53, + "grad_norm": 2.0073723793029785, + "learning_rate": 6.977528089887641e-06, + "loss": 0.1805, + "step": 3480 + }, + { + "epoch": 19.64, + "grad_norm": 1.9756735563278198, + "learning_rate": 6.902621722846442e-06, + "loss": 0.1963, + "step": 3500 + }, + { + "epoch": 19.75, + "grad_norm": 1.5112028121948242, + "learning_rate": 6.827715355805244e-06, + "loss": 0.2008, + "step": 3520 + }, + { + "epoch": 19.87, + "grad_norm": 2.2792975902557373, + "learning_rate": 6.752808988764046e-06, + "loss": 0.2106, + "step": 3540 + }, + { + "epoch": 19.98, + "grad_norm": 2.768470048904419, + "learning_rate": 6.677902621722847e-06, + "loss": 0.2066, + "step": 3560 + }, + { + "epoch": 20.09, + "grad_norm": 1.6916066408157349, + "learning_rate": 6.602996254681648e-06, + "loss": 0.1912, + "step": 3580 + }, + { + "epoch": 20.2, + "grad_norm": 1.7649778127670288, + "learning_rate": 
6.52808988764045e-06, + "loss": 0.2027, + "step": 3600 + }, + { + "epoch": 20.32, + "grad_norm": 1.9743694067001343, + "learning_rate": 6.453183520599251e-06, + "loss": 0.2208, + "step": 3620 + }, + { + "epoch": 20.43, + "grad_norm": 1.827344298362732, + "learning_rate": 6.378277153558053e-06, + "loss": 0.1757, + "step": 3640 + }, + { + "epoch": 20.54, + "grad_norm": 2.7847957611083984, + "learning_rate": 6.303370786516855e-06, + "loss": 0.1931, + "step": 3660 + }, + { + "epoch": 20.65, + "grad_norm": 1.8572605848312378, + "learning_rate": 6.228464419475656e-06, + "loss": 0.1902, + "step": 3680 + }, + { + "epoch": 20.76, + "grad_norm": 1.5343818664550781, + "learning_rate": 6.153558052434457e-06, + "loss": 0.1916, + "step": 3700 + }, + { + "epoch": 20.88, + "grad_norm": 1.703688383102417, + "learning_rate": 6.078651685393259e-06, + "loss": 0.1897, + "step": 3720 + }, + { + "epoch": 20.99, + "grad_norm": 2.5442187786102295, + "learning_rate": 6.00374531835206e-06, + "loss": 0.1859, + "step": 3740 + }, + { + "epoch": 21.1, + "grad_norm": 2.0333402156829834, + "learning_rate": 5.928838951310862e-06, + "loss": 0.1632, + "step": 3760 + }, + { + "epoch": 21.21, + "grad_norm": 2.107227087020874, + "learning_rate": 5.8539325842696635e-06, + "loss": 0.2031, + "step": 3780 + }, + { + "epoch": 21.33, + "grad_norm": 2.0351223945617676, + "learning_rate": 5.779026217228465e-06, + "loss": 0.1759, + "step": 3800 + }, + { + "epoch": 21.44, + "grad_norm": 2.1328284740448, + "learning_rate": 5.704119850187266e-06, + "loss": 0.1853, + "step": 3820 + }, + { + "epoch": 21.55, + "grad_norm": 2.0145580768585205, + "learning_rate": 5.629213483146068e-06, + "loss": 0.1919, + "step": 3840 + }, + { + "epoch": 21.66, + "grad_norm": 1.8794372081756592, + "learning_rate": 5.554307116104869e-06, + "loss": 0.1958, + "step": 3860 + }, + { + "epoch": 21.77, + "grad_norm": 1.8487616777420044, + "learning_rate": 5.479400749063671e-06, + "loss": 0.207, + "step": 3880 + }, + { + "epoch": 21.89, + "grad_norm": 2.080965042114258, + "learning_rate": 5.4044943820224725e-06, + "loss": 0.1715, + "step": 3900 + }, + { + "epoch": 22.0, + "grad_norm": 2.0303232669830322, + "learning_rate": 5.329588014981274e-06, + "loss": 0.1873, + "step": 3920 + }, + { + "epoch": 22.11, + "grad_norm": 2.1078438758850098, + "learning_rate": 5.254681647940075e-06, + "loss": 0.1869, + "step": 3940 + }, + { + "epoch": 22.22, + "grad_norm": 1.8502501249313354, + "learning_rate": 5.1797752808988765e-06, + "loss": 0.1983, + "step": 3960 + }, + { + "epoch": 22.34, + "grad_norm": 2.209162950515747, + "learning_rate": 5.104868913857678e-06, + "loss": 0.185, + "step": 3980 + }, + { + "epoch": 22.45, + "grad_norm": 1.9525928497314453, + "learning_rate": 5.02996254681648e-06, + "loss": 0.1943, + "step": 4000 + }, + { + "epoch": 22.56, + "grad_norm": 3.0655415058135986, + "learning_rate": 4.955056179775281e-06, + "loss": 0.1738, + "step": 4020 + }, + { + "epoch": 22.67, + "grad_norm": 1.4932396411895752, + "learning_rate": 4.880149812734083e-06, + "loss": 0.1775, + "step": 4040 + }, + { + "epoch": 22.78, + "grad_norm": 2.076929807662964, + "learning_rate": 4.805243445692884e-06, + "loss": 0.1834, + "step": 4060 + }, + { + "epoch": 22.9, + "grad_norm": 1.8913358449935913, + "learning_rate": 4.7303370786516854e-06, + "loss": 0.1601, + "step": 4080 + }, + { + "epoch": 23.01, + "grad_norm": 1.9011529684066772, + "learning_rate": 4.655430711610488e-06, + "loss": 0.1812, + "step": 4100 + }, + { + "epoch": 23.12, + "grad_norm": 1.6414529085159302, + "learning_rate": 
4.580524344569289e-06, + "loss": 0.1716, + "step": 4120 + }, + { + "epoch": 23.23, + "grad_norm": 3.7525336742401123, + "learning_rate": 4.50561797752809e-06, + "loss": 0.1795, + "step": 4140 + }, + { + "epoch": 23.35, + "grad_norm": 1.912279486656189, + "learning_rate": 4.430711610486892e-06, + "loss": 0.1906, + "step": 4160 + }, + { + "epoch": 23.46, + "grad_norm": 1.9044945240020752, + "learning_rate": 4.355805243445693e-06, + "loss": 0.1614, + "step": 4180 + }, + { + "epoch": 23.57, + "grad_norm": 2.0123000144958496, + "learning_rate": 4.280898876404494e-06, + "loss": 0.1852, + "step": 4200 + }, + { + "epoch": 23.68, + "grad_norm": 1.8307185173034668, + "learning_rate": 4.2059925093632965e-06, + "loss": 0.1785, + "step": 4220 + }, + { + "epoch": 23.8, + "grad_norm": 3.2872562408447266, + "learning_rate": 4.131086142322098e-06, + "loss": 0.17, + "step": 4240 + }, + { + "epoch": 23.91, + "grad_norm": 1.623343825340271, + "learning_rate": 4.056179775280899e-06, + "loss": 0.1774, + "step": 4260 + }, + { + "epoch": 24.02, + "grad_norm": 1.9367610216140747, + "learning_rate": 3.981273408239701e-06, + "loss": 0.1809, + "step": 4280 + }, + { + "epoch": 24.13, + "grad_norm": 2.030416250228882, + "learning_rate": 3.906367041198502e-06, + "loss": 0.177, + "step": 4300 + }, + { + "epoch": 24.24, + "grad_norm": 2.2556800842285156, + "learning_rate": 3.831460674157303e-06, + "loss": 0.1846, + "step": 4320 + }, + { + "epoch": 24.36, + "grad_norm": 2.8906972408294678, + "learning_rate": 3.7565543071161055e-06, + "loss": 0.1837, + "step": 4340 + }, + { + "epoch": 24.47, + "grad_norm": 1.7102136611938477, + "learning_rate": 3.681647940074907e-06, + "loss": 0.1684, + "step": 4360 + }, + { + "epoch": 24.58, + "grad_norm": 1.753233551979065, + "learning_rate": 3.606741573033708e-06, + "loss": 0.1667, + "step": 4380 + }, + { + "epoch": 24.69, + "grad_norm": 1.9974724054336548, + "learning_rate": 3.53183520599251e-06, + "loss": 0.1732, + "step": 4400 + }, + { + "epoch": 24.81, + "grad_norm": 1.5714670419692993, + "learning_rate": 3.4569288389513113e-06, + "loss": 0.1703, + "step": 4420 + }, + { + "epoch": 24.92, + "grad_norm": 2.6254587173461914, + "learning_rate": 3.3820224719101126e-06, + "loss": 0.1603, + "step": 4440 + }, + { + "epoch": 25.03, + "grad_norm": 2.4333715438842773, + "learning_rate": 3.3071161048689144e-06, + "loss": 0.1669, + "step": 4460 + }, + { + "epoch": 25.14, + "grad_norm": 2.230980157852173, + "learning_rate": 3.2322097378277157e-06, + "loss": 0.1655, + "step": 4480 + }, + { + "epoch": 25.25, + "grad_norm": 1.845352292060852, + "learning_rate": 3.157303370786517e-06, + "loss": 0.1942, + "step": 4500 + }, + { + "epoch": 25.37, + "grad_norm": 1.7430144548416138, + "learning_rate": 3.082397003745319e-06, + "loss": 0.1727, + "step": 4520 + }, + { + "epoch": 25.48, + "grad_norm": 1.5676878690719604, + "learning_rate": 3.00749063670412e-06, + "loss": 0.1761, + "step": 4540 + }, + { + "epoch": 25.59, + "grad_norm": 1.6773077249526978, + "learning_rate": 2.9325842696629215e-06, + "loss": 0.1546, + "step": 4560 + }, + { + "epoch": 25.7, + "grad_norm": 1.5510244369506836, + "learning_rate": 2.8576779026217233e-06, + "loss": 0.1628, + "step": 4580 + }, + { + "epoch": 25.82, + "grad_norm": 1.430743932723999, + "learning_rate": 2.7827715355805247e-06, + "loss": 0.1576, + "step": 4600 + }, + { + "epoch": 25.93, + "grad_norm": 2.015160083770752, + "learning_rate": 2.707865168539326e-06, + "loss": 0.1726, + "step": 4620 + }, + { + "epoch": 26.04, + "grad_norm": 1.9878367185592651, + "learning_rate": 
2.6329588014981278e-06, + "loss": 0.1652, + "step": 4640 + }, + { + "epoch": 26.15, + "grad_norm": 1.5033568143844604, + "learning_rate": 2.558052434456929e-06, + "loss": 0.1733, + "step": 4660 + }, + { + "epoch": 26.26, + "grad_norm": 1.7678515911102295, + "learning_rate": 2.4831460674157305e-06, + "loss": 0.1747, + "step": 4680 + }, + { + "epoch": 26.38, + "grad_norm": 2.460773468017578, + "learning_rate": 2.408239700374532e-06, + "loss": 0.1713, + "step": 4700 + }, + { + "epoch": 26.49, + "grad_norm": 1.6902016401290894, + "learning_rate": 2.3333333333333336e-06, + "loss": 0.1701, + "step": 4720 + }, + { + "epoch": 26.6, + "grad_norm": 1.5876681804656982, + "learning_rate": 2.2584269662921354e-06, + "loss": 0.1653, + "step": 4740 + }, + { + "epoch": 26.71, + "grad_norm": 1.7147456407546997, + "learning_rate": 2.1835205992509363e-06, + "loss": 0.1648, + "step": 4760 + }, + { + "epoch": 26.83, + "grad_norm": 1.7766762971878052, + "learning_rate": 2.108614232209738e-06, + "loss": 0.1743, + "step": 4780 + }, + { + "epoch": 26.94, + "grad_norm": 1.535897135734558, + "learning_rate": 2.03370786516854e-06, + "loss": 0.1447, + "step": 4800 + }, + { + "epoch": 27.05, + "grad_norm": 3.248814105987549, + "learning_rate": 1.9588014981273407e-06, + "loss": 0.1612, + "step": 4820 + }, + { + "epoch": 27.16, + "grad_norm": 1.6350877285003662, + "learning_rate": 1.8838951310861425e-06, + "loss": 0.1698, + "step": 4840 + }, + { + "epoch": 27.27, + "grad_norm": 1.852246880531311, + "learning_rate": 1.8089887640449439e-06, + "loss": 0.1613, + "step": 4860 + }, + { + "epoch": 27.39, + "grad_norm": 1.8748161792755127, + "learning_rate": 1.7340823970037454e-06, + "loss": 0.1591, + "step": 4880 + }, + { + "epoch": 27.5, + "grad_norm": 1.5077412128448486, + "learning_rate": 1.659176029962547e-06, + "loss": 0.1705, + "step": 4900 + }, + { + "epoch": 27.61, + "grad_norm": 2.131744861602783, + "learning_rate": 1.5842696629213483e-06, + "loss": 0.1613, + "step": 4920 + }, + { + "epoch": 27.72, + "grad_norm": 2.052886486053467, + "learning_rate": 1.5093632958801499e-06, + "loss": 0.1616, + "step": 4940 + }, + { + "epoch": 27.84, + "grad_norm": 1.8287665843963623, + "learning_rate": 1.4344569288389514e-06, + "loss": 0.1681, + "step": 4960 + }, + { + "epoch": 27.95, + "grad_norm": 1.769261121749878, + "learning_rate": 1.3595505617977528e-06, + "loss": 0.1688, + "step": 4980 + }, + { + "epoch": 28.06, + "grad_norm": 2.0416312217712402, + "learning_rate": 1.2846441947565543e-06, + "loss": 0.1481, + "step": 5000 + } + ], + "logging_steps": 20, + "max_steps": 5340, + "num_input_tokens_seen": 0, + "num_train_epochs": 30, + "save_steps": 500, + "total_flos": 1.2995638935552e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-5000/training_args.bin b/checkpoint-5000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7c33ce269deb9707d9fc7879dba5d8878e16b09a --- /dev/null +++ b/checkpoint-5000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f06c12b5819a4aa76501d6843eae3aabbe49b1a33a2903a44bc34146ab4a74b6 +size 4920 diff --git a/runs/Mar21_04-12-51_nvqwf0z2of/events.out.tfevents.1710994374.nvqwf0z2of.309.0 b/runs/Mar21_04-12-51_nvqwf0z2of/events.out.tfevents.1710994374.nvqwf0z2of.309.0 new file mode 100644 index 0000000000000000000000000000000000000000..c2d4b8a46c46ba5452658109bb3f8d66d39be2a2 --- /dev/null +++ b/runs/Mar21_04-12-51_nvqwf0z2of/events.out.tfevents.1710994374.nvqwf0z2of.309.0 @@ -0,0 
+1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92c7b6ea50c2679d06f2669121c0f65f5183db6ba5d1dba8573cadcac9d22ff9 +size 61794
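
The JSON log above follows the Hugging Face Trainer checkpoint format (entries keyed by `epoch`, `grad_norm`, `learning_rate`, `loss`, and `step`, logged every 20 steps). A minimal sketch of how one might visualize that loss curve is shown below; it assumes the entries live under the standard `log_history` key of a `trainer_state.json` inside the checkpoint directory, and both the file path and that key name should be treated as assumptions, not something confirmed by this repository.

```python
# Sketch: plot the training loss from a checkpoint's trainer_state.json.
# Assumptions: the log entries shown above are stored under "log_history"
# in checkpoint-5000/trainer_state.json (typical Hugging Face Trainer layout);
# adjust the path if the checkpoint directory differs.
import json

import matplotlib.pyplot as plt

with open("checkpoint-5000/trainer_state.json") as f:
    state = json.load(f)

# Keep only the periodic logging entries that carry a "loss" value.
entries = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title("LoRA fine-tuning loss (logged every 20 steps)")
plt.savefig("loss_curve.png")
```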