loubnabnl HF staff committed on
Commit
e682e54
0 Parent(s):

Duplicate from HuggingFaceTB/smollm2-135M-8k-lc100k-dpo-ultaf-ep2

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: HuggingFaceTB/smollm2-135M-8k-lc100k-mix1-ep2
3
+ tags:
4
+ - alignment-handbook
5
+ - trl
6
+ - dpo
7
+ - generated_from_trainer
8
+ - trl
9
+ - dpo
10
+ - generated_from_trainer
11
+ datasets:
12
+ - HuggingFaceH4/ultrafeedback_binarized
13
+ model-index:
14
+ - name: smollm2-135M-8k-lc100k-dpo-ultaf-ep2
15
+ results: []
16
+ ---
17
+
18
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
19
+ should probably proofread and complete it, then remove this comment. -->
20
+
21
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/loubnabnl/huggingface/runs/3el89rp6)
22
+ # smollm2-135M-8k-lc100k-dpo-ultaf-ep2
23
+
24
+ This model is a version of [HuggingFaceTB/smollm2-135M-8k-lc100k-mix1-ep2](https://huggingface.co/HuggingFaceTB/smollm2-135M-8k-lc100k-mix1-ep2) fine-tuned with DPO on the [HuggingFaceH4/ultrafeedback_binarized](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized) dataset.
25
+ It achieves the following results on the evaluation set:
26
+ - Loss: 0.6741
27
+ - Rewards/chosen: -0.0719
28
+ - Rewards/rejected: -0.3407
29
+ - Rewards/accuracies: 0.6151
30
+ - Rewards/margins: 0.2687
31
+ - Logps/rejected: -378.1583
32
+ - Logps/chosen: -443.6482
33
+ - Logits/rejected: 4.9520
34
+ - Logits/chosen: 4.6009
35
+
36
+ ## Model description
37
+
38
+ More information needed
39
+
40
+ ## Intended uses & limitations
41
+
42
+ More information needed
43
+
44
+ ## Training and evaluation data
45
+
46
+ More information needed
47
+
48
+ ## Training procedure
49
+
50
+ ### Training hyperparameters
51
+
52
+ The following hyperparameters were used during training:
53
+ - learning_rate: 1e-06
54
+ - train_batch_size: 2
55
+ - eval_batch_size: 4
56
+ - seed: 42
57
+ - distributed_type: multi-GPU
58
+ - num_devices: 8
59
+ - gradient_accumulation_steps: 8
60
+ - total_train_batch_size: 128
61
+ - total_eval_batch_size: 32
62
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
63
+ - lr_scheduler_type: cosine
64
+ - lr_scheduler_warmup_ratio: 0.1
65
+ - num_epochs: 2
66
+
67
+ ### Training results
68
+
69
+ | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
70
+ |:-------------:|:------:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
71
+ | 0.7296 | 0.2094 | 100 | 0.7357 | 0.0117 | -0.0252 | 0.5516 | 0.0369 | -377.5274 | -443.4810 | 5.1272 | 4.7554 |
72
+ | 0.7062 | 0.4187 | 200 | 0.6988 | -0.0251 | -0.0968 | 0.5675 | 0.0717 | -377.6706 | -443.5545 | 5.0879 | 4.7255 |
73
+ | 0.6782 | 0.6281 | 300 | 0.6943 | -0.0323 | -0.2031 | 0.5675 | 0.1708 | -377.8831 | -443.5688 | 5.0161 | 4.6621 |
74
+ | 0.6863 | 0.8375 | 400 | 0.6757 | -0.0882 | -0.2789 | 0.5992 | 0.1907 | -378.0348 | -443.6808 | 4.9992 | 4.6459 |
75
+ | 0.6836 | 1.0468 | 500 | 0.6708 | -0.0957 | -0.3325 | 0.6349 | 0.2368 | -378.1419 | -443.6958 | 4.9696 | 4.6170 |
76
+ | 0.6349 | 1.2562 | 600 | 0.6720 | -0.0539 | -0.3214 | 0.5992 | 0.2675 | -378.1197 | -443.6121 | 4.9707 | 4.6203 |
77
+ | 0.6427 | 1.4656 | 700 | 0.6796 | -0.0877 | -0.3456 | 0.6032 | 0.2579 | -378.1681 | -443.6797 | 4.9430 | 4.5920 |
78
+ | 0.6128 | 1.6750 | 800 | 0.6704 | -0.0604 | -0.3680 | 0.6071 | 0.3075 | -378.2128 | -443.6252 | 4.9689 | 4.6106 |
79
+ | 0.6474 | 1.8843 | 900 | 0.6692 | -0.0590 | -0.3703 | 0.6270 | 0.3113 | -378.2174 | -443.6223 | 4.9211 | 4.5737 |
80
+
81
+
82
+ ### Framework versions
83
+
84
+ - Transformers 4.42.3
85
+ - Pytorch 2.1.2
86
+ - Datasets 2.20.0
87
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.9973828840617638,
3
+ "eval_logits/chosen": 4.600930213928223,
4
+ "eval_logits/rejected": 4.9520039558410645,
5
+ "eval_logps/chosen": -443.648193359375,
6
+ "eval_logps/rejected": -378.15826416015625,
7
+ "eval_loss": 0.6740825176239014,
8
+ "eval_rewards/accuracies": 0.6150793433189392,
9
+ "eval_rewards/chosen": -0.07192634046077728,
10
+ "eval_rewards/margins": 0.26874542236328125,
11
+ "eval_rewards/rejected": -0.34067174792289734,
12
+ "eval_runtime": 20.4479,
13
+ "eval_samples": 2000,
14
+ "eval_samples_per_second": 97.81,
15
+ "eval_steps_per_second": 3.081,
16
+ "total_flos": 0.0,
17
+ "train_loss": 0.675485389037702,
18
+ "train_runtime": 5897.7907,
19
+ "train_samples": 61134,
20
+ "train_samples_per_second": 20.731,
21
+ "train_steps_per_second": 0.162
22
+ }
config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "HuggingFaceTB/smollm2-135M-8k-lc100k-mix1-ep2",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 1,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 576,
12
+ "initializer_range": 0.041666666666666664,
13
+ "intermediate_size": 1536,
14
+ "is_llama_config": true,
15
+ "max_position_embeddings": 8192,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 9,
19
+ "num_hidden_layers": 30,
20
+ "num_key_value_heads": 3,
21
+ "pad_token_id": 2,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-05,
24
+ "rope_interleaved": false,
25
+ "rope_scaling": null,
26
+ "rope_theta": 100000,
27
+ "tie_word_embeddings": true,
28
+ "torch_dtype": "bfloat16",
29
+ "transformers_version": "4.42.3",
30
+ "use_cache": true,
31
+ "vocab_size": 49152
32
+ }
eval_results.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.9973828840617638,
3
+ "eval_logits/chosen": 4.600930213928223,
4
+ "eval_logits/rejected": 4.9520039558410645,
5
+ "eval_logps/chosen": -443.648193359375,
6
+ "eval_logps/rejected": -378.15826416015625,
7
+ "eval_loss": 0.6740825176239014,
8
+ "eval_rewards/accuracies": 0.6150793433189392,
9
+ "eval_rewards/chosen": -0.07192634046077728,
10
+ "eval_rewards/margins": 0.26874542236328125,
11
+ "eval_rewards/rejected": -0.34067174792289734,
12
+ "eval_runtime": 20.4479,
13
+ "eval_samples": 2000,
14
+ "eval_samples_per_second": 97.81,
15
+ "eval_steps_per_second": 3.081
16
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.42.3"
7
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5af571cbf074e6d21a03528d2330792e532ca608f24ac70a143f6b369968ab8c
3
+ size 269060552
runs/Oct31_10-14-22_ip-26-0-174-36/events.out.tfevents.1730370128.ip-26-0-174-36.3239327.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:774faae98f36baa1ee525b594023c5fc25b4ad9249e04ccdf4cba9ee4f4585fd
3
+ size 78157
runs/Oct31_10-14-22_ip-26-0-174-36/events.out.tfevents.1730376056.ip-26-0-174-36.3239327.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:043a35029ecb401ef0df75148304075329cd444faf2097b0316e9200018fea5c
3
+ size 828
special_tokens_map.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
6
+ "bos_token": {
7
+ "content": "<|im_start|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "eos_token": {
14
+ "content": "<|im_end|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ },
20
+ "pad_token": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ },
27
+ "unk_token": {
28
+ "content": "<|endoftext|>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ }
34
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<repo_name>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "<reponame>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "5": {
45
+ "content": "<file_sep>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "6": {
53
+ "content": "<filename>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "7": {
61
+ "content": "<gh_stars>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "8": {
69
+ "content": "<issue_start>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "9": {
77
+ "content": "<issue_comment>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "10": {
85
+ "content": "<issue_closed>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "11": {
93
+ "content": "<jupyter_start>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "12": {
101
+ "content": "<jupyter_text>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "13": {
109
+ "content": "<jupyter_code>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "14": {
117
+ "content": "<jupyter_output>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "15": {
125
+ "content": "<jupyter_script>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "16": {
133
+ "content": "<empty_output>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ }
140
+ },
141
+ "additional_special_tokens": [
142
+ "<|im_start|>",
143
+ "<|im_end|>"
144
+ ],
145
+ "bos_token": "<|im_start|>",
146
+ "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
147
+ "clean_up_tokenization_spaces": false,
148
+ "eos_token": "<|im_end|>",
149
+ "model_max_length": 2048,
150
+ "pad_token": "<|im_end|>",
151
+ "tokenizer_class": "GPT2Tokenizer",
152
+ "unk_token": "<|endoftext|>",
153
+ "vocab_size": 49152
154
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.9973828840617638,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.675485389037702,
5
+ "train_runtime": 5897.7907,
6
+ "train_samples": 61134,
7
+ "train_samples_per_second": 20.731,
8
+ "train_steps_per_second": 0.162
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,1626 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.9973828840617638,
5
+ "eval_steps": 100,
6
+ "global_step": 954,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.002093692750588851,
13
+ "grad_norm": 38.333577570579074,
14
+ "learning_rate": 1.0416666666666666e-08,
15
+ "logits/chosen": 5.468747138977051,
16
+ "logits/rejected": 5.353150367736816,
17
+ "logps/chosen": -399.0700988769531,
18
+ "logps/rejected": -414.2703857421875,
19
+ "loss": 0.6931,
20
+ "rewards/accuracies": 0.0,
21
+ "rewards/chosen": 0.0,
22
+ "rewards/margins": 0.0,
23
+ "rewards/rejected": 0.0,
24
+ "step": 1
25
+ },
26
+ {
27
+ "epoch": 0.02093692750588851,
28
+ "grad_norm": 36.45097456473781,
29
+ "learning_rate": 1.0416666666666667e-07,
30
+ "logits/chosen": 4.634159088134766,
31
+ "logits/rejected": 4.8650617599487305,
32
+ "logps/chosen": -481.9865417480469,
33
+ "logps/rejected": -402.9172668457031,
34
+ "loss": 0.7192,
35
+ "rewards/accuracies": 0.4236111044883728,
36
+ "rewards/chosen": 0.036201052367687225,
37
+ "rewards/margins": 0.05521820858120918,
38
+ "rewards/rejected": -0.01901715248823166,
39
+ "step": 10
40
+ },
41
+ {
42
+ "epoch": 0.04187385501177702,
43
+ "grad_norm": 38.31461130932718,
44
+ "learning_rate": 2.0833333333333333e-07,
45
+ "logits/chosen": 4.8017449378967285,
46
+ "logits/rejected": 5.193596363067627,
47
+ "logps/chosen": -428.74591064453125,
48
+ "logps/rejected": -379.7098693847656,
49
+ "loss": 0.7525,
50
+ "rewards/accuracies": 0.512499988079071,
51
+ "rewards/chosen": 0.014457901008427143,
52
+ "rewards/margins": -0.03727109357714653,
53
+ "rewards/rejected": 0.0517289862036705,
54
+ "step": 20
55
+ },
56
+ {
57
+ "epoch": 0.06281078251766553,
58
+ "grad_norm": 38.39033659648525,
59
+ "learning_rate": 3.1249999999999997e-07,
60
+ "logits/chosen": 4.625308513641357,
61
+ "logits/rejected": 4.913487434387207,
62
+ "logps/chosen": -459.8934631347656,
63
+ "logps/rejected": -365.87176513671875,
64
+ "loss": 0.7389,
65
+ "rewards/accuracies": 0.543749988079071,
66
+ "rewards/chosen": 0.032367587089538574,
67
+ "rewards/margins": 0.07106685638427734,
68
+ "rewards/rejected": -0.03869926929473877,
69
+ "step": 30
70
+ },
71
+ {
72
+ "epoch": 0.08374771002355404,
73
+ "grad_norm": 34.959297165636315,
74
+ "learning_rate": 4.1666666666666667e-07,
75
+ "logits/chosen": 5.135643005371094,
76
+ "logits/rejected": 5.29467248916626,
77
+ "logps/chosen": -388.5003662109375,
78
+ "logps/rejected": -341.11138916015625,
79
+ "loss": 0.7521,
80
+ "rewards/accuracies": 0.44999998807907104,
81
+ "rewards/chosen": -0.061884332448244095,
82
+ "rewards/margins": -0.051958512514829636,
83
+ "rewards/rejected": -0.009925814345479012,
84
+ "step": 40
85
+ },
86
+ {
87
+ "epoch": 0.10468463752944256,
88
+ "grad_norm": 40.59746593571697,
89
+ "learning_rate": 5.208333333333334e-07,
90
+ "logits/chosen": 4.794947147369385,
91
+ "logits/rejected": 5.206262111663818,
92
+ "logps/chosen": -418.7637634277344,
93
+ "logps/rejected": -366.21783447265625,
94
+ "loss": 0.7539,
95
+ "rewards/accuracies": 0.5249999761581421,
96
+ "rewards/chosen": -0.0046131848357617855,
97
+ "rewards/margins": 0.009517465718090534,
98
+ "rewards/rejected": -0.014130651950836182,
99
+ "step": 50
100
+ },
101
+ {
102
+ "epoch": 0.12562156503533106,
103
+ "grad_norm": 35.116262927070615,
104
+ "learning_rate": 6.249999999999999e-07,
105
+ "logits/chosen": 4.984349250793457,
106
+ "logits/rejected": 5.210784435272217,
107
+ "logps/chosen": -389.5479431152344,
108
+ "logps/rejected": -355.3258361816406,
109
+ "loss": 0.7337,
110
+ "rewards/accuracies": 0.581250011920929,
111
+ "rewards/chosen": 0.09027661383152008,
112
+ "rewards/margins": 0.13924987614154816,
113
+ "rewards/rejected": -0.048973266035318375,
114
+ "step": 60
115
+ },
116
+ {
117
+ "epoch": 0.14655849254121958,
118
+ "grad_norm": 37.65630163274615,
119
+ "learning_rate": 7.291666666666666e-07,
120
+ "logits/chosen": 5.079476356506348,
121
+ "logits/rejected": 5.1062331199646,
122
+ "logps/chosen": -472.6788635253906,
123
+ "logps/rejected": -410.6566467285156,
124
+ "loss": 0.7532,
125
+ "rewards/accuracies": 0.48750001192092896,
126
+ "rewards/chosen": 0.01137494295835495,
127
+ "rewards/margins": -0.019245151430368423,
128
+ "rewards/rejected": 0.030620098114013672,
129
+ "step": 70
130
+ },
131
+ {
132
+ "epoch": 0.16749542004710807,
133
+ "grad_norm": 36.35209638887961,
134
+ "learning_rate": 8.333333333333333e-07,
135
+ "logits/chosen": 4.831971645355225,
136
+ "logits/rejected": 5.179555892944336,
137
+ "logps/chosen": -465.8661193847656,
138
+ "logps/rejected": -352.46063232421875,
139
+ "loss": 0.7337,
140
+ "rewards/accuracies": 0.5249999761581421,
141
+ "rewards/chosen": 0.029347699135541916,
142
+ "rewards/margins": 0.011580700054764748,
143
+ "rewards/rejected": 0.017767000943422318,
144
+ "step": 80
145
+ },
146
+ {
147
+ "epoch": 0.1884323475529966,
148
+ "grad_norm": 40.247342074541066,
149
+ "learning_rate": 9.374999999999999e-07,
150
+ "logits/chosen": 4.667853355407715,
151
+ "logits/rejected": 5.083367347717285,
152
+ "logps/chosen": -410.145263671875,
153
+ "logps/rejected": -347.24871826171875,
154
+ "loss": 0.7325,
155
+ "rewards/accuracies": 0.5625,
156
+ "rewards/chosen": 0.04706032946705818,
157
+ "rewards/margins": 0.05596155673265457,
158
+ "rewards/rejected": -0.008901228196918964,
159
+ "step": 90
160
+ },
161
+ {
162
+ "epoch": 0.2093692750588851,
163
+ "grad_norm": 36.88448509348576,
164
+ "learning_rate": 9.999463737538052e-07,
165
+ "logits/chosen": 5.017066955566406,
166
+ "logits/rejected": 5.157826900482178,
167
+ "logps/chosen": -453.6114196777344,
168
+ "logps/rejected": -376.13214111328125,
169
+ "loss": 0.7296,
170
+ "rewards/accuracies": 0.543749988079071,
171
+ "rewards/chosen": 0.03123999759554863,
172
+ "rewards/margins": 0.038888636976480484,
173
+ "rewards/rejected": -0.007648637983947992,
174
+ "step": 100
175
+ },
176
+ {
177
+ "epoch": 0.2093692750588851,
178
+ "eval_logits/chosen": 4.755386829376221,
179
+ "eval_logits/rejected": 5.127224445343018,
180
+ "eval_logps/chosen": -443.48101806640625,
181
+ "eval_logps/rejected": -377.5273742675781,
182
+ "eval_loss": 0.7357296347618103,
183
+ "eval_rewards/accuracies": 0.5515872836112976,
184
+ "eval_rewards/chosen": 0.01168334111571312,
185
+ "eval_rewards/margins": 0.03692733868956566,
186
+ "eval_rewards/rejected": -0.025244001299142838,
187
+ "eval_runtime": 21.3186,
188
+ "eval_samples_per_second": 93.815,
189
+ "eval_steps_per_second": 2.955,
190
+ "step": 100
191
+ },
192
+ {
193
+ "epoch": 0.23030620256477363,
194
+ "grad_norm": 36.21924654761057,
195
+ "learning_rate": 9.993432105822034e-07,
196
+ "logits/chosen": 4.768385887145996,
197
+ "logits/rejected": 5.076653957366943,
198
+ "logps/chosen": -449.16375732421875,
199
+ "logps/rejected": -369.2919006347656,
200
+ "loss": 0.7211,
201
+ "rewards/accuracies": 0.42500001192092896,
202
+ "rewards/chosen": -0.04003235697746277,
203
+ "rewards/margins": -0.059906214475631714,
204
+ "rewards/rejected": 0.019873863086104393,
205
+ "step": 110
206
+ },
207
+ {
208
+ "epoch": 0.2512431300706621,
209
+ "grad_norm": 36.12599439727971,
210
+ "learning_rate": 9.980706626858607e-07,
211
+ "logits/chosen": 5.0697174072265625,
212
+ "logits/rejected": 5.350961208343506,
213
+ "logps/chosen": -392.9084777832031,
214
+ "logps/rejected": -342.9964294433594,
215
+ "loss": 0.7213,
216
+ "rewards/accuracies": 0.543749988079071,
217
+ "rewards/chosen": 0.005513651762157679,
218
+ "rewards/margins": 0.07320307195186615,
219
+ "rewards/rejected": -0.06768941879272461,
220
+ "step": 120
221
+ },
222
+ {
223
+ "epoch": 0.2721800575765506,
224
+ "grad_norm": 43.81008737879554,
225
+ "learning_rate": 9.961304359538434e-07,
226
+ "logits/chosen": 4.7396440505981445,
227
+ "logits/rejected": 5.110291957855225,
228
+ "logps/chosen": -445.08209228515625,
229
+ "logps/rejected": -356.9689636230469,
230
+ "loss": 0.7319,
231
+ "rewards/accuracies": 0.543749988079071,
232
+ "rewards/chosen": 0.03245898336172104,
233
+ "rewards/margins": 0.0837341919541359,
234
+ "rewards/rejected": -0.05127520486712456,
235
+ "step": 130
236
+ },
237
+ {
238
+ "epoch": 0.29311698508243916,
239
+ "grad_norm": 37.12091478913465,
240
+ "learning_rate": 9.935251313189563e-07,
241
+ "logits/chosen": 4.5339274406433105,
242
+ "logits/rejected": 5.020459175109863,
243
+ "logps/chosen": -473.4126892089844,
244
+ "logps/rejected": -364.12939453125,
245
+ "loss": 0.7193,
246
+ "rewards/accuracies": 0.5562499761581421,
247
+ "rewards/chosen": 0.008073748089373112,
248
+ "rewards/margins": 0.035515300929546356,
249
+ "rewards/rejected": -0.02744155190885067,
250
+ "step": 140
251
+ },
252
+ {
253
+ "epoch": 0.31405391258832765,
254
+ "grad_norm": 36.18564480374988,
255
+ "learning_rate": 9.902582412711118e-07,
256
+ "logits/chosen": 4.540812969207764,
257
+ "logits/rejected": 4.964258193969727,
258
+ "logps/chosen": -426.5033264160156,
259
+ "logps/rejected": -353.1463317871094,
260
+ "loss": 0.7232,
261
+ "rewards/accuracies": 0.512499988079071,
262
+ "rewards/chosen": -0.01480414904654026,
263
+ "rewards/margins": 0.06241898611187935,
264
+ "rewards/rejected": -0.07722313702106476,
265
+ "step": 150
266
+ },
267
+ {
268
+ "epoch": 0.33499084009421615,
269
+ "grad_norm": 32.90233228487631,
270
+ "learning_rate": 9.86334145175542e-07,
271
+ "logits/chosen": 4.807779788970947,
272
+ "logits/rejected": 5.042156219482422,
273
+ "logps/chosen": -396.0440673828125,
274
+ "logps/rejected": -360.52886962890625,
275
+ "loss": 0.7013,
276
+ "rewards/accuracies": 0.5249999761581421,
277
+ "rewards/chosen": 0.010227044112980366,
278
+ "rewards/margins": 0.055318038910627365,
279
+ "rewards/rejected": -0.045090995728969574,
280
+ "step": 160
281
+ },
282
+ {
283
+ "epoch": 0.3559277676001047,
284
+ "grad_norm": 37.311964741290105,
285
+ "learning_rate": 9.817581034021272e-07,
286
+ "logits/chosen": 4.897703170776367,
287
+ "logits/rejected": 5.062272071838379,
288
+ "logps/chosen": -389.55810546875,
289
+ "logps/rejected": -329.748779296875,
290
+ "loss": 0.7043,
291
+ "rewards/accuracies": 0.518750011920929,
292
+ "rewards/chosen": -0.008063828572630882,
293
+ "rewards/margins": 0.049024712294340134,
294
+ "rewards/rejected": -0.05708853527903557,
295
+ "step": 170
296
+ },
297
+ {
298
+ "epoch": 0.3768646951059932,
299
+ "grad_norm": 34.14102924438106,
300
+ "learning_rate": 9.765362502737097e-07,
301
+ "logits/chosen": 5.039429187774658,
302
+ "logits/rejected": 5.049492835998535,
303
+ "logps/chosen": -384.9471130371094,
304
+ "logps/rejected": -381.6601257324219,
305
+ "loss": 0.7091,
306
+ "rewards/accuracies": 0.5562499761581421,
307
+ "rewards/chosen": 0.004380516707897186,
308
+ "rewards/margins": 0.038515396416187286,
309
+ "rewards/rejected": -0.0341348834335804,
310
+ "step": 180
311
+ },
312
+ {
313
+ "epoch": 0.39780162261188173,
314
+ "grad_norm": 35.88568083270778,
315
+ "learning_rate": 9.706755858428485e-07,
316
+ "logits/chosen": 5.025214195251465,
317
+ "logits/rejected": 5.097342014312744,
318
+ "logps/chosen": -397.56402587890625,
319
+ "logps/rejected": -396.12799072265625,
320
+ "loss": 0.7161,
321
+ "rewards/accuracies": 0.518750011920929,
322
+ "rewards/chosen": -0.056682147085666656,
323
+ "rewards/margins": 0.023842817172408104,
324
+ "rewards/rejected": -0.08052496612071991,
325
+ "step": 190
326
+ },
327
+ {
328
+ "epoch": 0.4187385501177702,
329
+ "grad_norm": 38.60485416145746,
330
+ "learning_rate": 9.641839665080363e-07,
331
+ "logits/chosen": 5.1590471267700195,
332
+ "logits/rejected": 5.290652275085449,
333
+ "logps/chosen": -399.9522399902344,
334
+ "logps/rejected": -363.53936767578125,
335
+ "loss": 0.7062,
336
+ "rewards/accuracies": 0.53125,
337
+ "rewards/chosen": 0.04297986626625061,
338
+ "rewards/margins": 0.06266864389181137,
339
+ "rewards/rejected": -0.019688773900270462,
340
+ "step": 200
341
+ },
342
+ {
343
+ "epoch": 0.4187385501177702,
344
+ "eval_logits/chosen": 4.725487232208252,
345
+ "eval_logits/rejected": 5.087867736816406,
346
+ "eval_logps/chosen": -443.55450439453125,
347
+ "eval_logps/rejected": -377.6705627441406,
348
+ "eval_loss": 0.6988219022750854,
349
+ "eval_rewards/accuracies": 0.567460298538208,
350
+ "eval_rewards/chosen": -0.025081120431423187,
351
+ "eval_rewards/margins": 0.07174728065729141,
352
+ "eval_rewards/rejected": -0.0968284010887146,
353
+ "eval_runtime": 21.5315,
354
+ "eval_samples_per_second": 92.887,
355
+ "eval_steps_per_second": 2.926,
356
+ "step": 200
357
+ },
358
+ {
359
+ "epoch": 0.4396754776236587,
360
+ "grad_norm": 36.10684145537435,
361
+ "learning_rate": 9.570700944819582e-07,
362
+ "logits/chosen": 4.827897548675537,
363
+ "logits/rejected": 5.154609680175781,
364
+ "logps/chosen": -451.2969665527344,
365
+ "logps/rejected": -372.116455078125,
366
+ "loss": 0.7141,
367
+ "rewards/accuracies": 0.543749988079071,
368
+ "rewards/chosen": -0.07117662578821182,
369
+ "rewards/margins": 0.02444976009428501,
370
+ "rewards/rejected": -0.09562637656927109,
371
+ "step": 210
372
+ },
373
+ {
374
+ "epoch": 0.46061240512954726,
375
+ "grad_norm": 33.13953697631782,
376
+ "learning_rate": 9.493435061259129e-07,
377
+ "logits/chosen": 5.24191427230835,
378
+ "logits/rejected": 5.477172374725342,
379
+ "logps/chosen": -365.3572692871094,
380
+ "logps/rejected": -345.34814453125,
381
+ "loss": 0.7138,
382
+ "rewards/accuracies": 0.48750001192092896,
383
+ "rewards/chosen": -0.07684006541967392,
384
+ "rewards/margins": -0.033464811742305756,
385
+ "rewards/rejected": -0.043375253677368164,
386
+ "step": 220
387
+ },
388
+ {
389
+ "epoch": 0.48154933263543576,
390
+ "grad_norm": 36.46869252662201,
391
+ "learning_rate": 9.4101455916603e-07,
392
+ "logits/chosen": 4.996638298034668,
393
+ "logits/rejected": 5.203185081481934,
394
+ "logps/chosen": -390.2169494628906,
395
+ "logps/rejected": -381.74090576171875,
396
+ "loss": 0.7027,
397
+ "rewards/accuracies": 0.574999988079071,
398
+ "rewards/chosen": -0.006761978380382061,
399
+ "rewards/margins": 0.11298926174640656,
400
+ "rewards/rejected": -0.11975125223398209,
401
+ "step": 230
402
+ },
403
+ {
404
+ "epoch": 0.5024862601413242,
405
+ "grad_norm": 32.465417090707724,
406
+ "learning_rate": 9.320944188084241e-07,
407
+ "logits/chosen": 4.961588382720947,
408
+ "logits/rejected": 5.104936122894287,
409
+ "logps/chosen": -405.73651123046875,
410
+ "logps/rejected": -368.39312744140625,
411
+ "loss": 0.6966,
412
+ "rewards/accuracies": 0.612500011920929,
413
+ "rewards/chosen": 0.04590672254562378,
414
+ "rewards/margins": 0.21882851421833038,
415
+ "rewards/rejected": -0.1729217916727066,
416
+ "step": 240
417
+ },
418
+ {
419
+ "epoch": 0.5234231876472127,
420
+ "grad_norm": 32.56446548910322,
421
+ "learning_rate": 9.225950427718974e-07,
422
+ "logits/chosen": 4.295259475708008,
423
+ "logits/rejected": 4.731950759887695,
424
+ "logps/chosen": -457.6085510253906,
425
+ "logps/rejected": -378.92083740234375,
426
+ "loss": 0.6895,
427
+ "rewards/accuracies": 0.5249999761581421,
428
+ "rewards/chosen": -0.08796132355928421,
429
+ "rewards/margins": 0.07854396849870682,
430
+ "rewards/rejected": -0.16650527715682983,
431
+ "step": 250
432
+ },
433
+ {
434
+ "epoch": 0.5443601151531012,
435
+ "grad_norm": 37.3555609307497,
436
+ "learning_rate": 9.125291652582547e-07,
437
+ "logits/chosen": 4.772681713104248,
438
+ "logits/rejected": 4.774602890014648,
439
+ "logps/chosen": -429.554931640625,
440
+ "logps/rejected": -350.5638122558594,
441
+ "loss": 0.697,
442
+ "rewards/accuracies": 0.574999988079071,
443
+ "rewards/chosen": 0.017600687220692635,
444
+ "rewards/margins": 0.15435068309307098,
445
+ "rewards/rejected": -0.1367500126361847,
446
+ "step": 260
447
+ },
448
+ {
449
+ "epoch": 0.5652970426589898,
450
+ "grad_norm": 33.42195670184768,
451
+ "learning_rate": 9.019102798817195e-07,
452
+ "logits/chosen": 4.580355644226074,
453
+ "logits/rejected": 5.011557102203369,
454
+ "logps/chosen": -446.68438720703125,
455
+ "logps/rejected": -380.5400390625,
456
+ "loss": 0.6778,
457
+ "rewards/accuracies": 0.5625,
458
+ "rewards/chosen": 0.019351882860064507,
459
+ "rewards/margins": 0.15824225544929504,
460
+ "rewards/rejected": -0.13889038562774658,
461
+ "step": 270
462
+ },
463
+ {
464
+ "epoch": 0.5862339701648783,
465
+ "grad_norm": 39.119947805234894,
466
+ "learning_rate": 8.90752621580335e-07,
467
+ "logits/chosen": 5.025314807891846,
468
+ "logits/rejected": 5.18468713760376,
469
+ "logps/chosen": -424.27191162109375,
470
+ "logps/rejected": -344.4115295410156,
471
+ "loss": 0.7073,
472
+ "rewards/accuracies": 0.625,
473
+ "rewards/chosen": 0.023305395618081093,
474
+ "rewards/margins": 0.14927226305007935,
475
+ "rewards/rejected": -0.125966876745224,
476
+ "step": 280
477
+ },
478
+ {
479
+ "epoch": 0.6071708976707668,
480
+ "grad_norm": 33.472133866310024,
481
+ "learning_rate": 8.79071147533597e-07,
482
+ "logits/chosen": 4.961835861206055,
483
+ "logits/rejected": 5.123082637786865,
484
+ "logps/chosen": -400.4245300292969,
485
+ "logps/rejected": -388.8963623046875,
486
+ "loss": 0.6687,
487
+ "rewards/accuracies": 0.643750011920929,
488
+ "rewards/chosen": 0.08449111878871918,
489
+ "rewards/margins": 0.2874522805213928,
490
+ "rewards/rejected": -0.20296116173267365,
491
+ "step": 290
492
+ },
493
+ {
494
+ "epoch": 0.6281078251766553,
495
+ "grad_norm": 38.29458361274397,
496
+ "learning_rate": 8.668815171119019e-07,
497
+ "logits/chosen": 4.6071085929870605,
498
+ "logits/rejected": 4.85768985748291,
499
+ "logps/chosen": -445.59393310546875,
500
+ "logps/rejected": -373.83636474609375,
501
+ "loss": 0.6782,
502
+ "rewards/accuracies": 0.643750011920929,
503
+ "rewards/chosen": 0.043323811143636703,
504
+ "rewards/margins": 0.2540794312953949,
505
+ "rewards/rejected": -0.2107556313276291,
506
+ "step": 300
507
+ },
508
+ {
509
+ "epoch": 0.6281078251766553,
510
+ "eval_logits/chosen": 4.662118434906006,
511
+ "eval_logits/rejected": 5.016141414642334,
512
+ "eval_logps/chosen": -443.56884765625,
513
+ "eval_logps/rejected": -377.883056640625,
514
+ "eval_loss": 0.6942777037620544,
515
+ "eval_rewards/accuracies": 0.567460298538208,
516
+ "eval_rewards/chosen": -0.03225937858223915,
517
+ "eval_rewards/margins": 0.17080551385879517,
518
+ "eval_rewards/rejected": -0.20306488871574402,
519
+ "eval_runtime": 21.6344,
520
+ "eval_samples_per_second": 92.445,
521
+ "eval_steps_per_second": 2.912,
522
+ "step": 300
523
+ },
524
+ {
525
+ "epoch": 0.6490447526825438,
526
+ "grad_norm": 35.277636776310345,
527
+ "learning_rate": 8.54200070884685e-07,
528
+ "logits/chosen": 4.7398271560668945,
529
+ "logits/rejected": 5.0438690185546875,
530
+ "logps/chosen": -455.08074951171875,
531
+ "logps/rejected": -346.21905517578125,
532
+ "loss": 0.6648,
533
+ "rewards/accuracies": 0.5874999761581421,
534
+ "rewards/chosen": -0.00532907247543335,
535
+ "rewards/margins": 0.22818481922149658,
536
+ "rewards/rejected": -0.23351387679576874,
537
+ "step": 310
538
+ },
539
+ {
540
+ "epoch": 0.6699816801884323,
541
+ "grad_norm": 36.1242464612453,
542
+ "learning_rate": 8.410438087153911e-07,
543
+ "logits/chosen": 4.823008060455322,
544
+ "logits/rejected": 4.949624538421631,
545
+ "logps/chosen": -420.04150390625,
546
+ "logps/rejected": -346.31134033203125,
547
+ "loss": 0.6633,
548
+ "rewards/accuracies": 0.5625,
549
+ "rewards/chosen": -0.02057427167892456,
550
+ "rewards/margins": 0.1925923228263855,
551
+ "rewards/rejected": -0.21316656470298767,
552
+ "step": 320
553
+ },
554
+ {
555
+ "epoch": 0.6909186076943209,
556
+ "grad_norm": 34.485635067716444,
557
+ "learning_rate": 8.274303669726426e-07,
558
+ "logits/chosen": 4.866278171539307,
559
+ "logits/rejected": 5.084838390350342,
560
+ "logps/chosen": -413.07647705078125,
561
+ "logps/rejected": -359.72637939453125,
562
+ "loss": 0.6964,
563
+ "rewards/accuracies": 0.5562499761581421,
564
+ "rewards/chosen": -0.0661710649728775,
565
+ "rewards/margins": 0.11275775730609894,
566
+ "rewards/rejected": -0.17892882227897644,
567
+ "step": 330
568
+ },
569
+ {
570
+ "epoch": 0.7118555352002094,
571
+ "grad_norm": 36.021204090397475,
572
+ "learning_rate": 8.133779948881513e-07,
573
+ "logits/chosen": 4.962647914886475,
574
+ "logits/rejected": 5.274256229400635,
575
+ "logps/chosen": -423.34796142578125,
576
+ "logps/rejected": -374.83831787109375,
577
+ "loss": 0.6843,
578
+ "rewards/accuracies": 0.581250011920929,
579
+ "rewards/chosen": -0.06308840215206146,
580
+ "rewards/margins": 0.14286582171916962,
581
+ "rewards/rejected": -0.20595422387123108,
582
+ "step": 340
583
+ },
584
+ {
585
+ "epoch": 0.7327924627060979,
586
+ "grad_norm": 40.80633953486743,
587
+ "learning_rate": 7.989055300930704e-07,
588
+ "logits/chosen": 4.9410552978515625,
589
+ "logits/rejected": 5.171365737915039,
590
+ "logps/chosen": -401.3800048828125,
591
+ "logps/rejected": -339.8207702636719,
592
+ "loss": 0.6799,
593
+ "rewards/accuracies": 0.581250011920929,
594
+ "rewards/chosen": -0.05132729932665825,
595
+ "rewards/margins": 0.14917483925819397,
596
+ "rewards/rejected": -0.20050212740898132,
597
+ "step": 350
598
+ },
599
+ {
600
+ "epoch": 0.7537293902119864,
601
+ "grad_norm": 32.290367261199215,
602
+ "learning_rate": 7.840323733655778e-07,
603
+ "logits/chosen": 4.760105609893799,
604
+ "logits/rejected": 4.936800956726074,
605
+ "logps/chosen": -475.94305419921875,
606
+ "logps/rejected": -373.4317626953125,
607
+ "loss": 0.6723,
608
+ "rewards/accuracies": 0.65625,
609
+ "rewards/chosen": 0.040363796055316925,
610
+ "rewards/margins": 0.26596716046333313,
611
+ "rewards/rejected": -0.2256033718585968,
612
+ "step": 360
613
+ },
614
+ {
615
+ "epoch": 0.7746663177178749,
616
+ "grad_norm": 33.44911142948228,
617
+ "learning_rate": 7.687784626235447e-07,
618
+ "logits/chosen": 4.649796485900879,
619
+ "logits/rejected": 4.882054328918457,
620
+ "logps/chosen": -437.54791259765625,
621
+ "logps/rejected": -343.1330871582031,
622
+ "loss": 0.6722,
623
+ "rewards/accuracies": 0.6000000238418579,
624
+ "rewards/chosen": -0.03584844991564751,
625
+ "rewards/margins": 0.2080194056034088,
626
+ "rewards/rejected": -0.2438678741455078,
627
+ "step": 370
628
+ },
629
+ {
630
+ "epoch": 0.7956032452237635,
631
+ "grad_norm": 32.64019199720913,
632
+ "learning_rate": 7.531642461971514e-07,
633
+ "logits/chosen": 4.7331953048706055,
634
+ "logits/rejected": 5.047934532165527,
635
+ "logps/chosen": -434.0751953125,
636
+ "logps/rejected": -363.1179504394531,
637
+ "loss": 0.673,
638
+ "rewards/accuracies": 0.581250011920929,
639
+ "rewards/chosen": -0.03724004700779915,
640
+ "rewards/margins": 0.23209133744239807,
641
+ "rewards/rejected": -0.2693313956260681,
642
+ "step": 380
643
+ },
644
+ {
645
+ "epoch": 0.816540172729652,
646
+ "grad_norm": 33.38515796622506,
647
+ "learning_rate": 7.372106554172801e-07,
648
+ "logits/chosen": 4.660643577575684,
649
+ "logits/rejected": 4.7719621658325195,
650
+ "logps/chosen": -434.41015625,
651
+ "logps/rejected": -394.9471130371094,
652
+ "loss": 0.6778,
653
+ "rewards/accuracies": 0.59375,
654
+ "rewards/chosen": 0.021881069988012314,
655
+ "rewards/margins": 0.2415298968553543,
656
+ "rewards/rejected": -0.2196488082408905,
657
+ "step": 390
658
+ },
659
+ {
660
+ "epoch": 0.8374771002355405,
661
+ "grad_norm": 32.60957530709497,
662
+ "learning_rate": 7.209390765564318e-07,
663
+ "logits/chosen": 4.807684421539307,
664
+ "logits/rejected": 5.217709541320801,
665
+ "logps/chosen": -368.07122802734375,
666
+ "logps/rejected": -328.12066650390625,
667
+ "loss": 0.6863,
668
+ "rewards/accuracies": 0.59375,
669
+ "rewards/chosen": -0.03271085396409035,
670
+ "rewards/margins": 0.1988353729248047,
671
+ "rewards/rejected": -0.23154623806476593,
672
+ "step": 400
673
+ },
674
+ {
675
+ "epoch": 0.8374771002355405,
676
+ "eval_logits/chosen": 4.645900249481201,
677
+ "eval_logits/rejected": 4.999230861663818,
678
+ "eval_logps/chosen": -443.68084716796875,
679
+ "eval_logps/rejected": -378.0348205566406,
680
+ "eval_loss": 0.6756832003593445,
681
+ "eval_rewards/accuracies": 0.5992063283920288,
682
+ "eval_rewards/chosen": -0.08822782337665558,
683
+ "eval_rewards/margins": 0.19070643186569214,
684
+ "eval_rewards/rejected": -0.2789342403411865,
685
+ "eval_runtime": 21.4973,
686
+ "eval_samples_per_second": 93.035,
687
+ "eval_steps_per_second": 2.931,
688
+ "step": 400
689
+ },
690
+ {
691
+ "epoch": 0.8584140277414289,
692
+ "grad_norm": 80.32344495768098,
693
+ "learning_rate": 7.043713221597773e-07,
694
+ "logits/chosen": 4.9558234214782715,
695
+ "logits/rejected": 5.171336650848389,
696
+ "logps/chosen": -464.4634704589844,
697
+ "logps/rejected": -378.52130126953125,
698
+ "loss": 0.6691,
699
+ "rewards/accuracies": 0.5687500238418579,
700
+ "rewards/chosen": -0.028427015990018845,
701
+ "rewards/margins": 0.16248683631420135,
702
+ "rewards/rejected": -0.1909138560295105,
703
+ "step": 410
704
+ },
705
+ {
706
+ "epoch": 0.8793509552473174,
707
+ "grad_norm": 38.003974257278905,
708
+ "learning_rate": 6.875296018047809e-07,
709
+ "logits/chosen": 5.062918663024902,
710
+ "logits/rejected": 5.093894958496094,
711
+ "logps/chosen": -414.597900390625,
712
+ "logps/rejected": -392.76422119140625,
713
+ "loss": 0.6778,
714
+ "rewards/accuracies": 0.612500011920929,
715
+ "rewards/chosen": -0.05205658823251724,
716
+ "rewards/margins": 0.166357159614563,
717
+ "rewards/rejected": -0.21841374039649963,
718
+ "step": 420
719
+ },
720
+ {
721
+ "epoch": 0.9002878827532059,
722
+ "grad_norm": 33.39326538286459,
723
+ "learning_rate": 6.704364923285857e-07,
724
+ "logits/chosen": 4.783626556396484,
725
+ "logits/rejected": 5.061443328857422,
726
+ "logps/chosen": -454.7694396972656,
727
+ "logps/rejected": -349.71099853515625,
728
+ "loss": 0.6613,
729
+ "rewards/accuracies": 0.6312500238418579,
730
+ "rewards/chosen": -0.03129550814628601,
731
+ "rewards/margins": 0.2672887146472931,
732
+ "rewards/rejected": -0.2985842227935791,
733
+ "step": 430
734
+ },
735
+ {
736
+ "epoch": 0.9212248102590945,
737
+ "grad_norm": 33.88594593881104,
738
+ "learning_rate": 6.531149075630796e-07,
739
+ "logits/chosen": 4.762629985809326,
740
+ "logits/rejected": 4.992688179016113,
741
+ "logps/chosen": -422.49639892578125,
742
+ "logps/rejected": -342.6626892089844,
743
+ "loss": 0.6829,
744
+ "rewards/accuracies": 0.574999988079071,
745
+ "rewards/chosen": -0.08267354220151901,
746
+ "rewards/margins": 0.18921074271202087,
747
+ "rewards/rejected": -0.2718842923641205,
748
+ "step": 440
749
+ },
750
+ {
751
+ "epoch": 0.942161737764983,
752
+ "grad_norm": 34.194378360359096,
753
+ "learning_rate": 6.355880676182085e-07,
754
+ "logits/chosen": 4.86130952835083,
755
+ "logits/rejected": 5.088041305541992,
756
+ "logps/chosen": -423.82366943359375,
757
+ "logps/rejected": -386.20172119140625,
758
+ "loss": 0.6777,
759
+ "rewards/accuracies": 0.637499988079071,
760
+ "rewards/chosen": -0.029518108814954758,
761
+ "rewards/margins": 0.2812921106815338,
762
+ "rewards/rejected": -0.3108102083206177,
763
+ "step": 450
764
+ },
765
+ {
766
+ "epoch": 0.9630986652708715,
767
+ "grad_norm": 35.220161431379815,
768
+ "learning_rate": 6.178794677547137e-07,
769
+ "logits/chosen": 4.96859073638916,
770
+ "logits/rejected": 5.295912265777588,
771
+ "logps/chosen": -408.28228759765625,
772
+ "logps/rejected": -337.6819763183594,
773
+ "loss": 0.6573,
774
+ "rewards/accuracies": 0.65625,
775
+ "rewards/chosen": -0.02989841438829899,
776
+ "rewards/margins": 0.3131854832172394,
777
+ "rewards/rejected": -0.343083918094635,
778
+ "step": 460
779
+ },
780
+ {
781
+ "epoch": 0.98403559277676,
782
+ "grad_norm": 36.11741005068747,
783
+ "learning_rate": 6.000128468880222e-07,
784
+ "logits/chosen": 4.616504669189453,
785
+ "logits/rejected": 4.935946464538574,
786
+ "logps/chosen": -435.3017578125,
787
+ "logps/rejected": -375.13800048828125,
788
+ "loss": 0.6647,
789
+ "rewards/accuracies": 0.606249988079071,
790
+ "rewards/chosen": -0.11524273455142975,
791
+ "rewards/margins": 0.2546766698360443,
792
+ "rewards/rejected": -0.3699193596839905,
793
+ "step": 470
794
+ },
795
+ {
796
+ "epoch": 1.0049725202826485,
797
+ "grad_norm": 32.471857091487834,
798
+ "learning_rate": 5.820121557655108e-07,
799
+ "logits/chosen": 4.9493536949157715,
800
+ "logits/rejected": 5.226868152618408,
801
+ "logps/chosen": -423.6285095214844,
802
+ "logps/rejected": -362.1949768066406,
803
+ "loss": 0.6629,
804
+ "rewards/accuracies": 0.637499988079071,
805
+ "rewards/chosen": 0.05261852219700813,
806
+ "rewards/margins": 0.3280298113822937,
807
+ "rewards/rejected": -0.27541130781173706,
808
+ "step": 480
809
+ },
810
+ {
811
+ "epoch": 1.025909447788537,
812
+ "grad_norm": 39.51652905408664,
813
+ "learning_rate": 5.639015248598023e-07,
814
+ "logits/chosen": 4.762259006500244,
815
+ "logits/rejected": 5.021244525909424,
816
+ "logps/chosen": -424.96697998046875,
817
+ "logps/rejected": -342.76666259765625,
818
+ "loss": 0.6644,
819
+ "rewards/accuracies": 0.6000000238418579,
820
+ "rewards/chosen": -0.03716667741537094,
821
+ "rewards/margins": 0.2011403739452362,
822
+ "rewards/rejected": -0.23830704391002655,
823
+ "step": 490
824
+ },
825
+ {
826
+ "epoch": 1.0468463752944255,
827
+ "grad_norm": 34.43579926142672,
828
+ "learning_rate": 5.457052320211339e-07,
829
+ "logits/chosen": 4.543593406677246,
830
+ "logits/rejected": 4.786489009857178,
831
+ "logps/chosen": -434.46746826171875,
832
+ "logps/rejected": -367.75689697265625,
833
+ "loss": 0.6836,
834
+ "rewards/accuracies": 0.606249988079071,
835
+ "rewards/chosen": -0.08059108257293701,
836
+ "rewards/margins": 0.23346829414367676,
837
+ "rewards/rejected": -0.3140593469142914,
838
+ "step": 500
839
+ },
840
+ {
841
+ "epoch": 1.0468463752944255,
842
+ "eval_logits/chosen": 4.617003440856934,
843
+ "eval_logits/rejected": 4.9695563316345215,
844
+ "eval_logps/chosen": -443.69580078125,
845
+ "eval_logps/rejected": -378.1418762207031,
846
+ "eval_loss": 0.6708300113677979,
847
+ "eval_rewards/accuracies": 0.6349206566810608,
848
+ "eval_rewards/chosen": -0.09571509808301926,
849
+ "eval_rewards/margins": 0.23677198588848114,
850
+ "eval_rewards/rejected": -0.3324871063232422,
851
+ "eval_runtime": 20.9626,
852
+ "eval_samples_per_second": 95.408,
853
+ "eval_steps_per_second": 3.005,
854
+ "step": 500
855
+ },
856
+ {
857
+ "epoch": 1.067783302800314,
858
+ "grad_norm": 33.3201987416808,
859
+ "learning_rate": 5.274476699321637e-07,
860
+ "logits/chosen": 4.583409786224365,
861
+ "logits/rejected": 4.803020477294922,
862
+ "logps/chosen": -390.48565673828125,
863
+ "logps/rejected": -351.6776123046875,
864
+ "loss": 0.6779,
865
+ "rewards/accuracies": 0.6312500238418579,
866
+ "rewards/chosen": -0.0327589213848114,
867
+ "rewards/margins": 0.32544368505477905,
868
+ "rewards/rejected": -0.35820263624191284,
869
+ "step": 510
870
+ },
871
+ {
872
+ "epoch": 1.0887202303062025,
873
+ "grad_norm": 30.742730814185435,
874
+ "learning_rate": 5.091533134088387e-07,
875
+ "logits/chosen": 4.493949890136719,
876
+ "logits/rejected": 4.9839911460876465,
877
+ "logps/chosen": -383.7958984375,
878
+ "logps/rejected": -354.36480712890625,
879
+ "loss": 0.656,
880
+ "rewards/accuracies": 0.5375000238418579,
881
+ "rewards/chosen": -0.07341745495796204,
882
+ "rewards/margins": 0.19709812104701996,
883
+ "rewards/rejected": -0.2705155909061432,
884
+ "step": 520
885
+ },
886
+ {
887
+ "epoch": 1.109657157812091,
888
+ "grad_norm": 34.05900047947194,
889
+ "learning_rate": 4.908466865911614e-07,
890
+ "logits/chosen": 4.793222904205322,
891
+ "logits/rejected": 5.078155517578125,
892
+ "logps/chosen": -401.0002746582031,
893
+ "logps/rejected": -340.4061279296875,
894
+ "loss": 0.6618,
895
+ "rewards/accuracies": 0.6499999761581421,
896
+ "rewards/chosen": -0.05051114410161972,
897
+ "rewards/margins": 0.27152642607688904,
898
+ "rewards/rejected": -0.32203757762908936,
899
+ "step": 530
900
+ },
901
+ {
902
+ "epoch": 1.1305940853179797,
903
+ "grad_norm": 30.483486401054424,
904
+ "learning_rate": 4.7255233006783624e-07,
905
+ "logits/chosen": 4.857717990875244,
906
+ "logits/rejected": 5.0497636795043945,
907
+ "logps/chosen": -375.65362548828125,
908
+ "logps/rejected": -330.26165771484375,
909
+ "loss": 0.6544,
910
+ "rewards/accuracies": 0.7124999761581421,
911
+ "rewards/chosen": -0.02352207899093628,
912
+ "rewards/margins": 0.35729408264160156,
913
+ "rewards/rejected": -0.38081610202789307,
914
+ "step": 540
915
+ },
916
+ {
917
+ "epoch": 1.151531012823868,
918
+ "grad_norm": 35.09603470685652,
919
+ "learning_rate": 4.5429476797886617e-07,
920
+ "logits/chosen": 4.932369232177734,
921
+ "logits/rejected": 5.050224781036377,
922
+ "logps/chosen": -430.0126953125,
923
+ "logps/rejected": -331.1691589355469,
924
+ "loss": 0.6599,
925
+ "rewards/accuracies": 0.6312500238418579,
926
+ "rewards/chosen": -0.0060789333656430244,
927
+ "rewards/margins": 0.25288811326026917,
928
+ "rewards/rejected": -0.25896701216697693,
929
+ "step": 550
930
+ },
931
+ {
932
+ "epoch": 1.1724679403297567,
933
+ "grad_norm": 40.64422646125966,
934
+ "learning_rate": 4.3609847514019763e-07,
935
+ "logits/chosen": 4.637743949890137,
936
+ "logits/rejected": 5.000674724578857,
937
+ "logps/chosen": -420.3258361816406,
938
+ "logps/rejected": -362.2751159667969,
939
+ "loss": 0.6718,
940
+ "rewards/accuracies": 0.574999988079071,
941
+ "rewards/chosen": -0.0094971414655447,
942
+ "rewards/margins": 0.22678783535957336,
943
+ "rewards/rejected": -0.23628497123718262,
944
+ "step": 560
945
+ },
946
+ {
947
+ "epoch": 1.193404867835645,
948
+ "grad_norm": 32.638009640148645,
949
+ "learning_rate": 4.179878442344892e-07,
950
+ "logits/chosen": 4.855754375457764,
951
+ "logits/rejected": 4.871184349060059,
952
+ "logps/chosen": -384.08660888671875,
953
+ "logps/rejected": -371.4262390136719,
954
+ "loss": 0.6766,
955
+ "rewards/accuracies": 0.625,
956
+ "rewards/chosen": -0.05095939710736275,
957
+ "rewards/margins": 0.28148993849754333,
958
+ "rewards/rejected": -0.332449346780777,
959
+ "step": 570
960
+ },
961
+ {
962
+ "epoch": 1.2143417953415336,
963
+ "grad_norm": 35.519971577107064,
964
+ "learning_rate": 3.9998715311197783e-07,
965
+ "logits/chosen": 4.73850679397583,
966
+ "logits/rejected": 5.173120021820068,
967
+ "logps/chosen": -414.8775329589844,
968
+ "logps/rejected": -341.5818786621094,
969
+ "loss": 0.6508,
970
+ "rewards/accuracies": 0.6000000238418579,
971
+ "rewards/chosen": -0.09668377041816711,
972
+ "rewards/margins": 0.25211262702941895,
973
+ "rewards/rejected": -0.34879642724990845,
974
+ "step": 580
975
+ },
976
+ {
977
+ "epoch": 1.235278722847422,
978
+ "grad_norm": 34.20580765627037,
979
+ "learning_rate": 3.821205322452863e-07,
980
+ "logits/chosen": 4.916988372802734,
981
+ "logits/rejected": 5.1998610496521,
982
+ "logps/chosen": -448.5626525878906,
983
+ "logps/rejected": -367.84027099609375,
984
+ "loss": 0.644,
985
+ "rewards/accuracies": 0.625,
986
+ "rewards/chosen": -0.07886572182178497,
987
+ "rewards/margins": 0.3578983247280121,
988
+ "rewards/rejected": -0.43676406145095825,
989
+ "step": 590
990
+ },
991
+ {
992
+ "epoch": 1.2562156503533106,
993
+ "grad_norm": 33.854286929995176,
994
+ "learning_rate": 3.6441193238179146e-07,
995
+ "logits/chosen": 4.852269649505615,
996
+ "logits/rejected": 4.903324127197266,
997
+ "logps/chosen": -446.4149475097656,
998
+ "logps/rejected": -423.3356018066406,
999
+ "loss": 0.6349,
1000
+ "rewards/accuracies": 0.581250011920929,
1001
+ "rewards/chosen": -0.14010193943977356,
1002
+ "rewards/margins": 0.15067996084690094,
1003
+ "rewards/rejected": -0.2907818853855133,
1004
+ "step": 600
1005
+ },
1006
+ {
1007
+ "epoch": 1.2562156503533106,
1008
+ "eval_logits/chosen": 4.62031364440918,
1009
+ "eval_logits/rejected": 4.9707465171813965,
1010
+ "eval_logps/chosen": -443.61212158203125,
1011
+ "eval_logps/rejected": -378.1197204589844,
1012
+ "eval_loss": 0.6720485091209412,
1013
+ "eval_rewards/accuracies": 0.5992063283920288,
1014
+ "eval_rewards/chosen": -0.053870752453804016,
1015
+ "eval_rewards/margins": 0.267531156539917,
1016
+ "eval_rewards/rejected": -0.321401983499527,
1017
+ "eval_runtime": 20.8046,
1018
+ "eval_samples_per_second": 96.133,
1019
+ "eval_steps_per_second": 3.028,
1020
+ "step": 600
1021
+ },
1022
+ {
1023
+ "epoch": 1.2771525778591992,
1024
+ "grad_norm": 36.085842973391074,
1025
+ "learning_rate": 3.4688509243692034e-07,
1026
+ "logits/chosen": 4.767918586730957,
1027
+ "logits/rejected": 4.757430553436279,
1028
+ "logps/chosen": -407.41668701171875,
1029
+ "logps/rejected": -317.3873596191406,
1030
+ "loss": 0.6402,
1031
+ "rewards/accuracies": 0.643750011920929,
1032
+ "rewards/chosen": -0.08183420449495316,
1033
+ "rewards/margins": 0.33883604407310486,
1034
+ "rewards/rejected": -0.42067021131515503,
1035
+ "step": 610
1036
+ },
1037
+ {
1038
+ "epoch": 1.2980895053650876,
1039
+ "grad_norm": 29.698333183198105,
1040
+ "learning_rate": 3.295635076714144e-07,
1041
+ "logits/chosen": 5.085806846618652,
1042
+ "logits/rejected": 5.415268898010254,
1043
+ "logps/chosen": -395.627685546875,
1044
+ "logps/rejected": -331.7653503417969,
1045
+ "loss": 0.6266,
1046
+ "rewards/accuracies": 0.65625,
1047
+ "rewards/chosen": -0.09060301631689072,
1048
+ "rewards/margins": 0.3094441294670105,
1049
+ "rewards/rejected": -0.4000471234321594,
1050
+ "step": 620
1051
+ },
1052
+ {
1053
+ "epoch": 1.3190264328709762,
1054
+ "grad_norm": 35.208773349468885,
1055
+ "learning_rate": 3.12470398195219e-07,
1056
+ "logits/chosen": 4.828533172607422,
1057
+ "logits/rejected": 4.925856113433838,
1058
+ "logps/chosen": -418.5848083496094,
1059
+ "logps/rejected": -376.3353576660156,
1060
+ "loss": 0.6486,
1061
+ "rewards/accuracies": 0.6312500238418579,
1062
+ "rewards/chosen": 0.06391973793506622,
1063
+ "rewards/margins": 0.44674786925315857,
1064
+ "rewards/rejected": -0.3828281760215759,
1065
+ "step": 630
1066
+ },
1067
+ {
1068
+ "epoch": 1.3399633603768648,
1069
+ "grad_norm": 29.673309842493335,
1070
+ "learning_rate": 2.956286778402226e-07,
1071
+ "logits/chosen": 4.896113872528076,
1072
+ "logits/rejected": 5.183098793029785,
1073
+ "logps/chosen": -394.4980773925781,
1074
+ "logps/rejected": -374.76422119140625,
1075
+ "loss": 0.6394,
1076
+ "rewards/accuracies": 0.6312500238418579,
1077
+ "rewards/chosen": -0.04267222806811333,
1078
+ "rewards/margins": 0.2913575768470764,
1079
+ "rewards/rejected": -0.33402982354164124,
1080
+ "step": 640
1081
+ },
1082
+ {
1083
+ "epoch": 1.3609002878827532,
1084
+ "grad_norm": 35.03684415648848,
1085
+ "learning_rate": 2.7906092344356826e-07,
1086
+ "logits/chosen": 4.610795021057129,
1087
+ "logits/rejected": 4.8373188972473145,
1088
+ "logps/chosen": -379.4288024902344,
1089
+ "logps/rejected": -345.05596923828125,
1090
+ "loss": 0.6646,
1091
+ "rewards/accuracies": 0.581250011920929,
1092
+ "rewards/chosen": -0.1311950385570526,
1093
+ "rewards/margins": 0.2357216328382492,
1094
+ "rewards/rejected": -0.366916686296463,
1095
+ "step": 650
1096
+ },
1097
+ {
1098
+ "epoch": 1.3818372153886418,
1099
+ "grad_norm": 33.06984951084542,
1100
+ "learning_rate": 2.6278934458271996e-07,
1101
+ "logits/chosen": 4.830328941345215,
1102
+ "logits/rejected": 5.017812252044678,
1103
+ "logps/chosen": -377.4278564453125,
1104
+ "logps/rejected": -343.86529541015625,
1105
+ "loss": 0.6613,
1106
+ "rewards/accuracies": 0.5625,
1107
+ "rewards/chosen": -0.10503290593624115,
1108
+ "rewards/margins": 0.11762680858373642,
1109
+ "rewards/rejected": -0.22265975177288055,
1110
+ "step": 660
1111
+ },
1112
+ {
1113
+ "epoch": 1.4027741428945302,
1114
+ "grad_norm": 31.761556922593446,
1115
+ "learning_rate": 2.468357538028487e-07,
1116
+ "logits/chosen": 4.728631496429443,
1117
+ "logits/rejected": 4.90619421005249,
1118
+ "logps/chosen": -413.2724609375,
1119
+ "logps/rejected": -346.9877624511719,
1120
+ "loss": 0.6393,
1121
+ "rewards/accuracies": 0.637499988079071,
1122
+ "rewards/chosen": -0.03663766756653786,
1123
+ "rewards/margins": 0.2855125069618225,
1124
+ "rewards/rejected": -0.32215017080307007,
1125
+ "step": 670
1126
+ },
1127
+ {
1128
+ "epoch": 1.4237110704004188,
1129
+ "grad_norm": 34.93162849349177,
1130
+ "learning_rate": 2.312215373764551e-07,
1131
+ "logits/chosen": 4.728277206420898,
1132
+ "logits/rejected": 5.018845558166504,
1133
+ "logps/chosen": -421.8961486816406,
1134
+ "logps/rejected": -403.57354736328125,
1135
+ "loss": 0.6533,
1136
+ "rewards/accuracies": 0.606249988079071,
1137
+ "rewards/chosen": -0.1260446161031723,
1138
+ "rewards/margins": 0.2200162708759308,
1139
+ "rewards/rejected": -0.3460609018802643,
1140
+ "step": 680
1141
+ },
1142
+ {
1143
+ "epoch": 1.4446479979063072,
1144
+ "grad_norm": 33.66523822793528,
1145
+ "learning_rate": 2.1596762663442213e-07,
1146
+ "logits/chosen": 4.863284111022949,
1147
+ "logits/rejected": 4.840500354766846,
1148
+ "logps/chosen": -422.4331970214844,
1149
+ "logps/rejected": -355.96868896484375,
1150
+ "loss": 0.6477,
1151
+ "rewards/accuracies": 0.625,
1152
+ "rewards/chosen": -0.10065089166164398,
1153
+ "rewards/margins": 0.24511468410491943,
1154
+ "rewards/rejected": -0.3457655906677246,
1155
+ "step": 690
1156
+ },
1157
+ {
1158
+ "epoch": 1.4655849254121958,
1159
+ "grad_norm": 34.48257400044076,
1160
+ "learning_rate": 2.0109446990692963e-07,
1161
+ "logits/chosen": 4.709015846252441,
1162
+ "logits/rejected": 4.914425849914551,
1163
+ "logps/chosen": -452.9461364746094,
1164
+ "logps/rejected": -442.56658935546875,
1165
+ "loss": 0.6427,
1166
+ "rewards/accuracies": 0.65625,
1167
+ "rewards/chosen": 0.08032918721437454,
1168
+ "rewards/margins": 0.39584842324256897,
1169
+ "rewards/rejected": -0.3155192732810974,
1170
+ "step": 700
1171
+ },
1172
+ {
1173
+ "epoch": 1.4655849254121958,
1174
+ "eval_logits/chosen": 4.592012882232666,
1175
+ "eval_logits/rejected": 4.943046569824219,
1176
+ "eval_logps/chosen": -443.6796875,
1177
+ "eval_logps/rejected": -378.1680908203125,
1178
+ "eval_loss": 0.6795812845230103,
1179
+ "eval_rewards/accuracies": 0.60317462682724,
1180
+ "eval_rewards/chosen": -0.08766676485538483,
1181
+ "eval_rewards/margins": 0.25792673230171204,
1182
+ "eval_rewards/rejected": -0.34559354186058044,
1183
+ "eval_runtime": 21.1978,
1184
+ "eval_samples_per_second": 94.349,
1185
+ "eval_steps_per_second": 2.972,
1186
+ "step": 700
1187
+ },
1188
+ {
1189
+ "epoch": 1.4865218529180844,
1190
+ "grad_norm": 36.350524448045036,
1191
+ "learning_rate": 1.8662200511184872e-07,
1192
+ "logits/chosen": 4.871232509613037,
1193
+ "logits/rejected": 4.886293411254883,
1194
+ "logps/chosen": -417.8133850097656,
1195
+ "logps/rejected": -384.177490234375,
1196
+ "loss": 0.6669,
1197
+ "rewards/accuracies": 0.6000000238418579,
1198
+ "rewards/chosen": -0.12399878352880478,
1199
+ "rewards/margins": 0.27855515480041504,
1200
+ "rewards/rejected": -0.40255388617515564,
1201
+ "step": 710
1202
+ },
1203
+ {
1204
+ "epoch": 1.5074587804239727,
1205
+ "grad_norm": 34.52058371975813,
1206
+ "learning_rate": 1.725696330273575e-07,
1207
+ "logits/chosen": 4.8079633712768555,
1208
+ "logits/rejected": 5.118483543395996,
1209
+ "logps/chosen": -433.02032470703125,
1210
+ "logps/rejected": -383.21539306640625,
1211
+ "loss": 0.6234,
1212
+ "rewards/accuracies": 0.65625,
1213
+ "rewards/chosen": 0.049599818885326385,
1214
+ "rewards/margins": 0.36388009786605835,
1215
+ "rewards/rejected": -0.31428030133247375,
1216
+ "step": 720
1217
+ },
1218
+ {
1219
+ "epoch": 1.5283957079298613,
1220
+ "grad_norm": 36.62094520000859,
1221
+ "learning_rate": 1.589561912846089e-07,
1222
+ "logits/chosen": 4.67967414855957,
1223
+ "logits/rejected": 4.974714756011963,
1224
+ "logps/chosen": -402.2828063964844,
1225
+ "logps/rejected": -343.87939453125,
1226
+ "loss": 0.6419,
1227
+ "rewards/accuracies": 0.675000011920929,
1228
+ "rewards/chosen": -0.02172028087079525,
1229
+ "rewards/margins": 0.3966042995452881,
1230
+ "rewards/rejected": -0.4183245599269867,
1231
+ "step": 730
1232
+ },
1233
+ {
1234
+ "epoch": 1.54933263543575,
1235
+ "grad_norm": 34.85140828972076,
1236
+ "learning_rate": 1.4579992911531496e-07,
1237
+ "logits/chosen": 4.999066352844238,
1238
+ "logits/rejected": 5.089913845062256,
1239
+ "logps/chosen": -442.08538818359375,
1240
+ "logps/rejected": -387.76953125,
1241
+ "loss": 0.6641,
1242
+ "rewards/accuracies": 0.6000000238418579,
1243
+ "rewards/chosen": 0.024145543575286865,
1244
+ "rewards/margins": 0.3119828999042511,
1245
+ "rewards/rejected": -0.28783735632896423,
1246
+ "step": 740
1247
+ },
1248
+ {
1249
+ "epoch": 1.5702695629416383,
1250
+ "grad_norm": 33.55559408410901,
1251
+ "learning_rate": 1.3311848288809813e-07,
1252
+ "logits/chosen": 4.944571018218994,
1253
+ "logits/rejected": 4.949963569641113,
1254
+ "logps/chosen": -422.9165954589844,
1255
+ "logps/rejected": -378.2356262207031,
1256
+ "loss": 0.6431,
1257
+ "rewards/accuracies": 0.581250011920929,
1258
+ "rewards/chosen": -0.10791780799627304,
1259
+ "rewards/margins": 0.16808216273784637,
1260
+ "rewards/rejected": -0.2759999632835388,
1261
+ "step": 750
1262
+ },
1263
+ {
1264
+ "epoch": 1.5912064904475267,
1265
+ "grad_norm": 33.284252993746314,
1266
+ "learning_rate": 1.209288524664029e-07,
1267
+ "logits/chosen": 4.269396781921387,
1268
+ "logits/rejected": 4.640176296234131,
1269
+ "logps/chosen": -513.432861328125,
1270
+ "logps/rejected": -464.742431640625,
1271
+ "loss": 0.6505,
1272
+ "rewards/accuracies": 0.59375,
1273
+ "rewards/chosen": 0.00313050439581275,
1274
+ "rewards/margins": 0.3427557051181793,
1275
+ "rewards/rejected": -0.33962517976760864,
1276
+ "step": 760
1277
+ },
1278
+ {
1279
+ "epoch": 1.6121434179534153,
1280
+ "grad_norm": 33.301123590813035,
1281
+ "learning_rate": 1.0924737841966497e-07,
1282
+ "logits/chosen": 4.588865756988525,
1283
+ "logits/rejected": 4.75103235244751,
1284
+ "logps/chosen": -465.42059326171875,
1285
+ "logps/rejected": -370.064697265625,
1286
+ "loss": 0.653,
1287
+ "rewards/accuracies": 0.668749988079071,
1288
+ "rewards/chosen": -0.01498096901923418,
1289
+ "rewards/margins": 0.34728002548217773,
1290
+ "rewards/rejected": -0.3622610569000244,
1291
+ "step": 770
1292
+ },
1293
+ {
1294
+ "epoch": 1.633080345459304,
1295
+ "grad_norm": 33.707974100314466,
1296
+ "learning_rate": 9.808972011828054e-08,
1297
+ "logits/chosen": 4.657374382019043,
1298
+ "logits/rejected": 5.004950523376465,
1299
+ "logps/chosen": -452.0787048339844,
1300
+ "logps/rejected": -383.26824951171875,
1301
+ "loss": 0.6419,
1302
+ "rewards/accuracies": 0.6625000238418579,
1303
+ "rewards/chosen": 0.06604544818401337,
1304
+ "rewards/margins": 0.4590230882167816,
1305
+ "rewards/rejected": -0.39297762513160706,
1306
+ "step": 780
1307
+ },
1308
+ {
1309
+ "epoch": 1.6540172729651923,
1310
+ "grad_norm": 36.400512256730096,
1311
+ "learning_rate": 8.747083474174527e-08,
1312
+ "logits/chosen": 4.775164604187012,
1313
+ "logits/rejected": 5.237417221069336,
1314
+ "logps/chosen": -431.0052185058594,
1315
+ "logps/rejected": -372.1168212890625,
1316
+ "loss": 0.6398,
1317
+ "rewards/accuracies": 0.668749988079071,
1318
+ "rewards/chosen": -0.017582783475518227,
1319
+ "rewards/margins": 0.35258156061172485,
1320
+ "rewards/rejected": -0.37016433477401733,
1321
+ "step": 790
1322
+ },
1323
+ {
1324
+ "epoch": 1.674954200471081,
1325
+ "grad_norm": 29.96252260731642,
1326
+ "learning_rate": 7.740495722810269e-08,
1327
+ "logits/chosen": 4.998331546783447,
1328
+ "logits/rejected": 4.909043312072754,
1329
+ "logps/chosen": -489.783447265625,
1330
+ "logps/rejected": -415.0606384277344,
1331
+ "loss": 0.6128,
1332
+ "rewards/accuracies": 0.606249988079071,
1333
+ "rewards/chosen": 0.04877934604883194,
1334
+ "rewards/margins": 0.3542923033237457,
1335
+ "rewards/rejected": -0.3055129647254944,
1336
+ "step": 800
1337
+ },
1338
+ {
1339
+ "epoch": 1.674954200471081,
1340
+ "eval_logits/chosen": 4.6106181144714355,
1341
+ "eval_logits/rejected": 4.968925476074219,
1342
+ "eval_logps/chosen": -443.625244140625,
1343
+ "eval_logps/rejected": -378.2127990722656,
1344
+ "eval_loss": 0.6703739166259766,
1345
+ "eval_rewards/accuracies": 0.6071428656578064,
1346
+ "eval_rewards/chosen": -0.06042463704943657,
1347
+ "eval_rewards/margins": 0.30752548575401306,
1348
+ "eval_rewards/rejected": -0.3679501414299011,
1349
+ "eval_runtime": 21.1621,
1350
+ "eval_samples_per_second": 94.509,
1351
+ "eval_steps_per_second": 2.977,
1352
+ "step": 800
1353
+ },
1354
+ {
1355
+ "epoch": 1.6958911279769695,
1356
+ "grad_norm": 34.07376933070953,
1357
+ "learning_rate": 6.790558119157597e-08,
1358
+ "logits/chosen": 4.842529773712158,
1359
+ "logits/rejected": 4.945174217224121,
1360
+ "logps/chosen": -446.68682861328125,
1361
+ "logps/rejected": -379.9209899902344,
1362
+ "loss": 0.6409,
1363
+ "rewards/accuracies": 0.668749988079071,
1364
+ "rewards/chosen": 0.010895573534071445,
1365
+ "rewards/margins": 0.47363200783729553,
1366
+ "rewards/rejected": -0.46273642778396606,
1367
+ "step": 810
1368
+ },
1369
+ {
1370
+ "epoch": 1.7168280554828579,
1371
+ "grad_norm": 31.41564508164701,
1372
+ "learning_rate": 5.898544083397e-08,
1373
+ "logits/chosen": 4.57013463973999,
1374
+ "logits/rejected": 4.8762030601501465,
1375
+ "logps/chosen": -459.298583984375,
1376
+ "logps/rejected": -376.189208984375,
1377
+ "loss": 0.6381,
1378
+ "rewards/accuracies": 0.625,
1379
+ "rewards/chosen": -0.04928427189588547,
1380
+ "rewards/margins": 0.33450883626937866,
1381
+ "rewards/rejected": -0.38379308581352234,
1382
+ "step": 820
1383
+ },
1384
+ {
1385
+ "epoch": 1.7377649829887463,
1386
+ "grad_norm": 38.55984096337612,
1387
+ "learning_rate": 5.065649387408705e-08,
1388
+ "logits/chosen": 4.863150596618652,
1389
+ "logits/rejected": 4.996617317199707,
1390
+ "logps/chosen": -405.2935485839844,
1391
+ "logps/rejected": -383.06756591796875,
1392
+ "loss": 0.6587,
1393
+ "rewards/accuracies": 0.6187499761581421,
1394
+ "rewards/chosen": -0.14060600101947784,
1395
+ "rewards/margins": 0.1646648645401001,
1396
+ "rewards/rejected": -0.30527088046073914,
1397
+ "step": 830
1398
+ },
1399
+ {
1400
+ "epoch": 1.7587019104946349,
1401
+ "grad_norm": 32.69891650352482,
1402
+ "learning_rate": 4.292990551804171e-08,
1403
+ "logits/chosen": 4.561503887176514,
1404
+ "logits/rejected": 4.661375522613525,
1405
+ "logps/chosen": -374.9468688964844,
1406
+ "logps/rejected": -359.5188293457031,
1407
+ "loss": 0.6426,
1408
+ "rewards/accuracies": 0.637499988079071,
1409
+ "rewards/chosen": -0.05613694339990616,
1410
+ "rewards/margins": 0.3448534607887268,
1411
+ "rewards/rejected": -0.40099042654037476,
1412
+ "step": 840
1413
+ },
1414
+ {
1415
+ "epoch": 1.7796388380005235,
1416
+ "grad_norm": 32.82316724445512,
1417
+ "learning_rate": 3.581603349196371e-08,
1418
+ "logits/chosen": 4.668177604675293,
1419
+ "logits/rejected": 5.044764518737793,
1420
+ "logps/chosen": -391.29534912109375,
1421
+ "logps/rejected": -374.1195068359375,
1422
+ "loss": 0.6501,
1423
+ "rewards/accuracies": 0.543749988079071,
1424
+ "rewards/chosen": -0.14754648506641388,
1425
+ "rewards/margins": 0.16757197678089142,
1426
+ "rewards/rejected": -0.3151184618473053,
1427
+ "step": 850
1428
+ },
1429
+ {
1430
+ "epoch": 1.8005757655064119,
1431
+ "grad_norm": 32.36442235611696,
1432
+ "learning_rate": 2.9324414157151367e-08,
1433
+ "logits/chosen": 4.706895351409912,
1434
+ "logits/rejected": 5.021437644958496,
1435
+ "logps/chosen": -417.41021728515625,
1436
+ "logps/rejected": -335.3275451660156,
1437
+ "loss": 0.6534,
1438
+ "rewards/accuracies": 0.6812499761581421,
1439
+ "rewards/chosen": -0.06291428953409195,
1440
+ "rewards/margins": 0.3059840798377991,
1441
+ "rewards/rejected": -0.36889833211898804,
1442
+ "step": 860
1443
+ },
1444
+ {
1445
+ "epoch": 1.8215126930123005,
1446
+ "grad_norm": 29.740377909388123,
1447
+ "learning_rate": 2.3463749726290284e-08,
1448
+ "logits/chosen": 4.696743965148926,
1449
+ "logits/rejected": 4.8797287940979,
1450
+ "logps/chosen": -477.77783203125,
1451
+ "logps/rejected": -390.98175048828125,
1452
+ "loss": 0.6614,
1453
+ "rewards/accuracies": 0.53125,
1454
+ "rewards/chosen": -0.07511474192142487,
1455
+ "rewards/margins": 0.17798468470573425,
1456
+ "rewards/rejected": -0.2530994415283203,
1457
+ "step": 870
1458
+ },
1459
+ {
1460
+ "epoch": 1.842449620518189,
1461
+ "grad_norm": 30.952090476967147,
1462
+ "learning_rate": 1.824189659787284e-08,
1463
+ "logits/chosen": 4.781184196472168,
1464
+ "logits/rejected": 5.032862663269043,
1465
+ "logps/chosen": -387.22906494140625,
1466
+ "logps/rejected": -360.9486389160156,
1467
+ "loss": 0.6618,
1468
+ "rewards/accuracies": 0.5874999761581421,
1469
+ "rewards/chosen": -0.11400938034057617,
1470
+ "rewards/margins": 0.21471650898456573,
1471
+ "rewards/rejected": -0.3287258744239807,
1472
+ "step": 880
1473
+ },
1474
+ {
1475
+ "epoch": 1.8633865480240774,
1476
+ "grad_norm": 31.64887361264221,
1477
+ "learning_rate": 1.3665854824458035e-08,
1478
+ "logits/chosen": 4.322469234466553,
1479
+ "logits/rejected": 4.672883033752441,
1480
+ "logps/chosen": -445.35699462890625,
1481
+ "logps/rejected": -390.5237731933594,
1482
+ "loss": 0.624,
1483
+ "rewards/accuracies": 0.6937500238418579,
1484
+ "rewards/chosen": -0.032880861312150955,
1485
+ "rewards/margins": 0.3543739914894104,
1486
+ "rewards/rejected": -0.38725486397743225,
1487
+ "step": 890
1488
+ },
1489
+ {
1490
+ "epoch": 1.8843234755299658,
1491
+ "grad_norm": 33.85502008551422,
1492
+ "learning_rate": 9.741758728888217e-09,
1493
+ "logits/chosen": 4.4365644454956055,
1494
+ "logits/rejected": 4.837357997894287,
1495
+ "logps/chosen": -472.887451171875,
1496
+ "logps/rejected": -367.82611083984375,
1497
+ "loss": 0.6474,
1498
+ "rewards/accuracies": 0.65625,
1499
+ "rewards/chosen": -0.05196143314242363,
1500
+ "rewards/margins": 0.3476230800151825,
1501
+ "rewards/rejected": -0.39958447217941284,
1502
+ "step": 900
1503
+ },
1504
+ {
1505
+ "epoch": 1.8843234755299658,
1506
+ "eval_logits/chosen": 4.5737175941467285,
1507
+ "eval_logits/rejected": 4.921082496643066,
1508
+ "eval_logps/chosen": -443.622314453125,
1509
+ "eval_logps/rejected": -378.2174377441406,
1510
+ "eval_loss": 0.6692253351211548,
1511
+ "eval_rewards/accuracies": 0.6269841194152832,
1512
+ "eval_rewards/chosen": -0.05897674709558487,
1513
+ "eval_rewards/margins": 0.31128397583961487,
1514
+ "eval_rewards/rejected": -0.37026071548461914,
1515
+ "eval_runtime": 21.3225,
1516
+ "eval_samples_per_second": 93.797,
1517
+ "eval_steps_per_second": 2.955,
1518
+ "step": 900
1519
+ },
1520
+ {
1521
+ "epoch": 1.9052604030358546,
1522
+ "grad_norm": 33.28147635399462,
1523
+ "learning_rate": 6.474868681043577e-09,
1524
+ "logits/chosen": 4.713411808013916,
1525
+ "logits/rejected": 4.913935661315918,
1526
+ "logps/chosen": -384.9287109375,
1527
+ "logps/rejected": -316.16265869140625,
1528
+ "loss": 0.6491,
1529
+ "rewards/accuracies": 0.6625000238418579,
1530
+ "rewards/chosen": -0.02645047940313816,
1531
+ "rewards/margins": 0.40163594484329224,
1532
+ "rewards/rejected": -0.42808642983436584,
1533
+ "step": 910
1534
+ },
1535
+ {
1536
+ "epoch": 1.926197330541743,
1537
+ "grad_norm": 35.875215811609834,
1538
+ "learning_rate": 3.869564046156459e-09,
1539
+ "logits/chosen": 4.6749348640441895,
1540
+ "logits/rejected": 4.898279190063477,
1541
+ "logps/chosen": -441.083740234375,
1542
+ "logps/rejected": -361.4406433105469,
1543
+ "loss": 0.6389,
1544
+ "rewards/accuracies": 0.6499999761581421,
1545
+ "rewards/chosen": -0.0031513571739196777,
1546
+ "rewards/margins": 0.41674357652664185,
1547
+ "rewards/rejected": -0.4198949337005615,
1548
+ "step": 920
1549
+ },
1550
+ {
1551
+ "epoch": 1.9471342580476314,
1552
+ "grad_norm": 32.946064523302205,
1553
+ "learning_rate": 1.929337314139412e-09,
1554
+ "logits/chosen": 4.862700462341309,
1555
+ "logits/rejected": 4.817538261413574,
1556
+ "logps/chosen": -429.21051025390625,
1557
+ "logps/rejected": -370.45745849609375,
1558
+ "loss": 0.6312,
1559
+ "rewards/accuracies": 0.6000000238418579,
1560
+ "rewards/chosen": -0.0764947384595871,
1561
+ "rewards/margins": 0.20215356349945068,
1562
+ "rewards/rejected": -0.2786482870578766,
1563
+ "step": 930
1564
+ },
1565
+ {
1566
+ "epoch": 1.96807118555352,
1567
+ "grad_norm": 37.53766060677335,
1568
+ "learning_rate": 6.567894177967325e-10,
1569
+ "logits/chosen": 5.056074142456055,
1570
+ "logits/rejected": 5.200203895568848,
1571
+ "logps/chosen": -382.3914489746094,
1572
+ "logps/rejected": -319.7542419433594,
1573
+ "loss": 0.6475,
1574
+ "rewards/accuracies": 0.625,
1575
+ "rewards/chosen": -0.012996235862374306,
1576
+ "rewards/margins": 0.2734270989894867,
1577
+ "rewards/rejected": -0.28642335534095764,
1578
+ "step": 940
1579
+ },
1580
+ {
1581
+ "epoch": 1.9890081130594086,
1582
+ "grad_norm": 32.67422145978211,
1583
+ "learning_rate": 5.3626246194704575e-11,
1584
+ "logits/chosen": 4.634739875793457,
1585
+ "logits/rejected": 4.890820503234863,
1586
+ "logps/chosen": -425.7994689941406,
1587
+ "logps/rejected": -344.5509033203125,
1588
+ "loss": 0.6372,
1589
+ "rewards/accuracies": 0.5562499761581421,
1590
+ "rewards/chosen": -0.18660762906074524,
1591
+ "rewards/margins": 0.23620739579200745,
1592
+ "rewards/rejected": -0.4228149950504303,
1593
+ "step": 950
1594
+ },
1595
+ {
1596
+ "epoch": 1.9973828840617638,
1597
+ "step": 954,
1598
+ "total_flos": 0.0,
1599
+ "train_loss": 0.675485389037702,
1600
+ "train_runtime": 5897.7907,
1601
+ "train_samples_per_second": 20.731,
1602
+ "train_steps_per_second": 0.162
1603
+ }
1604
+ ],
1605
+ "logging_steps": 10,
1606
+ "max_steps": 954,
1607
+ "num_input_tokens_seen": 0,
1608
+ "num_train_epochs": 2,
1609
+ "save_steps": 500,
1610
+ "stateful_callbacks": {
1611
+ "TrainerControl": {
1612
+ "args": {
1613
+ "should_epoch_stop": false,
1614
+ "should_evaluate": false,
1615
+ "should_log": false,
1616
+ "should_save": false,
1617
+ "should_training_stop": false
1618
+ },
1619
+ "attributes": {}
1620
+ }
1621
+ },
1622
+ "total_flos": 0.0,
1623
+ "train_batch_size": 2,
1624
+ "trial_name": null,
1625
+ "trial_params": null
1626
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90f508e5769b31070c2c8f82e7ecdce816d610763df89479b5258bc66ee8b357
3
+ size 6520
vocab.json ADDED
The diff for this file is too large to render. See raw diff