erbacher committed on
Commit
cb1844d
1 Parent(s): 3045eeb

Model save

Browse files
Files changed (5) hide show
  1. README.md +71 -0
  2. all_results.json +13 -0
  3. eval_results.json +8 -0
  4. train_results.json +8 -0
  5. trainer_state.json +156 -0
README.md ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ library_name: peft
4
+ tags:
5
+ - trl
6
+ - sft
7
+ - generated_from_trainer
8
+ base_model: HuggingFaceH4/zephyr-7b-beta
9
+ datasets:
10
+ - generator
11
+ model-index:
12
+ - name: zephyr-rag-agent
13
+ results: []
14
+ ---
15
+
16
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
17
+ should probably proofread and complete it, then remove this comment. -->
18
+
19
+ # zephyr-rag-agent
20
+
21
+ This model is a fine-tuned version of [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) on the generator dataset.
22
+ It achieves the following results on the evaluation set:
23
+ - Loss: 1.1623
24
+
25
+ ## Model description
26
+
27
+ More information needed
28
+
29
+ ## Intended uses & limitations
30
+
31
+ More information needed
32
+
33
+ ## Training and evaluation data
34
+
35
+ More information needed
36
+
37
+ ## Training procedure
38
+
39
+ ### Training hyperparameters
40
+
41
+ The following hyperparameters were used during training:
42
+ - learning_rate: 0.0002
43
+ - train_batch_size: 2
44
+ - eval_batch_size: 2
45
+ - seed: 42
46
+ - distributed_type: multi-GPU
47
+ - num_devices: 2
48
+ - gradient_accumulation_steps: 16
49
+ - total_train_batch_size: 64
50
+ - total_eval_batch_size: 4
51
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
52
+ - lr_scheduler_type: cosine
53
+ - lr_scheduler_warmup_ratio: 0.1
54
+ - num_epochs: 3
55
+
56
+ ### Training results
57
+
58
+ | Training Loss | Epoch | Step | Validation Loss |
59
+ |:-------------:|:-----:|:----:|:---------------:|
60
+ | 1.1559 | 0.99 | 27 | 1.1650 |
61
+ | 1.0887 | 1.98 | 54 | 1.1555 |
62
+ | 1.0566 | 2.97 | 81 | 1.1623 |
63
+
64
+
65
+ ### Framework versions
66
+
67
+ - PEFT 0.7.1
68
+ - Transformers 4.36.2
69
+ - Pytorch 2.2.2+cu121
70
+ - Datasets 2.14.6
71
+ - Tokenizers 0.15.2
all_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.97,
3
+ "eval_loss": 1.1623399257659912,
4
+ "eval_runtime": 279.5897,
5
+ "eval_samples": 716,
6
+ "eval_samples_per_second": 2.167,
7
+ "eval_steps_per_second": 0.544,
8
+ "train_loss": 1.1298090528558802,
9
+ "train_runtime": 14861.7425,
10
+ "train_samples": 1922,
11
+ "train_samples_per_second": 0.352,
12
+ "train_steps_per_second": 0.005
13
+ }
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.97,
3
+ "eval_loss": 1.1623399257659912,
4
+ "eval_runtime": 279.5897,
5
+ "eval_samples": 716,
6
+ "eval_samples_per_second": 2.167,
7
+ "eval_steps_per_second": 0.544
8
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.97,
3
+ "train_loss": 1.1298090528558802,
4
+ "train_runtime": 14861.7425,
5
+ "train_samples": 1922,
6
+ "train_samples_per_second": 0.352,
7
+ "train_steps_per_second": 0.005
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.965675057208238,
5
+ "eval_steps": 500,
6
+ "global_step": 81,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.04,
13
+ "learning_rate": 2.2222222222222223e-05,
14
+ "loss": 1.4708,
15
+ "step": 1
16
+ },
17
+ {
18
+ "epoch": 0.18,
19
+ "learning_rate": 0.00011111111111111112,
20
+ "loss": 1.4228,
21
+ "step": 5
22
+ },
23
+ {
24
+ "epoch": 0.37,
25
+ "learning_rate": 0.0001999048221581858,
26
+ "loss": 1.2725,
27
+ "step": 10
28
+ },
29
+ {
30
+ "epoch": 0.55,
31
+ "learning_rate": 0.00019659258262890683,
32
+ "loss": 1.2311,
33
+ "step": 15
34
+ },
35
+ {
36
+ "epoch": 0.73,
37
+ "learning_rate": 0.00018870108331782217,
38
+ "loss": 1.1712,
39
+ "step": 20
40
+ },
41
+ {
42
+ "epoch": 0.92,
43
+ "learning_rate": 0.0001766044443118978,
44
+ "loss": 1.1559,
45
+ "step": 25
46
+ },
47
+ {
48
+ "epoch": 0.99,
49
+ "eval_loss": 1.1649807691574097,
50
+ "eval_runtime": 283.9384,
51
+ "eval_samples_per_second": 2.134,
52
+ "eval_steps_per_second": 0.535,
53
+ "step": 27
54
+ },
55
+ {
56
+ "epoch": 1.1,
57
+ "learning_rate": 0.00016087614290087208,
58
+ "loss": 1.1485,
59
+ "step": 30
60
+ },
61
+ {
62
+ "epoch": 1.28,
63
+ "learning_rate": 0.00014226182617406996,
64
+ "loss": 1.11,
65
+ "step": 35
66
+ },
67
+ {
68
+ "epoch": 1.46,
69
+ "learning_rate": 0.00012164396139381029,
70
+ "loss": 1.1045,
71
+ "step": 40
72
+ },
73
+ {
74
+ "epoch": 1.65,
75
+ "learning_rate": 0.0001,
76
+ "loss": 1.0982,
77
+ "step": 45
78
+ },
79
+ {
80
+ "epoch": 1.83,
81
+ "learning_rate": 7.835603860618972e-05,
82
+ "loss": 1.0887,
83
+ "step": 50
84
+ },
85
+ {
86
+ "epoch": 1.98,
87
+ "eval_loss": 1.1555219888687134,
88
+ "eval_runtime": 283.2377,
89
+ "eval_samples_per_second": 2.14,
90
+ "eval_steps_per_second": 0.537,
91
+ "step": 54
92
+ },
93
+ {
94
+ "epoch": 2.01,
95
+ "learning_rate": 5.773817382593008e-05,
96
+ "loss": 1.0711,
97
+ "step": 55
98
+ },
99
+ {
100
+ "epoch": 2.2,
101
+ "learning_rate": 3.9123857099127936e-05,
102
+ "loss": 1.0764,
103
+ "step": 60
104
+ },
105
+ {
106
+ "epoch": 2.38,
107
+ "learning_rate": 2.339555568810221e-05,
108
+ "loss": 1.04,
109
+ "step": 65
110
+ },
111
+ {
112
+ "epoch": 2.56,
113
+ "learning_rate": 1.129891668217783e-05,
114
+ "loss": 1.0344,
115
+ "step": 70
116
+ },
117
+ {
118
+ "epoch": 2.75,
119
+ "learning_rate": 3.40741737109318e-06,
120
+ "loss": 1.0026,
121
+ "step": 75
122
+ },
123
+ {
124
+ "epoch": 2.93,
125
+ "learning_rate": 9.517784181422019e-08,
126
+ "loss": 1.0566,
127
+ "step": 80
128
+ },
129
+ {
130
+ "epoch": 2.97,
131
+ "eval_loss": 1.1623399257659912,
132
+ "eval_runtime": 283.2011,
133
+ "eval_samples_per_second": 2.14,
134
+ "eval_steps_per_second": 0.537,
135
+ "step": 81
136
+ },
137
+ {
138
+ "epoch": 2.97,
139
+ "step": 81,
140
+ "total_flos": 173916983132160.0,
141
+ "train_loss": 1.1298090528558802,
142
+ "train_runtime": 14861.7425,
143
+ "train_samples_per_second": 0.352,
144
+ "train_steps_per_second": 0.005
145
+ }
146
+ ],
147
+ "logging_steps": 5,
148
+ "max_steps": 81,
149
+ "num_input_tokens_seen": 0,
150
+ "num_train_epochs": 3,
151
+ "save_steps": 500,
152
+ "total_flos": 173916983132160.0,
153
+ "train_batch_size": 2,
154
+ "trial_name": null,
155
+ "trial_params": null
156
+ }