scottsuk0306 committed on
Commit
163a0f2
1 Parent(s): 55cdbac

Model save

Browse files
README.md CHANGED
@@ -3,16 +3,11 @@ library_name: transformers
3
  license: apache-2.0
4
  base_model: alignment-handbook/zephyr-7b-sft-full
5
  tags:
6
- - alignment-handbook
7
  - trl
8
  - sft
9
  - generated_from_trainer
10
- - trl
11
- - sft
12
- - alignment-handbook
13
- - generated_from_trainer
14
  datasets:
15
- - scottsuk0306/DepthQA
16
  model-index:
17
  - name: zephyr-7b-stem-half
18
  results: []
@@ -23,9 +18,9 @@ should probably proofread and complete it, then remove this comment. -->
23
 
24
  # zephyr-7b-stem-half
25
 
26
- This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on the scottsuk0306/DepthQA dataset.
27
  It achieves the following results on the evaluation set:
28
- - Loss: 0.1468
29
 
30
  ## Model description
31
 
@@ -45,13 +40,13 @@ More information needed
45
 
46
  The following hyperparameters were used during training:
47
  - learning_rate: 1e-05
48
- - train_batch_size: 16
49
- - eval_batch_size: 8
50
  - seed: 42
51
  - distributed_type: multi-GPU
52
- - num_devices: 8
53
- - total_train_batch_size: 128
54
- - total_eval_batch_size: 64
55
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
56
  - lr_scheduler_type: cosine
57
  - lr_scheduler_warmup_ratio: 0.03
@@ -61,15 +56,15 @@ The following hyperparameters were used during training:
61
 
62
  | Training Loss | Epoch | Step | Validation Loss |
63
  |:-------------:|:-----:|:----:|:---------------:|
64
- | 1.0647 | 1.0 | 1 | 1.0615 |
65
- | 1.0647 | 2.0 | 2 | 1.1875 |
66
- | 1.0647 | 3.0 | 3 | 0.9856 |
67
- | 1.0647 | 4.0 | 4 | 0.6196 |
68
- | 0.969 | 5.0 | 5 | 0.4429 |
69
- | 0.969 | 6.0 | 6 | 0.3590 |
70
- | 0.969 | 7.0 | 7 | 0.2403 |
71
- | 0.969 | 8.0 | 8 | 0.1860 |
72
- | 0.969 | 9.0 | 9 | 0.1558 |
73
 
74
 
75
  ### Framework versions
 
3
  license: apache-2.0
4
  base_model: alignment-handbook/zephyr-7b-sft-full
5
  tags:
 
6
  - trl
7
  - sft
8
  - generated_from_trainer
 
 
 
 
9
  datasets:
10
+ - generator
11
  model-index:
12
  - name: zephyr-7b-stem-half
13
  results: []
 
18
 
19
  # zephyr-7b-stem-half
20
 
21
+ This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 0.2678
24
 
25
  ## Model description
26
 
 
40
 
41
  The following hyperparameters were used during training:
42
  - learning_rate: 1e-05
43
+ - train_batch_size: 8
44
+ - eval_batch_size: 4
45
  - seed: 42
46
  - distributed_type: multi-GPU
47
+ - num_devices: 4
48
+ - total_train_batch_size: 32
49
+ - total_eval_batch_size: 16
50
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
51
  - lr_scheduler_type: cosine
52
  - lr_scheduler_warmup_ratio: 0.03
 
56
 
57
  | Training Loss | Epoch | Step | Validation Loss |
58
  |:-------------:|:-----:|:----:|:---------------:|
59
+ | 1.1064 | 1.0 | 1 | 1.1164 |
60
+ | 1.1064 | 2.0 | 2 | 1.2343 |
61
+ | 1.1064 | 3.0 | 3 | 0.9905 |
62
+ | 1.1064 | 4.0 | 4 | 0.7123 |
63
+ | 1.0149 | 5.0 | 5 | 0.5843 |
64
+ | 1.0149 | 6.0 | 6 | 0.4350 |
65
+ | 1.0149 | 7.0 | 7 | 0.3496 |
66
+ | 1.0149 | 8.0 | 8 | 0.2988 |
67
+ | 1.0149 | 9.0 | 9 | 0.2678 |
68
 
69
 
70
  ### Framework versions
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
  "epoch": 10.0,
3
- "eval_loss": 0.14684411883354187,
4
- "eval_runtime": 4.0784,
5
- "eval_samples": 424,
6
- "eval_samples_per_second": 4.659,
7
- "eval_steps_per_second": 0.245,
8
- "total_flos": 8375186227200.0,
9
  "train_loss": 0.0,
10
- "train_runtime": 1.6437,
11
  "train_samples": 424,
12
- "train_samples_per_second": 115.592,
13
- "train_steps_per_second": 6.084
14
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "total_flos": 2093796556800.0,
 
 
 
 
 
4
  "train_loss": 0.0,
5
+ "train_runtime": 1.6028,
6
  "train_samples": 424,
7
+ "train_samples_per_second": 149.735,
8
+ "train_steps_per_second": 6.239
9
  }
config.json CHANGED
@@ -22,6 +22,6 @@
22
  "tie_word_embeddings": false,
23
  "torch_dtype": "bfloat16",
24
  "transformers_version": "4.44.2",
25
- "use_cache": true,
26
  "vocab_size": 32000
27
  }
 
22
  "tie_word_embeddings": false,
23
  "torch_dtype": "bfloat16",
24
  "transformers_version": "4.44.2",
25
+ "use_cache": false,
26
  "vocab_size": 32000
27
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c522bc6840c9722b850590aec0e876cfb8f74e694468807e6a2d18e18b7b73c
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2c5657be7d78e15de848fc62cc05376aa06fb48122aaab55e0299e97162f548
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:50295b481270ed1b1d2971752bdceeaac1a9a2817a55ca11b22acf0e572c64f7
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a117ddb2c1d9547f5923e948171b5d03fee5fae7b643aed997d8517ea62928b
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc57123c5a8b1fafbc4eeb711ca05bfea52efeb0c79cf7ed37506369e6c8b734
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40e5a73ce77dd1c37f86398b7371480be2920392a322c421e4a183f827b3a193
3
  size 4540516344
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 10.0,
3
- "total_flos": 8375186227200.0,
4
  "train_loss": 0.0,
5
- "train_runtime": 1.6437,
6
  "train_samples": 424,
7
- "train_samples_per_second": 115.592,
8
- "train_steps_per_second": 6.084
9
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "total_flos": 2093796556800.0,
4
  "train_loss": 0.0,
5
+ "train_runtime": 1.6028,
6
  "train_samples": 424,
7
+ "train_samples_per_second": 149.735,
8
+ "train_steps_per_second": 6.239
9
  }
trainer_state.json CHANGED
@@ -10,105 +10,105 @@
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
- "grad_norm": 25.3735801083628,
14
  "learning_rate": 1e-05,
15
- "loss": 1.0647,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 1.0,
20
- "eval_loss": 1.061476469039917,
21
- "eval_runtime": 2.8031,
22
- "eval_samples_per_second": 6.778,
23
- "eval_steps_per_second": 0.357,
24
  "step": 1
25
  },
26
  {
27
  "epoch": 2.0,
28
- "eval_loss": 1.1875081062316895,
29
- "eval_runtime": 2.6772,
30
- "eval_samples_per_second": 7.097,
31
- "eval_steps_per_second": 0.374,
32
  "step": 2
33
  },
34
  {
35
  "epoch": 3.0,
36
- "eval_loss": 0.985565185546875,
37
- "eval_runtime": 2.6682,
38
- "eval_samples_per_second": 7.121,
39
- "eval_steps_per_second": 0.375,
40
  "step": 3
41
  },
42
  {
43
  "epoch": 4.0,
44
- "eval_loss": 0.6195911169052124,
45
- "eval_runtime": 2.6829,
46
- "eval_samples_per_second": 7.082,
47
- "eval_steps_per_second": 0.373,
48
  "step": 4
49
  },
50
  {
51
  "epoch": 5.0,
52
- "grad_norm": 13.767739837044488,
53
  "learning_rate": 5.8682408883346535e-06,
54
- "loss": 0.969,
55
  "step": 5
56
  },
57
  {
58
  "epoch": 5.0,
59
- "eval_loss": 0.4429333806037903,
60
- "eval_runtime": 2.6868,
61
- "eval_samples_per_second": 7.072,
62
- "eval_steps_per_second": 0.372,
63
  "step": 5
64
  },
65
  {
66
  "epoch": 6.0,
67
- "eval_loss": 0.3589998185634613,
68
- "eval_runtime": 2.6805,
69
- "eval_samples_per_second": 7.088,
70
- "eval_steps_per_second": 0.373,
71
  "step": 6
72
  },
73
  {
74
  "epoch": 7.0,
75
- "eval_loss": 0.24026596546173096,
76
- "eval_runtime": 2.671,
77
- "eval_samples_per_second": 7.113,
78
- "eval_steps_per_second": 0.374,
79
  "step": 7
80
  },
81
  {
82
  "epoch": 8.0,
83
- "eval_loss": 0.18603171408176422,
84
- "eval_runtime": 2.7105,
85
- "eval_samples_per_second": 7.01,
86
- "eval_steps_per_second": 0.369,
87
  "step": 8
88
  },
89
  {
90
  "epoch": 9.0,
91
- "eval_loss": 0.15581761300563812,
92
- "eval_runtime": 2.6748,
93
- "eval_samples_per_second": 7.103,
94
- "eval_steps_per_second": 0.374,
95
  "step": 9
96
  },
97
  {
98
  "epoch": 10.0,
99
- "grad_norm": 2.4836614664887184,
100
  "learning_rate": 0.0,
101
- "loss": 0.2782,
102
  "step": 10
103
  },
104
  {
105
  "epoch": 10.0,
106
  "step": 10,
107
- "total_flos": 8375186227200.0,
108
  "train_loss": 0.0,
109
- "train_runtime": 1.6437,
110
- "train_samples_per_second": 115.592,
111
- "train_steps_per_second": 6.084
112
  }
113
  ],
114
  "logging_steps": 5,
@@ -128,8 +128,8 @@
128
  "attributes": {}
129
  }
130
  },
131
- "total_flos": 8375186227200.0,
132
- "train_batch_size": 16,
133
  "trial_name": null,
134
  "trial_params": null
135
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
+ "grad_norm": 17.570717480629625,
14
  "learning_rate": 1e-05,
15
+ "loss": 1.1064,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 1.0,
20
+ "eval_loss": 1.116378664970398,
21
+ "eval_runtime": 2.7633,
22
+ "eval_samples_per_second": 8.685,
23
+ "eval_steps_per_second": 0.724,
24
  "step": 1
25
  },
26
  {
27
  "epoch": 2.0,
28
+ "eval_loss": 1.234288215637207,
29
+ "eval_runtime": 2.7167,
30
+ "eval_samples_per_second": 8.834,
31
+ "eval_steps_per_second": 0.736,
32
  "step": 2
33
  },
34
  {
35
  "epoch": 3.0,
36
+ "eval_loss": 0.9905319213867188,
37
+ "eval_runtime": 2.7356,
38
+ "eval_samples_per_second": 8.773,
39
+ "eval_steps_per_second": 0.731,
40
  "step": 3
41
  },
42
  {
43
  "epoch": 4.0,
44
+ "eval_loss": 0.712308406829834,
45
+ "eval_runtime": 2.7104,
46
+ "eval_samples_per_second": 8.855,
47
+ "eval_steps_per_second": 0.738,
48
  "step": 4
49
  },
50
  {
51
  "epoch": 5.0,
52
+ "grad_norm": 22.309413100632455,
53
  "learning_rate": 5.8682408883346535e-06,
54
+ "loss": 1.0149,
55
  "step": 5
56
  },
57
  {
58
  "epoch": 5.0,
59
+ "eval_loss": 0.5842603445053101,
60
+ "eval_runtime": 2.7186,
61
+ "eval_samples_per_second": 8.828,
62
+ "eval_steps_per_second": 0.736,
63
  "step": 5
64
  },
65
  {
66
  "epoch": 6.0,
67
+ "eval_loss": 0.43497833609580994,
68
+ "eval_runtime": 2.7203,
69
+ "eval_samples_per_second": 8.823,
70
+ "eval_steps_per_second": 0.735,
71
  "step": 6
72
  },
73
  {
74
  "epoch": 7.0,
75
+ "eval_loss": 0.349648118019104,
76
+ "eval_runtime": 2.7121,
77
+ "eval_samples_per_second": 8.849,
78
+ "eval_steps_per_second": 0.737,
79
  "step": 7
80
  },
81
  {
82
  "epoch": 8.0,
83
+ "eval_loss": 0.298780232667923,
84
+ "eval_runtime": 2.7204,
85
+ "eval_samples_per_second": 8.822,
86
+ "eval_steps_per_second": 0.735,
87
  "step": 8
88
  },
89
  {
90
  "epoch": 9.0,
91
+ "eval_loss": 0.26783373951911926,
92
+ "eval_runtime": 2.7269,
93
+ "eval_samples_per_second": 8.801,
94
+ "eval_steps_per_second": 0.733,
95
  "step": 9
96
  },
97
  {
98
  "epoch": 10.0,
99
+ "grad_norm": 3.99720130224049,
100
  "learning_rate": 0.0,
101
+ "loss": 0.3844,
102
  "step": 10
103
  },
104
  {
105
  "epoch": 10.0,
106
  "step": 10,
107
+ "total_flos": 2093796556800.0,
108
  "train_loss": 0.0,
109
+ "train_runtime": 1.6028,
110
+ "train_samples_per_second": 149.735,
111
+ "train_steps_per_second": 6.239
112
  }
113
  ],
114
  "logging_steps": 5,
 
128
  "attributes": {}
129
  }
130
  },
131
+ "total_flos": 2093796556800.0,
132
+ "train_batch_size": 8,
133
  "trial_name": null,
134
  "trial_params": null
135
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:75efd32a9fabbb81846c0c746336b54b61acdff1369f814b4bf7e9ff459e6e6d
3
  size 6968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc2995316c087d59e5c93ccf1e0a00dc7c4d1a8bab6af12aaec77bdb40108d92
3
  size 6968