mtasic85 committed on
Commit
4804082
1 Parent(s): 7f438ae

prepare dataset

Browse files
scripts/model.yaml CHANGED
@@ -73,7 +73,7 @@ train:
73
 
74
  # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
75
  # max_tokens: 3000000000000
76
- max_tokens: 9782206713 # 1591379 * 2049 * 3
77
 
78
  # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
79
  max_steps:
 
73
 
74
  # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
75
  # max_tokens: 3000000000000
76
+ max_tokens: ??? # ? * 32769 * 3
77
 
78
  # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
79
  max_steps:
scripts/prepare_contrain_dataset.py CHANGED
@@ -33,7 +33,6 @@ def batch_iterator(name=None):
33
 
34
  text = '\n'.join(text) + '\n'
35
  yield text
36
- break
37
 
38
  del dataset
39
  gc.collect()
@@ -49,7 +48,6 @@ def batch_iterator(name=None):
49
 
50
  text = '\n'.join(text) + '\n'
51
  yield text
52
- break
53
 
54
  del dataset
55
  gc.collect()
@@ -65,7 +63,6 @@ def batch_iterator(name=None):
65
 
66
  text = '\n'.join(text) + '\n'
67
  yield text
68
- break
69
 
70
  del dataset
71
  gc.collect()
@@ -81,7 +78,6 @@ def batch_iterator(name=None):
81
 
82
  text = '\n'.join(text) + '\n'
83
  yield text
84
- break
85
 
86
  del dataset
87
  gc.collect()
@@ -112,7 +108,6 @@ def batch_iterator(name=None):
112
 
113
  text = '\n'.join(text) + '\n'
114
  yield text
115
- break
116
 
117
  del dataset
118
  gc.collect()
 
33
 
34
  text = '\n'.join(text) + '\n'
35
  yield text
 
36
 
37
  del dataset
38
  gc.collect()
 
48
 
49
  text = '\n'.join(text) + '\n'
50
  yield text
 
51
 
52
  del dataset
53
  gc.collect()
 
63
 
64
  text = '\n'.join(text) + '\n'
65
  yield text
 
66
 
67
  del dataset
68
  gc.collect()
 
78
 
79
  text = '\n'.join(text) + '\n'
80
  yield text
 
81
 
82
  del dataset
83
  gc.collect()
 
108
 
109
  text = '\n'.join(text) + '\n'
110
  yield text
 
111
 
112
  del dataset
113
  gc.collect()