prepare dataset
Browse files
scripts/model.yaml
CHANGED
@@ -73,7 +73,7 @@ train:
|
|
73 |
|
74 |
# Total number of tokens to train on (type: Optional[int], default: 3000000000000)
|
75 |
# max_tokens: 3000000000000
|
76 |
- max_tokens:
|
77 |
|
78 |
# Limits the number of optimizer steps to run. (type: Optional[int], default: null)
|
79 |
max_steps:
|
|
|
73 |
|
74 |
# Total number of tokens to train on (type: Optional[int], default: 3000000000000)
|
75 |
# max_tokens: 3000000000000
|
76 |
+ max_tokens: ??? # ? * 32769 * 3
|
77 |
|
78 |
# Limits the number of optimizer steps to run. (type: Optional[int], default: null)
|
79 |
max_steps:
|
scripts/prepare_contrain_dataset.py
CHANGED
@@ -33,7 +33,6 @@ def batch_iterator(name=None):
|
|
33 |
|
34 |
text = '\n'.join(text) + '\n'
|
35 |
yield text
|
36 |
- break
|
37 |
|
38 |
del dataset
|
39 |
gc.collect()
|
@@ -49,7 +48,6 @@ def batch_iterator(name=None):
|
|
49 |
|
50 |
text = '\n'.join(text) + '\n'
|
51 |
yield text
|
52 |
- break
|
53 |
|
54 |
del dataset
|
55 |
gc.collect()
|
@@ -65,7 +63,6 @@ def batch_iterator(name=None):
|
|
65 |
|
66 |
text = '\n'.join(text) + '\n'
|
67 |
yield text
|
68 |
- break
|
69 |
|
70 |
del dataset
|
71 |
gc.collect()
|
@@ -81,7 +78,6 @@ def batch_iterator(name=None):
|
|
81 |
|
82 |
text = '\n'.join(text) + '\n'
|
83 |
yield text
|
84 |
- break
|
85 |
|
86 |
del dataset
|
87 |
gc.collect()
|
@@ -112,7 +108,6 @@ def batch_iterator(name=None):
|
|
112 |
|
113 |
text = '\n'.join(text) + '\n'
|
114 |
yield text
|
115 |
- break
|
116 |
|
117 |
del dataset
|
118 |
gc.collect()
|
|
|
33 |
|
34 |
text = '\n'.join(text) + '\n'
|
35 |
yield text
|
|
|
36 |
|
37 |
del dataset
|
38 |
gc.collect()
|
|
|
48 |
|
49 |
text = '\n'.join(text) + '\n'
|
50 |
yield text
|
|
|
51 |
|
52 |
del dataset
|
53 |
gc.collect()
|
|
|
63 |
|
64 |
text = '\n'.join(text) + '\n'
|
65 |
yield text
|
|
|
66 |
|
67 |
del dataset
|
68 |
gc.collect()
|
|
|
78 |
|
79 |
text = '\n'.join(text) + '\n'
|
80 |
yield text
|
|
|
81 |
|
82 |
del dataset
|
83 |
gc.collect()
|
|
|
108 |
|
109 |
text = '\n'.join(text) + '\n'
|
110 |
yield text
|
|
|
111 |
|
112 |
del dataset
|
113 |
gc.collect()
|