prepare dataset
scripts/prepare_contrain_dataset.py
CHANGED
@@ -67,6 +67,21 @@ def batch_iterator(name=None):
         del dataset
         gc.collect()
 
+    if name in (None, 'datatab/ultrachat_200k_serbian'):
+        dataset = load_dataset('datatab/ultrachat_200k_serbian', split='train')
+
+        for row in dataset:
+            text = [
+                f"<|im_start|>{n['role']}\n{n['content']}<|im_end|>"
+                for n in row['messages_srb']
+            ]
+
+            text = '\n'.join(text) + '\n'
+            yield text
+
+        del dataset
+        gc.collect()
+
     if name in (None, 'datatab/ultrafeedback_binarized_serbian'):
         dataset = load_dataset('datatab/ultrafeedback_binarized_serbian', split='train_sft')
 
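For reference, the ultrachat_200k_serbian branch above yields one ChatML-formatted string per conversation. A minimal sketch of a single sample, assuming each row's 'messages_srb' field is a list of {'role', 'content'} dicts as the loop expects (the row below is made up, so no download is needed):

# Made-up row in the shape the loop above assumes.
row = {
    'messages_srb': [
        {'role': 'user', 'content': 'Zdravo!'},
        {'role': 'assistant', 'content': 'Zdravo, kako mogu da pomognem?'},
    ]
}

text = [
    f"<|im_start|>{n['role']}\n{n['content']}<|im_end|>"
    for n in row['messages_srb']
]
print('\n'.join(text) + '\n')
# Prints:
# <|im_start|>user
# Zdravo!<|im_end|>
# <|im_start|>assistant
# Zdravo, kako mogu da pomognem?<|im_end|>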
@@ -112,6 +127,44 @@ def batch_iterator(name=None):
         del dataset
         gc.collect()
 
+    if name in (None, 'datatab/orca_math_world_problem_200k_serbian'):
+        dataset = load_dataset('datatab/orca_math_world_problem_200k_serbian', split='train')
+
+        for row in dataset:
+            text = []
+
+            text.append(
+                '<|im_start|>user\n'
+                f"{row['question_translated_srb']}<|im_end|>"
+            )
+
+            text.append(
+                '<|im_start|>assistant\n'
+                f"{row['answer_translated_srb']}<|im_end|>"
+            )
+
+            text = '\n'.join(text) + '\n'
+            yield text
+
+        del dataset
+        gc.collect()
+
+    if name in (None, 'datatab/open-orca-slim-serbian'):
+        dataset = load_dataset('datatab/open-orca-slim-serbian', split='train')
+        role_map = {'system': 'system', 'human': 'user', 'gpt': 'assistant'}
+
+        for row in dataset['conversations']:
+            text = [
+                f"<|im_start|>{role_map[n['role']]}\n{n['value']}<|im_end|>"
+                for n in row['chosen']
+            ]
+
+            text = '\n'.join(text) + '\n'
+            yield text
+
+        del dataset
+        gc.collect()
+
 
 def tokenize_fn(dataset_name, tokenizer=None):
     for text in batch_iterator(dataset_name):
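The open-orca-slim-serbian branch remaps ShareGPT-style speaker tags ('system', 'human', 'gpt') to ChatML roles before formatting. A minimal sketch of that remapping on a made-up turn list, assuming each turn carries 'role' and 'value' keys as the loop above expects:

role_map = {'system': 'system', 'human': 'user', 'gpt': 'assistant'}

# Made-up ShareGPT-style turns; the real ones come from the dataset's
# 'conversations' column.
turns = [
    {'role': 'system', 'value': 'Ti si koristan asistent.'},
    {'role': 'human', 'value': 'Koliko je 2 + 2?'},
    {'role': 'gpt', 'value': '2 + 2 = 4.'},
]

text = [
    f"<|im_start|>{role_map[n['role']]}\n{n['value']}<|im_end|>"
    for n in turns
]
print('\n'.join(text) + '\n')
# Prints:
# <|im_start|>system
# Ti si koristan asistent.<|im_end|>
# <|im_start|>user
# Koliko je 2 + 2?<|im_end|>
# <|im_start|>assistant
# 2 + 2 = 4.<|im_end|>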
@@ -123,8 +176,11 @@ datasets_names = [
     'Replete-AI/Everything_Instruct_Multilingual',
     'HuggingFaceH4/ultrachat_200k',
     'HuggingFaceH4/no_robots',
+    'datatab/ultrachat_200k_serbian',
     'datatab/ultrafeedback_binarized_serbian',
     'datatab/alpaca-cleaned-serbian-full',
+    'datatab/orca_math_world_problem_200k_serbian',
+    'datatab/open-orca-slim-serbian',
 ]
 
 outputs = optimize(
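The optimize(...) call is truncated in this hunk, so its actual arguments are not visible here. For orientation only, a litdata-style invocation of this shape typically wires tokenize_fn and datasets_names together as sketched below; every concrete value (tokenizer, output directory, chunk size, worker count) is a placeholder and not this script's configuration:

from functools import partial

from litdata import optimize

# 'tokenizer' is assumed to be constructed earlier in the script; it is not
# shown in this diff.
outputs = optimize(
    fn=partial(tokenize_fn, tokenizer=tokenizer),  # called once per entry in inputs
    inputs=datasets_names,                         # each name is routed through batch_iterator
    output_dir='/tmp/contrain-data',               # placeholder path
    chunk_bytes='64MB',                            # placeholder chunk size
    num_workers=8,                                 # placeholder worker count
)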