mtasic85 committed on
Commit
11c619a
1 Parent(s): 4804082

prepare dataset

Browse files
Files changed (1) hide show
  1. scripts/prepare_contrain_dataset.py +56 -0
scripts/prepare_contrain_dataset.py CHANGED
@@ -67,6 +67,21 @@ def batch_iterator(name=None):
67
  del dataset
68
  gc.collect()
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  if name in (None, 'datatab/ultrafeedback_binarized_serbian'):
71
  dataset = load_dataset('datatab/ultrafeedback_binarized_serbian', split='train_sft')
72
 
@@ -112,6 +127,44 @@ def batch_iterator(name=None):
112
  del dataset
113
  gc.collect()
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
  def tokenize_fn(dataset_name, tokenizer=None):
117
  for text in batch_iterator(dataset_name):
@@ -123,8 +176,11 @@ datasets_names = [
123
  'Replete-AI/Everything_Instruct_Multilingual',
124
  'HuggingFaceH4/ultrachat_200k',
125
  'HuggingFaceH4/no_robots',
 
126
  'datatab/ultrafeedback_binarized_serbian',
127
  'datatab/alpaca-cleaned-serbian-full',
 
 
128
  ]
129
 
130
  outputs = optimize(
 
67
  del dataset
68
  gc.collect()
69
 
70
+ if name in (None, 'datatab/ultrachat_200k_serbian'):
71
+ dataset = load_dataset('datatab/ultrachat_200k_serbian', split='train')
72
+
73
+ for row in dataset:
74
+ text = [
75
+ f"<|im_start|>{n['role']}\n{n['content']}<|im_end|>"
76
+ for n in row['messages_srb']
77
+ ]
78
+
79
+ text = '\n'.join(text) + '\n'
80
+ yield text
81
+
82
+ del dataset
83
+ gc.collect()
84
+
85
  if name in (None, 'datatab/ultrafeedback_binarized_serbian'):
86
  dataset = load_dataset('datatab/ultrafeedback_binarized_serbian', split='train_sft')
87
 
 
127
  del dataset
128
  gc.collect()
129
 
130
+ if name in (None, 'datatab/orca_math_world_problem_200k_serbian'):
131
+ dataset = load_dataset('datatab/orca_math_world_problem_200k_serbian', split='train')
132
+
133
+ for row in dataset:
134
+ text = []
135
+
136
+ text.append(
137
+ '<|im_start|>user\n'
138
+ f"{row['question_translated_srb']}<|im_end|>"
139
+ )
140
+
141
+ text.append(
142
+ '<|im_start|>assistant\n'
143
+ f"{row['answer_translated_srb']}<|im_end|>"
144
+ )
145
+
146
+ text = '\n'.join(text) + '\n'
147
+ yield text
148
+
149
+ del dataset
150
+ gc.collect()
151
+
152
+ if name in (None, 'datatab/open-orca-slim-serbian'):
153
+ dataset = load_dataset('datatab/open-orca-slim-serbian', split='train')
154
+ role_map = {'system': 'system', 'human': 'user', 'gpt': 'assistant'}
155
+
156
+ for row in dataset['conversations']:
157
+ text = [
158
+ f"<|im_start|>{role_map[n['role']]}\n{n['value']}<|im_end|>"
159
+ for n in row['chosen']
160
+ ]
161
+
162
+ text = '\n'.join(text) + '\n'
163
+ yield text
164
+
165
+ del dataset
166
+ gc.collect()
167
+
168
 
169
  def tokenize_fn(dataset_name, tokenizer=None):
170
  for text in batch_iterator(dataset_name):
 
176
  'Replete-AI/Everything_Instruct_Multilingual',
177
  'HuggingFaceH4/ultrachat_200k',
178
  'HuggingFaceH4/no_robots',
179
+ 'datatab/ultrachat_200k_serbian',
180
  'datatab/ultrafeedback_binarized_serbian',
181
  'datatab/alpaca-cleaned-serbian-full',
182
+ 'datatab/orca_math_world_problem_200k_serbian',
183
+ 'datatab/open-orca-slim-serbian',
184
  ]
185
 
186
  outputs = optimize(