mtasic85 committed
Commit a21f3c7
Parent: fbc3c97

train model

Files changed (1)
  1. scripts/train_model.py +16 -10
scripts/train_model.py CHANGED
@@ -16,16 +16,16 @@ from transformers import (
 
 
 def _batch_iterator():
-    # code
-    dataset = load_dataset('bigcode/programming-languages-keywords', split='train')
+    ## code
+    # dataset = load_dataset('bigcode/programming-languages-keywords', split='train')
 
-    for row in dataset:
-        for n in row['keywords']:
-            yield n
+    # for row in dataset:
+    #     for n in row['keywords']:
+    #         yield n
 
-    del dataset
-    gc.collect()
-    return
+    # del dataset
+    # gc.collect()
+    # return
 
     # code
     dataset = (
@@ -187,7 +187,14 @@ def batch_iterator():
     for text in _batch_iterator():
         for i in range(0, len(text), 2048):
             chunk = text[i:i + 2048]
-            yield {'text': chunk}
+            tokenized = tokenize_function(chunk)
+            yield tokenized
+
+
+def tokenize_function(text):
+    outputs = tokenizer(text, truncation=True, padding='max_length', max_length=2048)
+    outputs['labels'] = outputs['input_ids'].copy()
+    return outputs
 
 
 tokenizer = AutoTokenizer.from_pretrained('../')
@@ -241,7 +248,6 @@ training_args = TrainingArguments(
     evaluation_strategy='no',
     save_strategy='epoch',
     torch_compile=True,
-    remove_unused_columns=False,
 )
 print(training_args)
 
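For context: after this change, batch_iterator() yields ready-to-train features (input_ids, attention_mask, and labels copied from input_ids, i.e. the standard causal-LM setup) instead of raw text chunks. The commit does not show how the generator is consumed downstream; a minimal sketch, assuming it feeds datasets.Dataset.from_generator and a stock Trainer (the model and Trainer wiring below are hypothetical, not part of this commit):

# Hypothetical consumer of batch_iterator(); not shown in this commit.
from datasets import Dataset
from transformers import Trainer

# Each item yielded by batch_iterator() is already a complete training
# example, so the generator can be materialized directly into a Dataset.
train_dataset = Dataset.from_generator(batch_iterator)

# 'model' and 'training_args' are assumed to be defined elsewhere in
# scripts/train_model.py, as in the hunks above.
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
trainer.train()

A plausible reading of the last hunk: with pre-tokenized examples whose columns all match the model's forward signature, the Trainer default remove_unused_columns=True no longer drops anything needed, so the explicit remove_unused_columns=False can be removed.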