mtasic85 committed
Commit
48be84a
1 Parent(s): c6352c2

pretrain dataset

Files changed (2)
  1. scripts/TRAIN.md +11 -0
  2. scripts/pretrain-model.yaml +1 -1
scripts/TRAIN.md CHANGED
@@ -21,6 +21,17 @@ python -B train_tokenizer.py
 python -B prepare_pretrain_dataset.py
 ```
 
+```python
+from litdata import StreamingDataset, StreamingDataLoader, TokensLoader
+
+dataset = StreamingDataset(
+    input_dir='../pretrain-data/',
+    item_loader=TokensLoader(block_size=2048 + 1),
+)
+
+print(len(dataset))
+```
+
 ## Model
 
 ### Pretrain
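The TRAIN.md snippet added above imports `StreamingDataLoader` but only prints the dataset length. As a rough illustration of how that same dataset could be consumed in batches during pretraining, here is a minimal sketch; the `batch_size` and `num_workers` values are assumptions for the example, not settings taken from this repo:

```python
# Minimal sketch (not part of this commit): wrap the streaming dataset in a
# StreamingDataLoader to iterate token blocks in batches.
# batch_size and num_workers below are illustrative assumptions.
from litdata import StreamingDataset, StreamingDataLoader, TokensLoader

dataset = StreamingDataset(
    input_dir='../pretrain-data/',
    item_loader=TokensLoader(block_size=2048 + 1),  # 2048 tokens + 1 for the shifted target
)

dataloader = StreamingDataLoader(dataset, batch_size=4, num_workers=2)

for batch in dataloader:
    # Each batch holds token ids with shape (batch_size, 2049).
    print(batch.shape)
    break
```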
scripts/pretrain-model.yaml CHANGED
@@ -77,7 +77,7 @@ train:
 
   # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
   # max_tokens: 3000000000000
-  max_tokens: 8159107755 # 796399 * 2049 * 5
+  max_tokens: 36852166560 # 3597088 * 2049 * 5
 
   # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
   max_steps:
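For reference, the updated `max_tokens` follows directly from the new dataset length: the number of 2049-token blocks reported by `len(dataset)` times the block size times the number of passes over the data (5, per the inline comment). A quick sanity check of that arithmetic, assuming `len(dataset) == 3597088`:

```python
# Sanity check for the max_tokens value above; assumes len(dataset) == 3_597_088
# blocks of 2049 tokens (2048 + 1) and 5 passes over the pretraining data.
num_blocks = 3_597_088
block_size = 2048 + 1
epochs = 5

print(num_blocks * block_size * epochs)  # 36852166560, the value set in pretrain-model.yaml
```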