pretrain dataset
Browse files- scripts/TRAIN.md +11 -0
- scripts/pretrain-model.yaml +1 -1
scripts/TRAIN.md
CHANGED
@@ -21,6 +21,17 @@ python -B train_tokenizer.py
 python -B prepare_pretrain_dataset.py
 ```
 
+```python
+from litdata import StreamingDataset, StreamingDataLoader, TokensLoader
+
+dataset = StreamingDataset(
+    input_dir='../pretrain-data/',
+    item_loader=TokensLoader(block_size=2048 + 1),
+)
+
+print(len(dataset))
+```
+
 ## Model
 
 ### Pretrain
|
scripts/pretrain-model.yaml
CHANGED
@@ -77,7 +77,7 @@ train:
 
   # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
   # max_tokens: 3000000000000
-  max_tokens:
+  max_tokens: 36852166560 # 3597088 * 2049 * 5
 
   # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
   max_steps: