#!/bin/bash
|
VARIANT=7b1ru2 |
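# This tag namespaces every artifact of the run (kill switch, checkpoints, TensorBoard logs)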
|
|
|
set -euo pipefail

# Point convenience symlinks at this job's SLURM log files
ln -f -s $SLURM_JOB_ID.out logs/latest.out
ln -f -s $SLURM_JOB_ID.err logs/latest.err
|
|
|
# Per-run artifact paths; creating the kill-switch file (e.g. touch kill-switch-$VARIANT)
# makes the training loop exit cleanly at its next check
KILL_SWITCH_PATH=kill-switch-$VARIANT
CHECKPOINT_PATH=checkpoints_$VARIANT
TENSORBOARD_PATH=tensorboard_$VARIANT
|
|
|
|
|
TOKENIZER_NAME_OR_PATH=bigscience/tokenizer |
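# bigscience/tokenizer is the BLOOM tokenizer on the Hugging Face Hub, loaded through
# --tokenizer-type PretrainedFromHF below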
|
|
|
TRAIN_DATA_PATH=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-mtf/xp3ru_train.txt
VALID_DATA_PATH=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-mtf/xp3_validation_ru.txt
|
|
|
# No pipeline or tensor parallelism: every GPU holds a full model replica (pure data parallel + ZeRO)
PP_SIZE=1
TP_SIZE=1
|
|
|
MICRO_BATCH_SIZE=2
GRADIENT_ACCUMULATION_STEPS=16
# Data-parallel world size = GPUs per node * number of nodes allocated by SLURM
WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES))
GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) |
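# Worked example (hypothetical allocation; the real node count comes from the SLURM job):
# 4 nodes * 8 GPUs -> WORLD_SIZE=32, so GLOBAL_BATCH_SIZE = 2 * 32 * 16 = 1024 samples per step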
|
|
|
|
|
# Model dimensions (BLOOM-7B1: 30 layers, hidden size 4096, 32 attention heads)
NLAYERS=30
NHIDDEN=4096
NHEADS=32
SEQ_LEN=2048
|
|
|
TRAIN_SAMPLES=6_348_800 |
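# The underscores are only visual separators: --train-samples is parsed as an int, and
# Python's int() accepts "_" in numeric strings (6,348,800 training samples)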
|
|
|
SAVE_INTERVAL=500 |
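# Save a checkpoint every 500 iterations; --eval-interval below uses the same cadence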
|
|
|
ZERO_STAGE=1 |
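# Stage 1 shards only the optimizer states across data-parallel ranks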
|
|
|
mkdir -p ds_configs
config_json="ds_configs/$SLURM_JOB_ID.json"
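# The unquoted heredoc delimiter below means $MICRO_BATCH_SIZE, $GLOBAL_BATCH_SIZE and
# $ZERO_STAGE are expanded into the generated DeepSpeed JSON at submission time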
|
|
|
cat <<EOT > $config_json
{
  "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
  "train_batch_size": $GLOBAL_BATCH_SIZE,
  "gradient_clipping": 1.0,
  "zero_optimization": {
    "stage": $ZERO_STAGE
  },
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 500,
    "hysteresis": 2,
    "min_loss_scale": 1,
    "initial_scale_power": 12
  },
  "steps_per_print": 2000,
  "wall_clock_breakdown": false
}
EOT
|
|
|
|
|
CMD=" \
    Megatron-DeepSpeed/finetune_t0.py \
    --tensor-model-parallel-size $TP_SIZE \
    --pipeline-model-parallel-size $PP_SIZE \
    --num-layers $NLAYERS \
    --hidden-size $NHIDDEN \
    --num-attention-heads $NHEADS \
    --seq-length $SEQ_LEN \
    --max-position-embeddings $SEQ_LEN \
    --micro-batch-size $MICRO_BATCH_SIZE \
    --global-batch-size $GLOBAL_BATCH_SIZE \
    --train-samples $TRAIN_SAMPLES \
    --tokenizer-type PretrainedFromHF \
    --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \
    --init-method-std 0.0048 \
    --embed-layernorm \
    --fp16 \
    --seed 42 \
    --position-embedding-type alibi \
    --abort-on-unmet-fused-kernel-constraints \
    --clip-grad 1.0 \
    --kill-switch-path $KILL_SWITCH_PATH \
    --checkpoint-activations \
    --pad-vocab-size-to 250880 \
    --optimizer adam \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --adam-eps 1e-8 \
    --lr 2e-5 \
    --lr-decay-style constant \
    --lr-warmup-samples 0 \
    --weight-decay 1e-4 \
    --no-load-optim \
    --reset-progress \
    --norm-target-loss \
    --log-interval 10 \
    --save-interval $SAVE_INTERVAL \
    --eval-interval 500 \
    --eval-iters 1 \
    --tensorboard-dir $TENSORBOARD_PATH \
    --tensorboard-queue-size 5 \
    --log-timers-to-tensorboard \
    --log-batch-size-to-tensorboard \
    --log-validation-ppl-to-tensorboard \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH \
    --train-weighted-split-paths-path $TRAIN_DATA_PATH \
    --valid-weighted-split-paths-path $VALID_DATA_PATH \
    --dataloader-type single \
    --data-impl mmap \
    --deepspeed \
    --deepspeed_config $config_json \
    --zero-stage $ZERO_STAGE \
    "
|
|
|
echo $CMD |
|
|
|
echo "START $SLURM_JOBID: $(date)" |
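# launch.sh (site-specific wrapper, not shown here) is assumed to set up the per-rank
# environment on each node and exec python with the arguments collected in $CMD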
|
|
|
srun --label launch.sh $CMD |
|
|
|
echo "END $SLURM_JOBID: $(date)" |
|
|