bloomz-7b1-4b-ru / sbatch_mtf_4b_ru.sh
#!/bin/bash
#SBATCH --nodes=8
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=32
#SBATCH --mem=256G
#SBATCH -p pilot
#SBATCH -t 48:00:00
#SBATCH --gpus-per-node=mi250:8
#SBATCH --exclusive=user
#SBATCH --hint=nomultithread
#SBATCH --account=project_462000119
#SBATCH -o logs/%j.out
#SBATCH -e logs/%j.err
# If this script is run directly (not via sbatch), submit it and exit.
#if [ -z "${SLURM_JOB_ID:-}" ]; then
#    mkdir -p logs
#    sbatch "$0"
#    exit
#fi
VARIANT=7b1ru2
set -euo pipefail
# symlink logs/latest.out and logs/latest.err
ln -f -s "$SLURM_JOB_ID.out" logs/latest.out
ln -f -s "$SLURM_JOB_ID.err" logs/latest.err
KILL_SWITCH_PATH=kill-switch-$VARIANT
CHECKPOINT_PATH=checkpoints_$VARIANT
TENSORBOARD_PATH=tensorboard_$VARIANT
# Data
TOKENIZER_NAME_OR_PATH=bigscience/tokenizer
TRAIN_DATA_PATH=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-mtf/xp3ru_train.txt
VALID_DATA_PATH=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-mtf/xp3_validation_ru.txt
PP_SIZE=1
TP_SIZE=1
MICRO_BATCH_SIZE=2
GRADIENT_ACCUMULATION_STEPS=16
WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES))
GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS))
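# With the 8 nodes x 8 GPUs requested above, WORLD_SIZE is 64, so the effective
# global batch size is 2 * 64 * 16 = 2048 sequences per optimizer step.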
# Model parameters
NLAYERS=30
NHIDDEN=4096
NHEADS=32
SEQ_LEN=2048
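# Rough size check for these settings: ~12 * NHIDDEN^2 parameters per transformer
# layer (~201M) * 30 layers ~= 6.0B, plus a 250880 x 4096 token embedding (~1.0B),
# i.e. roughly 7.1B parameters, matching the BLOOM-7b1 architecture.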
TRAIN_SAMPLES=6_348_800
SAVE_INTERVAL=500
ZERO_STAGE=1
mkdir -p ds_configs
config_json="ds_configs/$SLURM_JOB_ID.json"
cat <<EOT > "$config_json"
{
  "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
  "train_batch_size": $GLOBAL_BATCH_SIZE,
  "gradient_clipping": 1.0,
  "zero_optimization": {
    "stage": $ZERO_STAGE
  },
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 500,
    "hysteresis": 2,
    "min_loss_scale": 1,
    "initial_scale_power": 12
  },
  "steps_per_print": 2000,
  "wall_clock_breakdown": false
}
EOT
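# DeepSpeed requires train_batch_size == train_micro_batch_size_per_gpu
# * gradient accumulation steps * data-parallel world size; with TP=PP=1 the
# values above satisfy this by construction. Optional sanity check of the
# generated JSON (assumes python3 is available where the script is submitted):
# python3 -c "import json,sys; json.load(open(sys.argv[1]))" "$config_json"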
CMD=" \
Megatron-DeepSpeed/finetune_t0.py \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
--num-layers $NLAYERS \
--hidden-size $NHIDDEN \
--num-attention-heads $NHEADS \
--seq-length $SEQ_LEN \
--max-position-embeddings $SEQ_LEN \
--micro-batch-size $MICRO_BATCH_SIZE \
--global-batch-size $GLOBAL_BATCH_SIZE \
--train-samples $TRAIN_SAMPLES \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \
--init-method-std 0.0048 \
--embed-layernorm \
--fp16 \
--seed 42 \
--position-embedding-type alibi \
--abort-on-unmet-fused-kernel-constraints \
--clip-grad 1.0 \
--kill-switch-path $KILL_SWITCH_PATH \
--checkpoint-activations \
--pad-vocab-size-to 250880 \
--optimizer adam \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--adam-eps 1e-8 \
--lr 2e-5 \
--lr-decay-style constant \
--lr-warmup-samples 0 \
--weight-decay 1e-4 \
--no-load-optim \
--reset-progress \
--norm-target-loss \
--log-interval 10 \
--save-interval $SAVE_INTERVAL \
--eval-interval 500 \
--eval-iters 1 \
--tensorboard-dir $TENSORBOARD_PATH \
--tensorboard-queue-size 5 \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--train-weighted-split-paths-path $TRAIN_DATA_PATH \
--valid-weighted-split-paths-path $VALID_DATA_PATH \
--dataloader-type single \
--data-impl mmap \
--deepspeed \
--deepspeed_config $config_json \
--zero-stage $ZERO_STAGE \
"
echo $CMD
echo "START $SLURM_JOBID: $(date)"
srun --label launch.sh $CMD
echo "END $SLURM_JOBID: $(date)"