heterogeneous-distributed-t…/examples/multimodal/pretrain_mistral_clip.sh

#!/bin/bash
# Pretrain a multimodal model.
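#
# NCCL_IB_SL pins the InfiniBand service level used for NCCL traffic, and
# CUDA_DEVICE_MAX_CONNECTIONS=1 is the setting Megatron-LM expects so that
# tensor-parallel communication can be overlapped with compute.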
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
MODEL_NAME="mcore-llava-mistral-7b-instruct-clip336-pretraining"
# Check that the user has set an output path for model checkpoints.
if [[ -z $WORKSPACE ]]; then
    echo "Please set WORKSPACE for storing your model checkpoints."
    exit 1
fi
SOURCE=$(pwd)
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR="${OUTPUT}/checkpoints"
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
if [[ -z $LOAD_NAME ]]; then
    echo "Please set LOAD_NAME for input model name."
    exit 1
fi
if [[ -z $TOKENIZER_MODEL ]]; then
    echo "Please set TOKENIZER_MODEL for tokenizer model name."
    exit 1
fi
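
# Example invocation (a minimal sketch; the workspace path and model/tokenizer
# names below are placeholders, not values shipped with this script):
#
#   WORKSPACE=/path/to/workspace \
#   LOAD_NAME=mistral-7b-instruct-tp4 \
#   TOKENIZER_MODEL=tokenizer.model \
#   bash examples/multimodal/pretrain_mistral_clip.sh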
CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints"
DATA_TRAIN="${SOURCE}/examples/multimodal/pretrain_dataset.yaml"
DATA_VALID="${SOURCE}/examples/multimodal/pretrain_dataset.yaml"
DEBUG=0
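# DEBUG=1 selects a small smoke-test configuration. The short names map to the
# flags assembled below: BZ -> --global-batch-size, NW -> --num-workers,
# HD -> --hidden-dropout, LI -> --log-interval.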
if [[ $DEBUG -eq 1 ]]; then
    BZ=32
    NW=2
    HD=0.0
    LI=1
    EXTRA_ARGS=""
    NONDETERMINISTIC_ATTN=1
else
    BZ=256
    NW=2
    HD=0.1
    LI=10
    EXTRA_ARGS=""
    NONDETERMINISTIC_ATTN=1
fi
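
# Training arguments. The language model (a 32-layer, 4096-hidden Mistral-7B
# configuration split over TP=4) and the CLIP ViT (336x336 images, 14-pixel
# patches) are both frozen here; --allow-missing-vision-projection-checkpoint
# lets the vision projection start from scratch, so it is effectively the part
# being trained in this stage.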
OPTIONS=" \
--img-embedding-idx 1 \
--apply-layernorm-1p \
--attention-softmax-in-fp32 \
--use-checkpoint-args \
--use-distributed-optimizer \
--transformer-impl transformer_engine \
--use-te \
--normalization RMSNorm \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--num-workers ${NW} \
--exit-duration-in-mins 230 \
--use-flash-attn \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--attention-dropout 0.0 \
--hidden-dropout ${HD} \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--num-attention-heads 32 \
--seq-length 2048 \
--max-position-embeddings 4096 \
--ffn-hidden-size 14336 \
--train-iters 20000 \
--micro-batch-size 1 \
--global-batch-size ${BZ} \
--lr-decay-iters 20000 \
--lr-warmup-fraction .01 \
--lr 0.00015 \
--min-lr 1.0e-5 \
--lr-decay-style cosine \
--log-interval ${LI} \
--eval-iters 10 \
--eval-interval 1000 \
--tokenizer-type MistralTokenizer \
--tokenizer-model ${WORKSPACE}/${TOKENIZER_MODEL} \
--data-path ${DATA_TRAIN} \
--valid-path ${DATA_VALID} \
--prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \
--save-interval 1000 \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--split 100,0,0 \
--clip-grad 1.0 \
--weight-decay 1e-2 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--log-params-norm \
--log-num-zeros-in-grad \
--bf16 \
--eod-mask-loss \
--freeze-LM \
--freeze-ViT \
--patch-dim 14 \
--img-h 336 \
--img-w 336 \
--dataloader-type external \
--tensorboard-dir ${TENSORBOARD_DIR} \
--language-model-type=mistral_7b \
--disable-vision-class-token \
${EXTRA_ARGS} \
--distributed-timeout-minutes 60 \
--allow-missing-vision-projection-checkpoint \
"
export NVTE_APPLY_QK_LAYER_SCALING=0
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${NONDETERMINISTIC_ATTN}
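# Launch 8 ranks on this node (one per GPU). With tensor parallel 4 and
# pipeline parallel 1, that leaves a data-parallel size of 2.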
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}