#!/bin/bash

# Environment settings for NCCL and Megatron-LM.
export NCCL_IB_SL=1
# Megatron-LM expects this when tensor model parallelism is enabled.
export CUDA_DEVICE_MAX_CONNECTIONS=1
# Enable query-key layer scaling in Transformer Engine attention.
export NVTE_APPLY_QK_LAYER_SCALING=1

# Defaults; "placeholder" is passed through when a task does not need them.
INPUT_METADATA_PATH="placeholder"
GROUNDTRUTH_PATH="placeholder"

# Parse command-line arguments.
while [[ $# -gt 0 ]]; do
    case $1 in
        --input-image-path)
            INPUT_IMAGE_PATH="$2"
            shift
            shift
            ;;
        --input-metadata-path)
            INPUT_METADATA_PATH="$2"
            shift
            shift
            ;;
        -g|--groundtruth-path|--gt-path)
            GROUNDTRUTH_PATH="$2"
            shift
            shift
            ;;
        -o|--output-path)
            OUTPUT_PATH="$2"
            shift
            shift
            ;;
        -m|--model-path)
            MODEL_PATH="$2"
            shift
            shift
            ;;
        -t|--tokenizer-path)
            TOKENIZER_PATH="$2"
            shift
            shift
            ;;
        --task)
            TASK="$2"
            shift
            shift
            ;;
        -*)
            echo "Invalid option $1"
            exit 1
            ;;
        *)
            echo "Unexpected argument $1"
            exit 1
            ;;
    esac
done

# Please modify these as needed.
NUM_PARTITIONS=100
START=0
END=2

# Run generation over each partition of the evaluation set.
for PARTITION_ID in $( eval echo {$START..$END} )
do
    torchrun --nproc_per_node 4 examples/multimodal/run_text_generation.py \
        --use-flash-attn \
        --language-model-type 8b \
        --apply-layernorm-1p \
        --untie-embeddings-and-output-weights \
        --disable-bias-linear \
        --position-embedding-type rope \
        --rotary-percent 0.5 \
        --squared-relu \
        --attention-dropout 0.0 \
        --hidden-dropout 0.0 \
        --tensor-model-parallel-size 4 \
        --pipeline-model-parallel-size 1 \
        --num-layers 32 \
        --hidden-size 4096 \
        --num-attention-heads 32 \
        --max-position-embeddings 4096 \
        --no-masked-softmax-fusion \
        --load ${MODEL_PATH} \
        --tokenizer-type GPTSentencePieceTokenizer \
        --tokenizer-model ${TOKENIZER_PATH} \
        --bf16 \
        --micro-batch-size 1 \
        --seq-length 99 \
        --out-seq-length 700 \
        --temperature 1.0 \
        --img-h 336 \
        --img-w 336 \
        --patch-dim 14 \
        --seed 153 \
        --top_k 1 \
        --disable-vision-class-token \
        --no-load-rng \
        --no-load-optim \
        --input-image-path ${INPUT_IMAGE_PATH} \
        --input-metadata-path ${INPUT_METADATA_PATH} \
        --num-partitions ${NUM_PARTITIONS} \
        --partition-id ${PARTITION_ID} \
        --output-path ${OUTPUT_PATH}/${PARTITION_ID}.jsonl \
        --gt-path ${GROUNDTRUTH_PATH} \
        --task ${TASK}
done
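
# Example invocation, a minimal sketch only: the script name, all paths, and
# the task value below are hypothetical placeholders, not values from this repo.
#
#   bash text_generation.sh \
#       --input-image-path /path/to/images \
#       --input-metadata-path /path/to/metadata.json \
#       --gt-path /path/to/groundtruth.json \
#       --output-path /path/to/output \
#       --model-path /path/to/model/checkpoint \
#       --tokenizer-path /path/to/tokenizer.model \
#       --task captioning
#
# With START=0 and END=2 above, this writes three partition files,
# /path/to/output/0.jsonl through /path/to/output/2.jsonl.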