diff --git a/CHANGELOG.md b/CHANGELOG.md old mode 100755 new mode 100644 diff --git a/CODEOWNERS b/CODEOWNERS old mode 100755 new mode 100644 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md old mode 100755 new mode 100644 diff --git a/Dockerfile.ci.dev b/Dockerfile.ci.dev old mode 100755 new mode 100644 diff --git a/Dockerfile.ci.lts b/Dockerfile.ci.lts old mode 100755 new mode 100644 diff --git a/Dockerfile.linting b/Dockerfile.linting old mode 100755 new mode 100644 diff --git a/GPT_pretraining.sh b/GPT_pretraining.sh old mode 100755 new mode 100644 diff --git a/LICENSE b/LICENSE old mode 100755 new mode 100644 diff --git a/Llama_pretraining.sh b/Llama_pretraining.sh index 265e2b5006374539df4f79de7f5e6d5015a64d4a..1ac8a7356fc0ffc23dc0c343f38a16177513ddb8 100755 --- a/Llama_pretraining.sh +++ b/Llama_pretraining.sh @@ -1,79 +1,64 @@ #!/bin/bash set -eux - #export FLASH_ATTENTION_PRINT_PARAM=1 # Runs the "7B" parameter model export HSA_FORCE_FINE_GRAIN_PCIE=1 export OMP_NUM_THREADS=1 export NCCL_P2P_LEVEL=PXB # SYS - #export HIP_ALLOC_INITIALIZE=0 -#export GPU_MAX_HW_QUEUES=20 # sglang bubbles - +# export GPU_MAX_HW_QUEUES=10 export NCCL_ALGO=Ring export NCCL_NCHANNELS_PER_PEER=16 -export NCCL_MIN_NCHANNELS=20 +export NCCL_MIN_NCHANNELS=32 # 20 +export NCCL_MAX_NCHANNELS=32 # 20 export NCCL_IB_TIMEOUT=22 export CUDA_DEVICE_MAX_CONNECTIONS=1 - -export NCCL_IB_HCA=mlx5_1,mlx5_2 -# export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,,mlx5_4,,mlx5_5,,mlx5_6,,mlx5_7 -export NCCL_NET_GDR_LEVEL=SYS -export NCCL_NET_GDR_READ=0 +export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1 +export NCCL_NET_GDR_LEVEL=7 +export NCCL_NET_GDR_READ=1 +export RCCL_SDMA_COPY_ENABLE=0 +export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml" +# export NCCL_TOPO_FILE="/workspace/rccl-test/rccl-tests-0204/topo-input.xml" export GLOG_minloglevel=3 # print only error-level nccl logs - -# export TORCH_COMPILE_DEBUG=1 # inspect the compiled graph - source /opt/dtk/env.sh # load the hipblaslt library # export LD_LIBRARY_PATH=/data/hipblaslt-install-0904/lib:$LD_LIBRARY_PATH +# export LD_LIBRARY_PATH=/data/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/public/home/xingjl/dependency/hipblaslt-install-0227/lib:$LD_LIBRARY_PATH # update rocblas # export LD_LIBRARY_PATH=/data/rocblas-install_qwen1211/lib:$LD_LIBRARY_PATH # export LD_LIBRARY_PATH=/data/rocblas-install_qwen1228/lib:$LD_LIBRARY_PATH -# export LD_LIBRARY_PATH=/data/rocblas-install_0107_trans/lib:$LD_LIBRARY_PATH - +export LD_LIBRARY_PATH=/public/home/xingjl/dependency/rocblas-install-0224/lib:$LD_LIBRARY_PATH # export LD_LIBRARY_PATH=/data/rocblas-install-0118-bf16/lib:$LD_LIBRARY_PATH # torch: fold multi-stream work onto a single stream -# export ALLREDUCE_STREAM_WITH_COMPUTE=1 - +export ALLREDUCE_STREAM_WITH_COMPUTE=1 +export SENDRECV_STREAM_WITH_COMPUTE=1 # add sync for profiling capture to avoid stalls # export GPU_FLUSH_ON_EXECUTION=1 # export HIP_DIRECT_DISPATCH=0 - # capture rocblas sizes -export ROCBLAS_LAYER=3 +# export ROCBLAS_LAYER=3 # capture fa sizes # export FLASH_ATTENTION_PRINT_PARAM=1 - -CHECKPOINT_PATH=./tmp_7b #$1 # +# increase the compile cache +export cache_size_limit=64 +# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b # +SAVE_PATH=./tmp_7b TENSORBOARD_LOGS_PATH=./tmp_7b #$2 # -DATA_PATH="/data/datasets/nemo_pretrain/oscar-1GB/oscar-1GB-llama_text_document" - +DATA_PATH="/public/home/xingjl/megatron-lm/llama2_dataset/oscar-1GB_head-llama2_text_document" #_text_document +# DATA_PATH="/data/datasets/oscar-1GB-head/oscar-1GB_head-llama2_text_document" #_text_document GPT_MODEL_ARGS=( 
--num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 - --normalization RMSNorm - --position-embedding-type rope + --position-embedding-type rope # none # --untie-embeddings-and-output-weights # untie embedding and output weights for more flexibility ) - -# GPT_MODEL_ARGS=( -# --num-layers 40 -# --hidden-size 5120 -# --ffn-hidden-size 13824 -# --num-attention-heads 40 -# --max-position-embeddings 4096 - -# --normalization RMSNorm -# --position-embedding-type rope -# --untie-embeddings-and-output-weights # untie embedding and output weights for more flexibility -# ) - -# export NVTE_FLASH_ATTN=1 # use cutlass -export NVTE_FLASH_ATTN_TRITON=1 # use triton_fa +export NVTE_FLASH_ATTN=1 # use cutlass +# export NVTE_FLASH_ATTN_TRITON=1 # use triton_fa # --transformer-impl transformer_engine # use these two args for the core path # --use-mcore-models # --transformer-impl local # use these two args for the legacy path @@ -82,8 +67,8 @@ TRAINING_ARGS=( --transformer-impl local # use these two args for the legacy path --use-legacy-models --micro-batch-size 1 - --global-batch-size 64 #240 #60 #512 #64 - --train-iters 10 + --global-batch-size 64 #32 #240 #60 #512 #64 + --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 @@ -96,7 +81,7 @@ TRAINING_ARGS=( --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 - --no-gradient-accumulation-fusion # accuracy is wrong when enabled; can be enabled after apex is updated + # --no-gradient-accumulation-fusion --swiglu --lr 3.0e-5 --lr-decay-style cosine @@ -109,37 +94,52 @@ TRAINING_ARGS=( # --recompute-method block --overlap-grad-reduce # overlap ddp grad reduce # --tp-comm-overlap # overlap tensor parallel comm and gemm; optimization not yet adapted - # --tp-comm-overlap-rs-dgrad # overlap reduce-scatter and dgrad gemm; optimization not yet adapted + # --tp-comm-overlap-rs-dgrad # overlap reduce-scatter and dgrad gemm --use-flash-attn-cutlass ) +# environment variables for torch fa +# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1 +# export TORCHINDUCTOR_BENCHMARK_FUSION=1 +# export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1 +# export TORCHINDUCTOR_MAX_AUTOTUNE=1 +# export TORCHINDUCTOR_CACHE_DIR=./cache # --use-flash-attn-cutlass # cutlass fa # --use-flash-attn-triton # triton fa - +# --use-flash-attn-torch # torch fa MODEL_PARALLEL_ARGS=( --sequence-parallel - --tensor-model-parallel-size 2 + --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 + # --context-parallel-size 2 + # --num-layers-per-virtual-pipeline-stage 4 + # --microbatch-group-size-per-virtual-pipeline-stage 1 +# --no-overlap-p2p-communication # when enabled ) - DATA_ARGS=( --data-path $DATA_PATH --seq-length 4096 #4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer - --tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model + --tokenizer-model /public/home/xingjl/megatron-lm/llama2_dataset/tokenizer.model + # --tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model ) - EVAL_AND_LOGGING_ARGS=( --log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 - --save $CHECKPOINT_PATH - --load $CHECKPOINT_PATH - --eval-iters 10 + #--save $SAVE_PATH + #--load $SAVE_PATH --eval-iters 3 --tensorboard-dir $TENSORBOARD_LOGS_PATH ) - +# FINETUNE_ARGS=( +# # --finetune +# # --pretrained-checkpoint $CHECKPOINT_PATH +# --load $CHECKPOINT_PATH +# --no-load-optim +# --no-load-rng +# ) PROFILE_ARGS=( --profile --profile-step-start 4 @@ -148,20 +148,17 @@ PROFILE_ARGS=( --profile-ranks 0 1 2 3 4 5 6 7 --profile-dir prof_data ) - RANK=$OMPI_COMM_WORLD_RANK LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK WORLD_SIZE=$OMPI_COMM_WORLD_SIZE DIST_URL=${1} -DIST_PORT=34567 - +DIST_PORT=34577 DISTRIBUTED_ARGS=( --rank ${RANK} --world-size ${WORLD_SIZE} --local-rank ${LOCAL_RANK} --dist-url
tcp://${DIST_URL}:${DIST_PORT} ) - APP="python -u pretrain_gpt.py \ ${GPT_MODEL_ARGS[@]} \ ${TRAINING_ARGS[@]} \ @@ -169,53 +166,52 @@ APP="python -u pretrain_gpt.py \ ${DATA_ARGS[@]} \ ${EVAL_AND_LOGGING_ARGS[@]} \ ${DISTRIBUTED_ARGS[@]} \ - " # 开启profile # ${PROFILE_ARGS[@]} \ - +# export HIP_VISIBLE_DEVICES=0,7 # # 4,5,6,7 #, export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # # 4,5,6,7 #, # export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3, -${APP} -# case ${LOCAL_RANK} in -# [0]) -# # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -# ${APP} -# # numactl --cpunodebind=0 --membind=0 ${APP} -# ;; -# [1]) -# # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -# ${APP} -# # numactl --cpunodebind=0 --membind=0 ${APP} -# ;; -# [2]) -# # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -# ${APP} -# # numactl --cpunodebind=0 --membind=0 ${APP} -# ;; -# [3]) -# # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -# ${APP} -# # numactl --cpunodebind=0 --membind=0 ${APP} -# ;; -# [4]) -# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -# ${APP} -# # numactl --cpunodebind=0 --membind=0 ${APP} -# ;; -# [5]) -# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -# ${APP} -# # numactl --cpunodebind=0 --membind=0 ${APP} -# ;; -# [6]) -# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -# ${APP} -# # numactl --cpunodebind=0 --membind=0 ${APP} -# ;; -# [7]) -# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -# ${APP} -# # numactl --cpunodebind=0 --membind=0 ${APP} -# ;; -# esac +# ${APP} +case ${LOCAL_RANK} in +[0]) + export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP} + numactl --cpunodebind=0 --membind=0 ${APP} + ;; +[1]) + export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP} + numactl --cpunodebind=1 --membind=1 ${APP} + ;; +[2]) + export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP} + numactl --cpunodebind=2 --membind=2 ${APP} + ;; +[3]) + export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + numactl --cpunodebind=3 --membind=3 ${APP} + # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP} + ;; +[4]) + export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + numactl --cpunodebind=4 --membind=4 ${APP} + # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP} + ;; +[5]) + export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + numactl --cpunodebind=5 --membind=5 ${APP} + # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP} + ;; +[6]) + export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + numactl --cpunodebind=6 --membind=6 ${APP} + # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP} + ;; +[7]) + export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + numactl --cpunodebind=7 --membind=7 ${APP} + # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP} + ;; +esac \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in old mode 100755 new mode 100644 diff --git a/README.md.origin b/README.md.origin old mode 100755 new mode 100644 diff --git a/docs/llama_mistral.md b/docs/llama_mistral.md old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/context_parallel.rst b/docs/source/api-guide/context_parallel.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/datasets.rst b/docs/source/api-guide/datasets.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/dist_checkpointing.rst b/docs/source/api-guide/dist_checkpointing.rst old mode 
100755 new mode 100644 diff --git a/docs/source/api-guide/dist_checkpointing.strategies.rst b/docs/source/api-guide/dist_checkpointing.strategies.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/dist_optimizer.md b/docs/source/api-guide/dist_optimizer.md old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/distributed.rst b/docs/source/api-guide/distributed.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/encoder_decoder_parallelism.rst b/docs/source/api-guide/encoder_decoder_parallelism.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/fusions.rst b/docs/source/api-guide/fusions.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/index.rst b/docs/source/api-guide/index.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/models.bert.rst b/docs/source/api-guide/models.bert.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/models.gpt.rst b/docs/source/api-guide/models.gpt.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/models.rst b/docs/source/api-guide/models.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/models.t5.rst b/docs/source/api-guide/models.t5.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/moe.rst b/docs/source/api-guide/moe.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/num_microbatches_calculator.rst b/docs/source/api-guide/num_microbatches_calculator.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/optimizer_param_scheduler.rst b/docs/source/api-guide/optimizer_param_scheduler.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/pipeline_parallel.rst b/docs/source/api-guide/pipeline_parallel.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/tensor_parallel.rst b/docs/source/api-guide/tensor_parallel.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/transformer.rst b/docs/source/api-guide/transformer.rst old mode 100755 new mode 100644 diff --git a/docs/source/images/context_parallel/CP_overview.png b/docs/source/images/context_parallel/CP_overview.png old mode 100755 new mode 100644 diff --git a/docs/source/images/context_parallel/CP_results.png b/docs/source/images/context_parallel/CP_results.png old mode 100755 new mode 100644 diff --git a/docs/source/images/distrib_optimizer/data_flow.png b/docs/source/images/distrib_optimizer/data_flow.png old mode 100755 new mode 100644 diff --git a/docs/source/images/distrib_optimizer/sharding_scheme.png b/docs/source/images/distrib_optimizer/sharding_scheme.png old mode 100755 new mode 100644 diff --git a/docs/source/images/moe/token_drop.png b/docs/source/images/moe/token_drop.png old mode 100755 new mode 100644 diff --git a/docs/source/index.rst b/docs/source/index.rst old mode 100755 new mode 100644 diff --git a/docs/source/user-guide/index.rst b/docs/source/user-guide/index.rst old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/README.md b/examples/academic_paper_scripts/detxoify_lm/README.md old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/annotations/filter-selfgeneration.py b/examples/academic_paper_scripts/detxoify_lm/annotations/filter-selfgeneration.py old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/annotations/perspective_api_annotate.py 
b/examples/academic_paper_scripts/detxoify_lm/annotations/perspective_api_annotate.py old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/annotations/preprocess.sh b/examples/academic_paper_scripts/detxoify_lm/annotations/preprocess.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/finetune_gpt.py b/examples/academic_paper_scripts/detxoify_lm/finetune_gpt.py old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/finetune_gpt_distributed-1.3b.sh b/examples/academic_paper_scripts/detxoify_lm/finetune_gpt_distributed-1.3b.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/generate-1.3b.sh b/examples/academic_paper_scripts/detxoify_lm/generate-1.3b.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py b/examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/perspective_api.py b/examples/academic_paper_scripts/detxoify_lm/perspective_api.py old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh b/examples/academic_paper_scripts/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/msdp/README.md b/examples/academic_paper_scripts/msdp/README.md old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/msdp/data_processing.sh b/examples/academic_paper_scripts/msdp/data_processing.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/msdp/eval_knwl_generation.sh b/examples/academic_paper_scripts/msdp/eval_knwl_generation.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/msdp/eval_resp_generation.sh b/examples/academic_paper_scripts/msdp/eval_resp_generation.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/msdp/prep_resp_gen.sh b/examples/academic_paper_scripts/msdp/prep_resp_gen.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/msdp/prompt_knwl_gen.sh b/examples/academic_paper_scripts/msdp/prompt_knwl_gen.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/msdp/prompt_resp_gen.sh b/examples/academic_paper_scripts/msdp/prompt_resp_gen.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/CONFIG.sh b/examples/academic_paper_scripts/sc21/CONFIG.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/README.md b/examples/academic_paper_scripts/sc21/README.md old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/SBATCH.sh b/examples/academic_paper_scripts/sc21/SBATCH.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/SRUN.sh b/examples/academic_paper_scripts/sc21/SRUN.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_figure_11.sh b/examples/academic_paper_scripts/sc21/run_figure_11.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_figure_12.sh b/examples/academic_paper_scripts/sc21/run_figure_12.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_figure_13.sh b/examples/academic_paper_scripts/sc21/run_figure_13.sh old mode 100755 
new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_figure_14.sh b/examples/academic_paper_scripts/sc21/run_figure_14.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_figure_15.sh b/examples/academic_paper_scripts/sc21/run_figure_15.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_figure_16.sh b/examples/academic_paper_scripts/sc21/run_figure_16.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_figure_17.sh b/examples/academic_paper_scripts/sc21/run_figure_17.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_figure_18.sh b/examples/academic_paper_scripts/sc21/run_figure_18.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_table_1.sh b/examples/academic_paper_scripts/sc21/run_table_1.sh old mode 100755 new mode 100644 diff --git a/examples/bert/README.md b/examples/bert/README.md old mode 100755 new mode 100644 diff --git a/examples/bert/train_bert_340m_distributed.sh b/examples/bert/train_bert_340m_distributed.sh old mode 100755 new mode 100644 diff --git a/examples/export/README.md b/examples/export/README.md old mode 100755 new mode 100644 diff --git a/examples/export/knowledge_distillation/pretrain_gpt_modelopt.py b/examples/export/knowledge_distillation/pretrain_gpt_modelopt.py old mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/README.md b/examples/export/ptq_and_trtllm_export/README.md old mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh old mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh old mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh old mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh old mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh old mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_mixtral_8x7b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_mixtral_8x7b.sh old mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/text_generation_ptq.py b/examples/export/ptq_and_trtllm_export/text_generation_ptq.py old mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/trtllm_text_generation.py b/examples/export/ptq_and_trtllm_export/trtllm_text_generation.py old mode 100755 new mode 100644 diff --git a/examples/export/trtllm_export/README.md b/examples/export/trtllm_export/README.md old mode 100755 new mode 100644 diff --git a/examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py b/examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py old mode 100755 new mode 100644 diff --git a/examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py b/examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py old mode 100755 new mode 100644 diff --git 
a/examples/gpt3/README.md b/examples/gpt3/README.md old mode 100755 new mode 100644 diff --git a/examples/gpt3/gpt_config.yaml b/examples/gpt3/gpt_config.yaml old mode 100755 new mode 100644 index 443e4b79b88daf8d3c3b0ed0bc5cae04529db940..06257827fdfbd32d262d0da032930ebbaaf578aa --- a/examples/gpt3/gpt_config.yaml +++ b/examples/gpt3/gpt_config.yaml @@ -63,6 +63,7 @@ language_model: # MoE related moe_router_load_balancing_type: "aux_loss" moe_router_topk: 2 + moe_router_topk_limited_devices: null moe_grouped_gemm: False moe_aux_loss_coeff: 0 # 1e-2 would be a good start value for load balance loss. moe_z_loss_coeff: null # 1e-3 would be a good start value for z-loss diff --git a/examples/gpt3/train_gpt3_175b_distributed.sh b/examples/gpt3/train_gpt3_175b_distributed.sh old mode 100755 new mode 100644 diff --git a/examples/inference/README.md b/examples/inference/README.md old mode 100755 new mode 100644 index bd8e738e55b60f38c94323a7adf445e3f7474a7e..b4b07cbc6ab88a1b3453bcecbb9534d8026a6f64 --- a/examples/inference/README.md +++ b/examples/inference/README.md @@ -1,5 +1,5 @@ ### Megatron Core Inference Documentation -This guide will walk you through how you can use megatron core for inference on your models. +This guide provides an example for Megatron Core for running model inference. ### Contents - [Megatron Core Inference Documentation](#megatron-core-inference-documentation) @@ -18,21 +18,21 @@ This guide will walk you through how you can use megatron core for inference on
#### 1. Quick Start -This will walk you through the flow of running batch inference on a GPT model trained using megatron core. The file can be found at [simple_gpt_batch_inference.py](./gpt/simple_gpt_batch_inference.py) +This example runs batch inference on a GPT model trained using Megatron Core. The entrypoint is [gpt_batch_inference.py](./gpt/gpt_batch_inference.py)
-##### 1.1 Understanding The Code -***STEP 1 - We initialize model parallel and other default arguments*** -We can default micro batch size to be 1, since for TP models it is not used, and for PP models it is calculated during runtime. +##### 1.1 Code Walkthrough +***STEP 1 - Initialize model parallel and other default arguments*** +The micro batch size is set to 1, since it is not used for tensor-parallel-only models, and for pipeline-parallel models it is calculated at runtime. ```python initialize_megatron( args_defaults={'no_load_rng': True, 'no_load_optim': True, 'micro_batch_size': 1} ) ``` -***STEP 2 - We load the model using the model_provider_function*** -NOTE: The model provider function in the script supports MCore and Legacy models. +***STEP 2 - Load the model using the model_provider_function*** +NOTE: The model provider function supports both MCore and Legacy models. ```python model = get_model(model_provider, wrap_with_ddp=False) @@ -41,10 +41,10 @@ NOTE: The model provider function in the script supports MCore and Legacy models ``` ***STEP 3 - Choose an engine*** -One of the important elements of the generate function is an inference engine. In this example we will be choosing the [megatron core engine](../../megatron/core/inference/engine/mcore_engine.py) with a [simple text generation controller](../../megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py), the default engine. Other engines that will be supported in the future are TRTLLMEngine. +Text generation requires an inference engine, which includes a scheduler. The default engine is the [Megatron Core engine](../../megatron/core/inference/engine/mcore_engine.py) with a simple [text generation controller](../../megatron/core/inference/text_generation_controllers/text_generation_controller.py). TRTLLMEngine will be supported in the future. ```python inference_wrapped_model = GPTInferenceWrapper(model, args) - text_generation_controller = SimpleTextGenerationController( + text_generation_controller = TextGenerationController( inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer ) @@ -53,12 +53,12 @@ One of the important elements of the generate function is an inference engine. I ) ``` -***STEP 4 - Run the generate function and display results*** -We use default values for the [common inference params](../../megatron/core/inference/common_inference_params.py). Customize this if you want to change top_p, top_k, number of tokens to generate etc. -*Note that the result is returned as a list of [InferenceRequests](../../megatron/core/inference/inference_request.py)* +***STEP 4 - Run text generation*** +The [SamplingParams](../../megatron/core/inference/sampling_params.py) contains suggested defaults. Customize this to change top_p, top_k, number of tokens to generate etc. +*Note: The result is returned as a list of [InferenceRequests](../../megatron/core/inference/inference_request.py)* ```python results: List[InferenceRequest] = inference_engine.generate( - prompts=args.prompts, common_inference_params=common_inference_params + prompts=args.prompts, sampling_params=sampling_params ) if torch.distributed.get_rank() == 0: @@ -76,12 +76,12 @@ We use default values for the [common inference params](../../megatron/core/infe
##### 1.2 Running The Code -An example run script is shown below. Change the tokenizer paths, inference params, and other settings for your model. +An example run script is shown below. Set the tokenizer paths, inference params, and other settings appropriately. -For a quick recap on inference params refer to [this blog](https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-temperature-parameters-ed6a31313910) +For a quick recap on sampling parameters, refer to [this blog](https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-temperature-parameters-ed6a31313910). ``` -#In a slurm cluster (You could also use docker) +# In a slurm cluster (You could also use docker) ACCOUNT= MLM_PATH=/path/to/megatron-lm GPT_CKPT=/path/to/gpt/ckpt @@ -133,8 +133,8 @@ NOTE: Other parameters which can be customized for inference are :- --top_p (top_p sampling) --num-tokens-to-generate (Number of tokens to generate for each prompt) --inference-batch-times-seqlen-threshold (During inference, if batch-size times sequence-length is smaller than this threshold then we will not use pipelining, otherwise we will.') ---use-dist-ckpt (If you are using dist checkpoint format for the model) ---use-legacy-models (If you are using legacy gpt model instead of mcore gpt model) +--use-dist-ckpt (If using dist checkpoint format for the model) +--use-legacy-models (If using legacy gpt model instead of mcore gpt model) ``` @@ -142,16 +142,17 @@ NOTE: Other parameters which can be customized for inference are :-
-#### 2. Flow of Control In MCore Backend -The following is what happens in the [simple_gpt_batch_inference.py](./gpt/simple_gpt_batch_inference.py). -* We call [mcore_engine](../../megatron/core/inference/engines/mcore_engine.py) **generate()** function with all our input prompts. -* The scheduler in the engine will add these prompts to the [active requests] pool (../../megatron/core/inference/inference_request.py) until we hit the max batch size, and then it will put the rest in the waiting requests pool. -* The engine will then run until all requests (waiting + active) are completed +#### 2. Control Flow in the MCore Backend +An example of inference with static batching is provided in [gpt_batch_inference.py](./gpt/gpt_batch_inference.py). +* The [mcore_engine](../../megatron/core/inference/engines/mcore_engine.py) **generate()** function is called with the input prompts. +* The `Scheduler` in the engine will add these prompts to the [active requests pool](../../megatron/core/inference/inference_request.py) until max batch size is hit. Remaining requests will be added to the waiting requests pool. +* The engine will run until all requests (waiting + active) are completed. * The active requests are passed into **generate_all_output_tokens_static_batch()** of the text generation controller . - * This function uses the [model_inference_wrappers](../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) **prep_model_for_inference()** , and then runs an auto regressive loop - * In the auto regressive loop, the **get_batch_for_context_window()** method of the inference wrapper is called to get the required input, passes it into the **run_one_forward_step()** method, which calls the appropriate (PP, TP) model `.forward()` methods to get the output logits - * The output logits are synchronized across all pipeline parallel ranks - * The text generation controller obtains the log probabilities and samples tokens based on the strategy defined in the common inference parameters. + * This function uses the **prep_model_for_inference()** method of the [model_inference_wrappers](../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) and runs an autoregressive sampling loop + * In the autoregressive loop, the **get_batch_for_context_window()** method of the inference wrapper is called to slice out the input tokens and masks + * Input tokens and masks are passed into the **run_one_forward_step()** method, which calls the model `.forward()` method to get the output logits + * Output logits are synchronized across all pipeline parallel ranks + * The text generation controller obtains the log probabilities and samples tokens based on the strategy defined in the sampling parameters. * The sampled tokens are then appended to the input prompt tokens for the next iteration * The **update_generation_status()** method of the text generation controller checks which prompts have finished generating or hit a stop condition * After the inference loop, the result is detokenized and stored as an attribute of the InferenceRequest. These requests are marked as completed. @@ -160,16 +161,18 @@ The following is what happens in the [simpl
#### 3. Customizing The Inference Pipeline -The following guide will walk you through how you can customize different parts of the inference pipeline. There are three levels at which you can customize the pipeline. -* **Inference engine** - Highest level of customization. Currently we support the MCore Engine. Change this to add a new engine. -* **Text generation controller** - Extend this to customize tokenization, detokenization, or implement a new sampling strategy. + +The inference pipeline supports three levels of customization: + +* **Inference engine** - The MCore Engine is currently supported. Change this to add a new backend. +* **Text generation controller** - The main sampling loop. This can be customized to support alternative tokenization, detokenization, or to implement a new sampling strategy. * **Inference Wrapped Model** - Change this to support a new model. * **Modify Inference Parameters** - Change this to update top_p, top_k, number of tokens to be generated, temperature, or other sampling parameters.
##### 3.1. Create Your Own Inference Backend -This is the highest level of customization. The [abstract_engine.py](./../../megatron/core/inference/engine/abstract_engine.py) file has a generate method that can be extended to support a new backend. +The [abstract_engine.py](./../../megatron/core/inference/engine/abstract_engine.py) file contains a `generate` method that can be extended to support a new backend. ```python class AbstractEngine(ABC): @@ -177,15 +180,17 @@ class AbstractEngine(ABC): def generate(self) -> dict: """The abstract backend's generate function. - To define your own backend, make sure you implement this and return the outputs as a dictionary . - + To define a new backend, implement this method and return the outputs as a dictionary. +```
-##### 3.2. Create Your Own Text Generation Controller -In case you want to use the megatron core backend, but would like to overwrite the tokenization, text generation or detokenization extend the [simple_text_generation_controller.py](../../megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py). The class has the following methods +##### 3.2. Implement a new Sampling Loop + +The [TextGenerationController](../../megatron/core/inference/text_generation_controllers/text_generation_controller.py) contains the main sampling loop and can be modified to support new tokenization, detokenization, or sampling strategies. + ``` python -class SimpleTextGenerationController: +class TextGenerationController: def tokenize_prompt(self, prompt: str) -> Tuple[torch.Tensor, torch.Tensor]: """Utility to tokenize the input prompts""" @@ -193,12 +198,12 @@ class SimpleTextGenerationController: def sample_from_logits( self, last_token_logits: torch.Tensor, - common_inference_params: CommonInferenceParams, + sampling_params: SamplingParams, vocab_size: int, ) -> torch.Tensor: """Samples the logits to generate outputs - Given the logits of the last token, this function samples it according to the parameters defined in common_inference_params and returns the samples + Given the logits of the last token, this function samples according to the parameters defined in sampling_params and returns the sampled tokens. """ def update_generation_status( @@ -229,12 +234,12 @@ class SimpleTextGenerationController:
##### 3.3. Support Other Models -In order to support other models please extend the [abstract_model_inference_wrapper.py](./../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) file. The abstract wrapper already supports the following : -* Forward method which automatically calls the appropriate forward method (PP or TP etc) depending on model parallel settings -* Initalizes the model and puts it in eval mode -* Obtains the input parameters (batch size, max seq length) and has an instance of the input +Extend [abstract_model_inference_wrapper.py](./../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) to support other models. The abstract model wrapper implements: +* Forward method which calls the model `forward` method depending on model parallel settings +* Initializes the model and puts it in `.eval()` mode +* Setup for the input parameters (max batch size, max seq length) -The main methods to change for your model might be the following: +The following methods should be implemented: ```python class AbstractModelInferenceWrapper: def prep_model_for_inference(self, prompts_tokens: torch.Tensor): @@ -247,28 +252,28 @@ class AbstractModelInferenceWrapper: def get_batch_for_context_window(self) -> List: """Returns the input data for inference - This function gets called iteratively in the inference loop . It can be used to extract relevant input from the prompt tokens, attention mask etc. required for each step in inference. + This function gets called iteratively in the inference loop. It can be used to extract relevant input from the prompt tokens, attention mask etc. required for each step in inference. ``` -Refer to [gpt_inference_wrapper.py](../../megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py) for an example of extending this for GPTModel. +Refer to [gpt_inference_wrapper.py](../../megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py) for an example of implementing this for GPTModel.
##### 3.3. Modify Inference Parameters -We use [common inference params](../../megatron/core/inference/common_inference_params.py) for text generation. Customize this if you want to change top_p, top_k, number of tokens to generate etc. If you want to add other attributes that you would use in the inference loop, you can do that as shown below +We use [sampling params](../../megatron/core/inference/sampling_params.py) for text generation. Customize this if you want to change top_p, top_k, number of tokens to generate etc. If you want to add other attributes that you would use in the inference loop, you can do that as shown below ``` -from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.sampling_params import SamplingParams -c = CommonInferenceParams(temperature=0.5) +c = SamplingParams(temperature=0.5) c.add_attributes({'min_length':4, 'eod_id':153}) ```
#### 4. Future work -The following are planned for the future releases . +The following features are planned for the future releases. * Dynamic batching * Paged Attention * TRTLLM Engine support -* Support for Multimodal model inference \ No newline at end of file +* Support for multimodal inference \ No newline at end of file diff --git a/examples/inference/gpt/simple_gpt_batch_inference.py b/examples/inference/gpt/gpt_batch_inference.py old mode 100755 new mode 100644 similarity index 91% rename from examples/inference/gpt/simple_gpt_batch_inference.py rename to examples/inference/gpt/gpt_batch_inference.py index 5c7ae5bd773cd41437650caa01e06664c7e506c2..050b230cef70d56203b7f9270a6166d7251f0769 --- a/examples/inference/gpt/simple_gpt_batch_inference.py +++ b/examples/inference/gpt/gpt_batch_inference.py @@ -6,10 +6,10 @@ import sys from argparse import Namespace from megatron.core.inference.engines.abstract_engine import AbstractEngine from megatron.core.inference.engines.mcore_engine import MCoreEngine -from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import GPTInferenceWrapper from megatron.core.inference.inference_request import InferenceRequest -from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import SimpleTextGenerationController +from megatron.core.inference.text_generation_controllers.text_generation_controller import TextGenerationController from megatron.core.transformer.module import MegatronModule sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))) @@ -66,7 +66,7 @@ def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngi ) inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config) - text_generation_controller = SimpleTextGenerationController(inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer) + text_generation_controller = TextGenerationController(inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer) return MCoreEngine(text_generation_controller=text_generation_controller, max_batch_size=args.max_batch_size) def main(): @@ -89,7 +89,7 @@ def main(): inference_engine = get_inference_engine(args, model) - common_inference_params = CommonInferenceParams( + sampling_params = SamplingParams( temperature=args.temperature, top_k=args.top_k, top_p=args.top_p, @@ -97,7 +97,7 @@ def main(): num_tokens_to_generate=args.num_tokens_to_generate) results: List[InferenceRequest] = inference_engine.generate( - prompts=args.prompts, common_inference_params=common_inference_params + prompts=args.prompts, sampling_params=sampling_params ) if torch.distributed.get_rank() == 0: diff --git a/examples/inference/llama_mistral/huggingface_reference.py b/examples/inference/llama_mistral/huggingface_reference.py old mode 100755 new mode 100644 diff --git a/examples/inference/llama_mistral/run_text_generation_llama3.1.sh b/examples/inference/llama_mistral/run_text_generation_llama3.1.sh old mode 100755 new mode 100644 diff --git a/examples/inference/llama_mistral/run_text_generation_llama3.sh b/examples/inference/llama_mistral/run_text_generation_llama3.sh old mode 100755 new mode 100644 diff --git a/examples/inference/llama_mistral/run_text_generation_mistral.sh b/examples/inference/llama_mistral/run_text_generation_mistral.sh old mode 100755 new mode 
100644 diff --git a/examples/inference/run_text_generation_server_345M.sh b/examples/inference/run_text_generation_server_345M.sh old mode 100755 new mode 100644 diff --git a/examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh b/examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh old mode 100755 new mode 100644 diff --git a/examples/inference/t5/simple_t5_batch_inference.py b/examples/inference/t5/simple_t5_batch_inference.py old mode 100755 new mode 100644 index 3f4557d3c2dac2ae1394adfae6d79899d9b0aa11..b4226d7de0f8352fd74bedf047559f0a7819ea84 --- a/examples/inference/t5/simple_t5_batch_inference.py +++ b/examples/inference/t5/simple_t5_batch_inference.py @@ -5,7 +5,7 @@ from argparse import Namespace import torch import pretrain_t5 -from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.engines.abstract_engine import AbstractEngine from megatron.core.inference.engines.mcore_engine import MCoreEngine from megatron.core.inference.inference_request import InferenceRequest @@ -120,7 +120,7 @@ def main(): inference_engine = get_inference_engine(args, model) - common_inference_params = CommonInferenceParams( + sampling_params = SamplingParams( temperature=args.temperature, top_k=args.top_k, top_p=args.top_p, @@ -138,7 +138,7 @@ def main(): prompts=args.prompts, add_BOS=True, encoder_prompts=args.encoder_prompts, - common_inference_params=common_inference_params, + sampling_params=sampling_params, ) if torch.distributed.get_rank() == 0: diff --git a/examples/mamba/.gitignore b/examples/mamba/.gitignore old mode 100755 new mode 100644 diff --git a/examples/mamba/Dockerfile b/examples/mamba/Dockerfile old mode 100755 new mode 100644 diff --git a/examples/mamba/README.md b/examples/mamba/README.md old mode 100755 new mode 100644 diff --git a/examples/mamba/run_text_gen_server_8b.sh b/examples/mamba/run_text_gen_server_8b.sh old mode 100755 new mode 100644 diff --git a/examples/mamba/run_text_gen_server_8b_gpt3.sh b/examples/mamba/run_text_gen_server_8b_gpt3.sh old mode 100755 new mode 100644 diff --git a/examples/mamba/train.sh b/examples/mamba/train.sh old mode 100755 new mode 100644 diff --git a/examples/mixtral/README.md b/examples/mixtral/README.md old mode 100755 new mode 100644 diff --git a/examples/mixtral/train_mixtral_8x7b_distributed.sh b/examples/mixtral/train_mixtral_8x7b_distributed.sh old mode 100755 new mode 100644 diff --git a/examples/multimodal/Dockerfile b/examples/multimodal/Dockerfile old mode 100755 new mode 100644 diff --git a/examples/multimodal/README.md b/examples/multimodal/README.md old mode 100755 new mode 100644 index 62e47567b939865fa73346dc8e452f18f02685b4..a65839f8f15f0ada9a38bc5081e74e6251c298d6 --- a/examples/multimodal/README.md +++ b/examples/multimodal/README.md @@ -16,7 +16,7 @@ You can build a docker container using `examples/multimodal/Dockerfile` to run t ### Language model -Follow the instructions in [Mistral](../../docs/llama_mistral.md#mistral-7b) to download weights for Mistral-7B-Instruct-v0.3 (Base or Instruct) from HuggingFace and convert to mcore format with tensor parallel size 4. +Follow the instructions in [Mistral](../../docs/llama_mistral.md#mistral-7b) to download weights for Mistral-7B-Instruct-v0.3 from HuggingFace and convert to mcore format with tensor parallel size 4. Please use the tokenizer from HuggingFace. 
### Vision model @@ -113,7 +113,7 @@ Run the following script: ``` examples/multimodal/text_generation_mistral_clip.sh --input-image-path /path/to/input/images --output-path /some/output/directory \ - --model-path /path/to/model.pt --tokenizer-path /path/to/tokenizer/ --gt-path /path/to/groundtruth/file --task generation-task-name + --model-path /path/to/model.pt --gt-path /path/to/groundtruth/file --task generation-task-name ``` where `--task generation-task-name` is the name of the evaluation benchmark such as `captioning` or `MMMU`. diff --git a/examples/multimodal/assets/pretrain_curves.png b/examples/multimodal/assets/pretrain_curves.png old mode 100755 new mode 100644 diff --git a/examples/multimodal/combine_lm_vision_checkpoints.sh b/examples/multimodal/combine_lm_vision_checkpoints.sh old mode 100755 new mode 100644 diff --git a/examples/multimodal/combine_state_dicts.py b/examples/multimodal/combine_state_dicts.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/config.py b/examples/multimodal/config.py old mode 100755 new mode 100644 index 343fcd589688b3e5bf1254189450e4fb06b88b6f..ee404604b650d32f4535a53dfba24498d9ab4f77 --- a/examples/multimodal/config.py +++ b/examples/multimodal/config.py @@ -7,34 +7,20 @@ from megatron.training.activations import fast_gelu, quick_gelu, squared_relu def get_language_model_config(config): - if config.language_model_type == "2b": + if config.language_model_type == "llama3_8b": + config.activation_func = torch.nn.functional.silu config.add_bias_linear = False config.bias_activation_fusion = False config.gated_linear_unit = True - config.apply_query_key_layer_scaling = True - config.layernorm_zero_centered_gamma = True - config.bias_dropout_fusion = False - config.rotary_percent = 0.5 - config.apply_rope_fusion = False - config.attention_softmax_in_fp32 = True - elif config.language_model_type == "8b": - config.add_bias_linear = False - config.bias_activation_fusion = False - config.gated_linear_unit = False - config.apply_query_key_layer_scaling = True - config.layernorm_zero_centered_gamma = True + config.apply_query_key_layer_scaling = False + config.layernorm_zero_centered_gamma = ( + False # Zero centered gamma not supported for RMSNorm + ) config.bias_dropout_fusion = False - config.rotary_percent = 0.5 - config.attention_dropout = 0.0 config.apply_rope_fusion = False - config.activation_func = squared_relu - config.ffn_hidden_size = 16384 - config.masked_softmax_fusion = True config.attention_softmax_in_fp32 = True - config.num_query_groups = 32 - config.kv_channels = 128 - config.rotary_interleaved = False - elif config.language_model_type == "llama3_8b": + config.ffn_hidden_size = 14336 + elif config.language_model_type == "mistral_7b": config.activation_func = torch.nn.functional.silu config.add_bias_linear = False config.bias_activation_fusion = False @@ -47,7 +33,7 @@ def get_language_model_config(config): config.apply_rope_fusion = False config.attention_softmax_in_fp32 = True config.ffn_hidden_size = 14336 - elif config.language_model_type == "mistral_7b": + elif config.language_model_type == "yi-34b": config.activation_func = torch.nn.functional.silu config.add_bias_linear = False config.bias_activation_fusion = False @@ -59,10 +45,11 @@ def get_language_model_config(config): config.bias_dropout_fusion = False config.apply_rope_fusion = False config.attention_softmax_in_fp32 = True - config.ffn_hidden_size = 14336 - elif config.language_model_type == "yi-34b": + config.ffn_hidden_size = 20480 + elif 
config.language_model_type == "qwen2.5_7B": config.activation_func = torch.nn.functional.silu config.add_bias_linear = False + config.add_qkv_bias = True config.bias_activation_fusion = False config.gated_linear_unit = True config.apply_query_key_layer_scaling = False @@ -72,7 +59,7 @@ def get_language_model_config(config): config.bias_dropout_fusion = False config.apply_rope_fusion = False config.attention_softmax_in_fp32 = True - config.ffn_hidden_size = 20480 + config.ffn_hidden_size = 18944 elif config.language_model_type == "qwen2.0_72B": config.activation_func = torch.nn.functional.silu config.add_bias_linear = False @@ -168,13 +155,7 @@ def get_vision_projection_config(config, hidden_size): config.bias_activation_fusion = False config.add_bias_linear = False config.hidden_size = hidden_size # Used as the vision projection output size, i.e., the input to the language model. - if config.language_model_type == "2b": - config.ffn_hidden_size = 5440 - config.activation_func = torch.nn.functional.gelu - if config.language_model_type == "8b": - config.ffn_hidden_size = 16384 - config.activation_func = squared_relu - elif config.language_model_type == "llama3_8b": + if config.language_model_type == "llama3_8b": config.ffn_hidden_size = 14336 config.activation_func = torch.nn.functional.gelu elif config.language_model_type == "mistral_7b": @@ -185,6 +166,9 @@ def get_vision_projection_config(config, hidden_size): config.ffn_hidden_size = 20480 config.normalization = "LayerNorm" config.activation_func = torch.nn.functional.gelu + elif config.language_model_type == "qwen2.5_7B": + config.ffn_hidden_size = 3584 + config.activation_func = torch.nn.functional.gelu elif config.language_model_type == "qwen2.0_72B": config.ffn_hidden_size = 29568 config.normalization = "LayerNorm" diff --git a/examples/multimodal/convert_llava_pretrain_to_wds.py b/examples/multimodal/convert_llava_pretrain_to_wds.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/dataloader_provider.py b/examples/multimodal/dataloader_provider.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/dataset_helpers.py b/examples/multimodal/dataset_helpers.py old mode 100755 new mode 100644 index de76f8e45e3a32e3e2a429128ee484d4185e39f9..ecbbc502c08bcda12d52c74eaabdbd3ffc3d774b --- a/examples/multimodal/dataset_helpers.py +++ b/examples/multimodal/dataset_helpers.py @@ -2,16 +2,19 @@ import bisect import dataclasses import json +import re import sys import traceback from dataclasses import dataclass from typing import Dict, List, Optional, Tuple, Union from image_processing import get_visual_transform +from PIL import Image +from torchvision.transforms import ToPILImage import numpy as np import torch -from megatron.core.models.multimodal.llava_model import IGNORE_INDEX, IMAGE_TOKEN +from megatron.core.models.multimodal.llava_model import IGNORE_INDEX, IMAGE_TOKEN, VIDEO_TOKEN from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings from megatron.energon import ( Batch, @@ -175,6 +178,10 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, self.img_h, self.img_w = self.args.img_h, self.args.img_w + # This map is used to reduce the number of tiles used per image if the number of tokens is + # larger than the decoder_seq_length. 
+ self.num_tiles_degradation_map = {12:8, 8:6, 6:4, 4:2, 2:1, 1:1} + def _get_total_seq_length(self, input_ids, num_tiles): """Calculate expected sequence length given text tokens length and number of tiles.""" total_num_images = len(num_tiles) @@ -237,7 +244,7 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, prompt_idx = np.random.randint(len(prompt_list)) cur_prompt = prompt_list[prompt_idx] - cur_prompt = "\n" + cur_prompt + "\n" + cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt + "\n" caption = sample.caption.strip() @@ -282,7 +289,7 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, # LLAVA training: override text-prompt with just the image. conv = [ # Note: no system message. - {"role": "user", "content": "\n"}, + {"role": "user", "content": IMAGE_TOKEN + "\n"}, {"role": "assistant", "content": sample.answers}, ] @@ -307,66 +314,130 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, """Encode SFT sample.""" augment = sample.__subflavors__['augmentation'] if 'augmentation' in sample.__subflavors__ else False has_video = sample.__subflavors__['has_video'] if 'has_video' in sample.__subflavors__ else False - has_image = sample.__subflavors__['has_image'] if 'has_image' in sample.__subflavors__ else False - has_image = has_image or (hasattr(sample, "images") and len(sample.images) > 0) - if has_video: - # Grab the selected frames of the video as a tensor with shape - # fhwc: (num_frames, height, width, num_channels). - video_fhwc = sample.images[0].permute(0, 2, 3, 1) - selected_frames = torch.linspace( - 0, video_fhwc.shape[0] - 1, self.args.num_frames).long() - video_frame_fhwc = video_fhwc[selected_frames] - imgs = [] - for video_frame_hwc in video_frame_fhwc: - imgs += get_visual_transform( - video_frame_hwc, self.img_h, self.img_w, - self.args.use_tiling, self.args.max_num_tiles, - self.args.use_thumbnail, augment, self.args.vision_model_type) - num_tiles = [len(imgs)] - elif has_image: - imgs = get_visual_transform( - sample.images[0], self.img_h, self.img_w, self.args.use_tiling, self.args.max_num_tiles, self.args.use_thumbnail, augment, - self.args.vision_model_type, - ) - num_tiles = [len(imgs)] - else: - imgs = num_tiles = [] - sample.__key__ = "{}-{}".format("no-image", sample.__key__) + has_image = False + if hasattr(sample, "images"): + # If this is a text-only sample and we are freezing the LM, + # then use a dummy input image. + if len(sample.images) == 0 and self.args.freeze_LM: + empty_img = Image.new('RGB', (self.args.img_w, self.args.img_h), (255, 255, 255)) + sample.images.append(empty_img) + if len(sample.images) > 0 and not has_video: + has_image = True - conversation = [] # Note: Some tokenizers may ignore the system prompt. - conversation.append({"role": "system", "content": "Answer the questions."}) - - has_image_token = False - + conversation = [{"role": "system", "content": "Answer the questions."}] + # Format the conversation as a list of "user" / "assistant" turns. for text in sample.texts: - if IMAGE_TOKEN in text["value"]: - has_image_token = True - - if text["from"] == "human": - role = "user" - elif text["from"] == "gpt": - role = "assistant" - else: - raise RuntimeError(f"unexpected role {text['from']} in {sample.texts}") - - turn = {"role": role, "content": text["value"]} - conversation.append(turn) - - # If the sample contains an image but none of the user messages has an image token, - # then add it to the first user message. 
- if len(imgs) > 0 and not has_image_token: + error_msg = f"unexpected role {text['from']} in {sample.texts}" + assert text["from"] in ["human", "gpt"], error_msg + conversation.append({ + "role": "user" if text["from"] == "human" else "assistant", + "content": text["value"]}) + + # Replace the image tags with IMAGE_TOKEN and count the number of image tags + number_image_tags = 0 + image_tag_ids_list = [] + for turn in conversation: + if turn["role"] == "user": + image_tag_ids = [int(x) - 1 for x in re.findall(r"", turn["content"])] + image_tag_ids_list.extend(image_tag_ids) + turn["content"] = re.sub(r"", IMAGE_TOKEN, turn["content"]) + number_image_tags += turn["content"].count(IMAGE_TOKEN) + # For videos, we replace the image tag with the video tag + if has_video: + turn["content"] = turn["content"].replace(IMAGE_TOKEN, VIDEO_TOKEN) + + # We re-order the images in sample.images according to how they appear in the conversation. + if len(image_tag_ids_list) > 0: + sample.images = [sample.images[idx] for idx in image_tag_ids_list] + + # If there is only one image, but several image tags, we assume all the tags refer to the + # same image and duplicate the image: + if len(sample.images) == 1 and number_image_tags > 1: + sample.images = sample.images * number_image_tags + + number_of_images = len(sample.images) + # Fail if there are more image or video tags than image or videos: + error_msg = ( + f"Found {number_image_tags} image tags for {number_of_images} images. {sample.texts}") + assert number_image_tags <= number_of_images, error_msg + + # If there are less image of video tags than image or videos, prepend the tags to the first + # user message: + if number_image_tags < number_of_images: for turn in conversation: if turn["role"] == "user": - turn["content"] = f"{IMAGE_TOKEN}\n" + turn["content"] + tag_to_add = VIDEO_TOKEN if has_video else IMAGE_TOKEN + turn["content"] = tag_to_add*(number_of_images-number_image_tags) + "\n" + turn["content"] break input_ids, target = self.tokenizer.tokenize_conversation(conversation, True, False) + if has_image: + imgs = [] + num_tiles = [] + max_num_tiles = self.args.max_num_tiles + # We keep a buffer of 4 tokens for the question, + # the rest can be used for image tokens. + max_image_token_allowed = self.args.decoder_seq_length - len(input_ids) - 4 + # We start by extracting as many tiles per image as possible, and decrease the max + # number of tiles if there are too many image tokens. + while True: + imgs = [] + num_tiles = [] + for img in sample.images: + img_tiles = get_visual_transform( + img, self.img_h, self.img_w, self.args.use_tiling, max_num_tiles, + self.args.use_thumbnail, augment, self.args.vision_model_type) + imgs += img_tiles + num_tiles += [len(img_tiles)] + if max_num_tiles == 1: + break + if sum(num_tiles) * self.token_per_img_tile > max_image_token_allowed: + if max_num_tiles in self.num_tiles_degradation_map: + max_num_tiles = self.num_tiles_degradation_map[max_num_tiles] + else: + raise RuntimeError(( + f"Tried to decrease the number of tiles {max_num_tiles} but it's not ", + f"defined in the degradation map {self.num_tiles_degradation_map}")) + else: + break + elif has_video: + # We don't use tiling for videos to limit the number of tokens. + use_tiling=False + # Grab the selected frames of the video as a tensor with shape + # fhwc: (num_frames, num_channels, height, width). 
+ video_fchw = sample.images[0].permute(0, 1, 2, 3) + selected_frames = torch.linspace( + 0, video_fchw.shape[0] - 1, self.args.num_frames).long() + video_fchw = video_fchw[selected_frames] + imgs = [] + for video_chw in video_fchw: + to_pil = ToPILImage() + video_chw = to_pil(video_chw) + imgs += get_visual_transform( + video_chw, self.img_h, self.img_w, use_tiling, self.args.max_num_tiles, + self.args.use_thumbnail, augment, self.args.vision_model_type) + num_tiles = [len(imgs)] + else: + imgs = num_tiles = [] + if self.is_packing_enabled: input_ids, target = self._truncate_for_packing(input_ids, target, num_tiles) + # Some final checks with respect to the number of image tokens and images on the tokenized + # conversation. There can still be errors, for instance if a non-video sample happens to + # have our pre-defined video token, or if the packing truncation removed a necessary image + # tag. + number_image_token = np.sum(input_ids == self.img_token_id) + error_msg = ( + f"Found {number_image_token} image tokens for len({num_tiles}) = {len(num_tiles)} image tiles in {conversation}.") + assert number_image_token == len(num_tiles), error_msg + error_msg = ( + f"Found sum({num_tiles}) = {np.sum(num_tiles)} tiles for {len(imgs)} images in {conversation}.") + assert np.sum(num_tiles) == len(imgs), error_msg + return ImageTaskSample( __key__=sample.__key__, __restore_key__=sample.__restore_key__, @@ -407,8 +478,8 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, if isinstance(sample, MultiChoiceVQASample): cur_prompt = format_multichoice_question(sample.context, sample.choices) - if "" not in cur_prompt: - cur_prompt = "\n" + cur_prompt + if IMAGE_TOKEN not in cur_prompt: + cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt cur_answer = format_multichoice_answer(sample.correct_choice_idx) elif isinstance(sample, VQASample): if 'docvqa' in sample.__key__: @@ -423,8 +494,8 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, cur_prompt = cur_prompt.format(sample.context) - if "" not in cur_prompt: - cur_prompt = "\n" + cur_prompt + if IMAGE_TOKEN not in cur_prompt: + cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt if isinstance(sample.answers, list): answer_list = sample.answers @@ -505,11 +576,11 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, prompt_list = self.manual_prompts["DocPretraining"]["raw"] prompt_idx = np.random.randint(len(prompt_list)) cur_prompt = prompt_list[prompt_idx] - if "" not in cur_prompt: - cur_prompt = "\n" + cur_prompt + if IMAGE_TOKEN not in cur_prompt: + cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt - # Make sure there is no extra tag. - sample.text = sample.text.replace("", "") + # Make sure there is no extra IMAGE_TOKEN tag. 
+ sample.text = sample.text.replace(IMAGE_TOKEN, "") caption = sample.text.strip() @@ -526,8 +597,8 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, ref = sample.text region = sample.words_boxes - # Make sure there is no extra tag - ref = ref.replace("", "") + # Make sure there is no extra IMAGE_TOKEN tag + ref = ref.replace(IMAGE_TOKEN, "") if len(region) == 4: region = f"({region[0]},{region[1]}),({region[2]},{region[3]})" @@ -550,8 +621,8 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, prompt_idx = np.random.randint(len(prompt_list)) cur_prompt = prompt_list[prompt_idx] cur_prompt = cur_prompt.format(prompt_content) - if "" not in cur_prompt: - cur_prompt = "\n" + cur_prompt + if IMAGE_TOKEN not in cur_prompt: + cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt return sample, cur_prompt, answer @@ -559,8 +630,8 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, """Format bbox coordinates as text.""" assert len(bbox) == 4 or len(bbox) == 8 - # Make sure there is no extra tag - text = text.replace("", "") + # Make sure there is no extra IMAGE_TOKEN tag + text = text.replace(IMAGE_TOKEN, "") if len(bbox) == 4: label_str = f"{text}({bbox[0]},{bbox[1]}),({bbox[2]},{bbox[3]})" @@ -582,8 +653,8 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, prompt_idx = np.random.randint(len(prompt_list)) cur_prompt = prompt_list[prompt_idx] - if "" not in cur_prompt: - cur_prompt = "\n" + cur_prompt + if IMAGE_TOKEN not in cur_prompt: + cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt cur_answer = answer return sample, cur_prompt, cur_answer diff --git a/examples/multimodal/evaluate_ai2d.py b/examples/multimodal/evaluation/evaluate_ai2d.py old mode 100755 new mode 100644 similarity index 72% rename from examples/multimodal/evaluate_ai2d.py rename to examples/multimodal/evaluation/evaluate_ai2d.py index 2d5db67b67d076e6d43a815997175325d5bb25ea..39b866ae4a030c2911a197fef6a1be0e19b0cfc4 --- a/examples/multimodal/evaluate_ai2d.py +++ b/examples/multimodal/evaluation/evaluate_ai2d.py @@ -9,19 +9,25 @@ def merge_input_files(input_path): """Merge input files to a format compatible with the evaluator.""" input_file_paths, output_file_path = get_input_output_paths(input_path, task="AI2D") - results = [] + results = dict() for input_file_path in input_file_paths: with open(input_file_path, "r") as input_file: for line in input_file: res = json.loads(line) - results.append( - { - "question_id": res["sample_id"], - "answer": res["answer"], - "gt_answer": res["gt_answer"], - } - ) + sample_id = res["sample_id"] + + # Ignore possible duplicates. 
+ if sample_id in results: + continue + + results[sample_id] = { + "question_id": sample_id, + "answer": res["answer"], + "gt_answer": res["gt_answer"], + } + + results = list(results.values()) with open(output_file_path, "w") as output_file: json.dump(results, output_file) diff --git a/examples/multimodal/evaluate_chartqa.py b/examples/multimodal/evaluation/evaluate_chartqa.py old mode 100755 new mode 100644 similarity index 77% rename from examples/multimodal/evaluate_chartqa.py rename to examples/multimodal/evaluation/evaluate_chartqa.py index e9238069d463a038c0e1b52e571e930c01b24b6a..53d4944f46e364b4cb68f8ef22dabccbf66ef3ca --- a/examples/multimodal/evaluate_chartqa.py +++ b/examples/multimodal/evaluation/evaluate_chartqa.py @@ -9,15 +9,22 @@ def merge_input_files(input_path): """Merge input files to a format compatible with the evaluator.""" input_file_paths, output_file_path = get_input_output_paths(input_path, task="ChartQA") - results = [] + results = dict() for input_file_path in input_file_paths: with open(input_file_path, "r") as input_file: for line in input_file: res = json.loads(line) - res["question_id"] = res["sample_id"] + sample_id = res["sample_id"] - results.append(res) + # Ignore possible duplicates. + if sample_id in results: + continue + + res["question_id"] = sample_id + results[sample_id] = res + + results = list(results.values()) with open(output_file_path, "w") as output_file: json.dump(results, output_file) diff --git a/examples/multimodal/evaluate_coco.py b/examples/multimodal/evaluation/evaluate_coco.py old mode 100755 new mode 100644 similarity index 77% rename from examples/multimodal/evaluate_coco.py rename to examples/multimodal/evaluation/evaluate_coco.py index a717090c9274781f7aabd0f5cfbc3b8b032fc689..8eeb367e8f3bb0c38bd3b0f44b8f54f0c7d32636 --- a/examples/multimodal/evaluate_coco.py +++ b/examples/multimodal/evaluation/evaluate_coco.py @@ -11,20 +11,28 @@ def convert_to_coco_format(input_path): """Convert input files to COCO compatible format.""" input_file_paths, output_file_path = get_input_output_paths(input_path, task="captioning") - captions = [] + results = dict() for input_file_path in input_file_paths: with open(input_file_path, "r") as input_file: for line in input_file: res = json.loads(line) + sample_id = res["sample_id"] - question_id = res['sample_id'] - caption = res['caption'].rstrip('.').lower() + # Ignore possible duplicates. 
+ if sample_id in results: + continue - captions.append({"image_id": question_id, "caption": caption}) + caption = res["caption"].rstrip(".").lower() + results[sample_id] = { + "image_id": sample_id, + "caption": caption, + } + + results = list(results.values()) with open(output_file_path, "w") as output_file: - json.dump(captions, output_file, indent=4) + json.dump(results, output_file, indent=4) return output_file_path diff --git a/examples/multimodal/evaluate_mathvista.py b/examples/multimodal/evaluation/evaluate_mathvista.py old mode 100755 new mode 100644 similarity index 92% rename from examples/multimodal/evaluate_mathvista.py rename to examples/multimodal/evaluation/evaluate_mathvista.py index 3474c5f25e9e750ba4f77238b82ef8aaa4d7193b..a55f312f21986fb46644eb4e36979c342a2b7411 --- a/examples/multimodal/evaluate_mathvista.py +++ b/examples/multimodal/evaluation/evaluate_mathvista.py @@ -11,13 +11,21 @@ def merge_input_files(input_path): """Merge input files to a format compatible with the evaluator.""" input_file_paths, output_file_path = get_input_output_paths(input_path, task="MathVista") - results = [] + results = dict() for input_file_path in input_file_paths: with open(input_file_path, "r") as input_file: for line in input_file: res = json.loads(line) - results.append(res) + sample_id = res["sample_id"] + + # Remove possible duplicates. + if sample_id in results: + continue + + results[sample_id] = res + + results = list(results.values()) with open(output_file_path, "w") as output_file: json.dump(results, output_file) diff --git a/examples/multimodal/evaluate_mmmu.py b/examples/multimodal/evaluation/evaluate_mmmu.py old mode 100755 new mode 100644 similarity index 91% rename from examples/multimodal/evaluate_mmmu.py rename to examples/multimodal/evaluation/evaluate_mmmu.py index 66118fa905d69df3a1d2a07e9baa6236dd11d823..798c42bfa76009653927aa4f1339411807fb905f --- a/examples/multimodal/evaluate_mmmu.py +++ b/examples/multimodal/evaluation/evaluate_mmmu.py @@ -2,9 +2,15 @@ import argparse import glob import json import os +import sys import re import subprocess +# Get the absolute path of the parent directory +parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) +# Add the parent directory to sys.path +sys.path.insert(0, parent_dir) + from run_text_generation import get_output_path from config import EvaluationConfig @@ -48,6 +54,10 @@ def convert_to_mmmu_format(input_path): ) # MMMU eval script expects just a sample_id to prediction mapping. + # Skip possible duplicates. 
+ if sample_id in output: + continue + output[sample_id] = prediction with open(output_file_path, "w") as output_file: diff --git a/examples/multimodal/evaluate_ocrbench.py b/examples/multimodal/evaluation/evaluate_ocrbench.py old mode 100755 new mode 100644 similarity index 95% rename from examples/multimodal/evaluate_ocrbench.py rename to examples/multimodal/evaluation/evaluate_ocrbench.py index bc2b901065f53255a0cf4cabaa4893122d579566..b37473a67dbaeef121e734340a6161358ac0203b --- a/examples/multimodal/evaluate_ocrbench.py +++ b/examples/multimodal/evaluation/evaluate_ocrbench.py @@ -8,13 +8,21 @@ def merge_input_files(input_path): """Merge input files to a format compatible with the evaluator.""" input_file_paths, output_file_path = get_input_output_paths(input_path, task="OCRBench") - results = [] + results = dict() for input_file_path in input_file_paths: with open(input_file_path, "r") as input_file: for line in input_file: res = json.loads(line) - results.append(res) + sample_id = res["sample_id"] + + # Remove possible duplicates. + if sample_id in results: + continue + + results[sample_id] = res + + results = list(results.values()) with open(output_file_path, "w") as output_file: json.dump(results, output_file) diff --git a/examples/multimodal/evaluate_textvqa.py b/examples/multimodal/evaluation/evaluate_textvqa.py old mode 100755 new mode 100644 similarity index 72% rename from examples/multimodal/evaluate_textvqa.py rename to examples/multimodal/evaluation/evaluate_textvqa.py index c9bba7134ba9f7e3a925dbcc529ec97da60fac92..af782bdf0318b664e37d9a106e36e66e5f5ad63c --- a/examples/multimodal/evaluate_textvqa.py +++ b/examples/multimodal/evaluation/evaluate_textvqa.py @@ -9,22 +9,25 @@ def merge_input_files(input_path): """Merge input files to a format compatible with the evaluator.""" input_file_paths, output_file_path = get_input_output_paths(input_path, task="TextVQA") - results = [] + results = dict() for input_file_path in input_file_paths: with open(input_file_path, "r") as input_file: for line in input_file: res = json.loads(line) - results.append( - { - "question_id": res["sample_id"], - "answer": res["answer"], - "gt_answer": res["gt_answer"], - } - ) - - # Make order deterministic. - # results = sorted(results, key=lambda d: d["question_id"]) + sample_id = res["sample_id"] + + # Remove possible duplicates. 
+ if sample_id in results: + continue + + results[sample_id] = { + "question_id": sample_id, + "answer": res["answer"], + "gt_answer": res["gt_answer"], + } + + results = list(results.values()) with open(output_file_path, "w") as output_file: json.dump(results, output_file) diff --git a/examples/multimodal/evaluate_vqav2.py b/examples/multimodal/evaluation/evaluate_vqav2.py old mode 100755 new mode 100644 similarity index 88% rename from examples/multimodal/evaluate_vqav2.py rename to examples/multimodal/evaluation/evaluate_vqav2.py index 0b1b9209bef3bfb5bd644ed28d5464c951965654..7807d80723f5aa67c7fcadd695e78643fd52cb6d --- a/examples/multimodal/evaluate_vqav2.py +++ b/examples/multimodal/evaluation/evaluate_vqav2.py @@ -9,15 +9,22 @@ def merge_input_files(input_path): """Merge input files to a format compatible with the evaluator.""" input_file_paths, output_file_path = get_input_output_paths(input_path, task="VQAv2") - results = [] + results = dict() for input_file_path in input_file_paths: with open(input_file_path, "r") as input_file: for line in input_file: res = json.loads(line) - res["question_id"] = res["sample_id"] + sample_id = res["sample_id"] - results.append(res) + # Skip possible duplicates. + if sample_id in results: + continue + + res["question_id"] = sample_id + results[sample_id] = res + + results = list(results.values()) with open(output_file_path, "w") as output_file: json.dump(results, output_file) @@ -57,6 +64,9 @@ def compute_vqa_accuracy(result_file, task): assert len(gt) == 1, "expected exactly one groundtruth answer." gt = gt[0] + pred = pred.rstrip("%") + gt = gt.rstrip("%") + if is_number(pred) and is_number(gt): pred = float(pred) gt = float(gt) diff --git a/examples/multimodal/evaluation_datasets.py b/examples/multimodal/evaluation/evaluation_datasets.py old mode 100755 new mode 100644 similarity index 88% rename from examples/multimodal/evaluation_datasets.py rename to examples/multimodal/evaluation/evaluation_datasets.py index 97f9ba926f1435960444626c3af41496d1bea837..50a50d56871bddd9de59c3b1444186c749892db8 --- a/examples/multimodal/evaluation_datasets.py +++ b/examples/multimodal/evaluation/evaluation_datasets.py @@ -188,7 +188,7 @@ class MMMUDataset(torch.utils.data.Dataset): use_tiling, max_num_tiles, use_thumbnail, - single_image, + prompt_style, vision_model_type, ): import datasets @@ -246,7 +246,7 @@ class MMMUDataset(torch.utils.data.Dataset): self._use_tiling = use_tiling self._max_num_tiles = max_num_tiles self._use_thumbnail = use_thumbnail - self._single_image = single_image + self._prompt_style = prompt_style self._vision_model_type = vision_model_type def __len__(self): @@ -258,7 +258,7 @@ class MMMUDataset(torch.utils.data.Dataset): sample = self._dataset[idx] # Use the single image approach from the MMMU repo. - if self._single_image: + if self._prompt_style == "single_image": sample = process_single_sample(sample) sample = construct_prompt(sample, self._config) @@ -274,7 +274,69 @@ class MMMUDataset(torch.utils.data.Dataset): vision_model_type=self._vision_model_type, ) sample_num_tiles = [len(sample_imgs)] - else: + + prompt = sample["final_input_prompt"] + for i in range(8): + prompt = prompt.replace(f"<image {i+1}>", "") + sample["final_input_prompt"] = f"<image>\n{prompt}" + elif self._prompt_style == "vlmevalkit": + sample = construct_prompt(sample, self._config) + + if sample["question_type"] == "multiple-choice": + question = sample["question"] + + options = "" + for k, v in sample["index2ans"].items(): + options += f"{k}. {v}\n" + + final_prompt = f"{question}\n" + if "hint" in sample: + final_prompt += f"Hint: {sample['hint']}\n" + + if "task_instructions" in sample: + final_prompt += f"Task instructions: {sample['task_instructions']}\n" + + final_prompt += options + final_prompt += "Answer with the option's letter from the given choices directly." + + sample["final_input_prompt"] = final_prompt.rstrip() + else: + question = sample["question"] + final_prompt = f"{question}\n" + final_prompt += "Answer the question directly." + sample["final_input_prompt"] = final_prompt.rstrip() + + sample_imgs = [] + sample_num_tiles = [] + + img_indices = sorted(list(set(re.findall(r"<image (\d+)>", sample["final_input_prompt"])))) + # Use fewer tiles per image when the prompt references several images. + adjusted_max_num_tiles = max(1, self._max_num_tiles // max(len(img_indices), 1)) + + for img_idx in img_indices: + img_key = f"image_{img_idx}" + img_str = f"<image {img_idx}>" + + img = sample[img_key] + assert img is not None, f"{img_str} is in prompt but not in sample images" + + imgs = get_visual_transform( + img, + self._img_h, + self._img_w, + self._use_tiling, + adjusted_max_num_tiles, + self._use_thumbnail, + augment=False, + vision_model_type=self._vision_model_type, + ) # List of tiles. + + sample_imgs.extend(imgs) + sample_num_tiles.append(len(imgs)) + + sample["final_input_prompt"] = " ".join([f'<image>' for i in range(len(img_indices))]) + "\n" + sample["final_input_prompt"] + elif self._prompt_style == "multi_image": sample = construct_prompt(sample, self._config) sample_imgs = [] @@ -315,6 +377,8 @@ class MMMUDataset(torch.utils.data.Dataset): assert ( f"<image {i}>" not in sample["final_input_prompt"] ), "prompt contains unhandled image tags" + else: + raise ValueError(f"unknown prompt style {self._prompt_style}") # MMMU specific metadata. metadata = {"question_type": sample["question_type"]} @@ -323,10 +387,6 @@ class MMMUDataset(torch.utils.data.Dataset): metadata["all_choices"] = sample["all_choices"] prompt = sample['final_input_prompt'] - if self._single_image: - for i in range(8): - prompt = prompt.replace(f"<image {i+1}>", "") - prompt = f"<image>\n{prompt}" tile_count = torch.tensor(sample_num_tiles, dtype=torch.int) @@ -780,8 +840,10 @@ def get_evaluation_dataset( vision_model_type, ) elif task == 'MMMU': - # Note: single_image=True uses only one image like in the MMMU repo example. - # single_image=False uses all images in the sample. + # Note: + # - prompt_style="single_image" uses only one image like in the MMMU repo example. + # - prompt_style="multi_image" uses multiple input images.
+ # - prompt_style="vlmevalkit" is similar to https://github.com/open-compass/VLMEvalKit/blob/5d3cebcf18ef4bfbadc3bd3ef80bdc7aad2c6557/vlmeval/vlm/internvl_chat.py#L499 dataset = MMMUDataset( input_image_path, num_samples_per_partition, @@ -792,7 +854,7 @@ def get_evaluation_dataset( use_tiling, max_num_tiles, use_thumbnail, - single_image=True, + prompt_style="single_image", vision_model_type=vision_model_type, ) elif task == "VideoMME": diff --git a/examples/multimodal/image_processing.py b/examples/multimodal/image_processing.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/layer_specs.py b/examples/multimodal/layer_specs.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/manual_prompts.json b/examples/multimodal/manual_prompts.json old mode 100755 new mode 100644 diff --git a/examples/multimodal/model.py b/examples/multimodal/model.py old mode 100755 new mode 100644 index 6db834e97a1d643955cf12905eb3ed84f0541a08..a28a428325b8db9c7c1268080979889935dcc396 --- a/examples/multimodal/model.py +++ b/examples/multimodal/model.py @@ -136,6 +136,20 @@ def model_provider( else: vision_projection_layer_spec = get_mlp_module_spec(use_te=use_te).submodules + # Toggle --recompute* for the vision and language model separately. + if args.recompute_vision: + if vision_config.recompute_method is not None and vision_config.recompute_granularity is not None: + vision_config.recompute_num_layers = vision_config.num_layers + else: + vision_config.recompute_granularity = None + vision_config.recompute_method = None + vision_config.recompute_num_layers = None + + vision_projection_config.recompute_granularity = None + vision_projection_config.recompute_method = None + vision_projection_config.recompute_num_layers = None + + tokenizer = get_tokenizer() image_token_index = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN) diff --git a/examples/multimodal/model_converter/clip_converter.py b/examples/multimodal/model_converter/clip_converter.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/model_converter/internvit_converter.py b/examples/multimodal/model_converter/internvit_converter.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/model_converter/siglip_converter.py b/examples/multimodal/model_converter/siglip_converter.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/model_converter/vision_model_tester.py b/examples/multimodal/model_converter/vision_model_tester.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/multimodal_args.py b/examples/multimodal/multimodal_args.py old mode 100755 new mode 100644 index 4b2be450afb33ce985ea052c55ad01abc5a3c548..eb56118e71613ea7fae6f81ff44f2969f26b4533 --- a/examples/multimodal/multimodal_args.py +++ b/examples/multimodal/multimodal_args.py @@ -49,7 +49,7 @@ def add_multimodal_extra_args(parser): group.add_argument( "--tokenizer-prompt-format", type=str, - choices=["mistral", "llama3", "chatml", "nvlm-yi-34b", "qwen2p0"], + choices=["mistral", "llama3", "chatml", "nvlm-yi-34b", "qwen2p0", "qwen2p5"], required=True, help="Prompt format to use with the tokenizer.", ) @@ -71,5 +71,9 @@ def add_multimodal_extra_args(parser): group.add_argument( "--packing-seq-length", type=int, default=0, help="Packing sequence length. Must be > 0 if using packing." 
) + group.add_argument( + "--recompute-vision", action="store_true", default=False, help="Enable activation checkpointing in the vision model" + ) + return parser diff --git a/examples/multimodal/nvlm/README.md b/examples/multimodal/nvlm/README.md old mode 100755 new mode 100644 index 7eddbb7efa9162edb02e118ce7bb5d95151ca944..bb576bb40355a02fbe2701fdaf85d6ee9a8058e3 --- a/examples/multimodal/nvlm/README.md +++ b/examples/multimodal/nvlm/README.md @@ -5,6 +5,13 @@ Please refer to the [NVLM paper](https://arxiv.org/pdf/2409.11402) for details. *NOTE: VLMs in Megatron are under active development and are expected to change.* +# Checkpoints + +NVLM 1.0 model weights are publicly available in HuggingFace and Megatron format. + +- NVLM-1.0-D 72B [HuggingFace version](https://huggingface.co/nvidia/NVLM-D-72B) +- NVLM-1.0-D 72B [Megatron-Core version](https://huggingface.co/nvidia/NVLM-D-72B-mcore) + # Setup ## Docker image @@ -32,7 +39,7 @@ NVLM 1.0 34B starts from [NousResearch/Nous-Hermes-2-Yi-34B](https://huggingface Please download it and run the following command to convert it to Megatron format. ``` python tools/checkpoint/convert.py --bf16 --model-type GPT --loader llama_mistral --saver mcore --target-tensor-parallel-size 8 --checkpoint-type hf \ - --load-dir --save-dir --tokenizer-model \ + --load-dir --save-dir --tokenizer-model \ --saver-transformer-impl transformer_engine --model-size yi-34B --make-vocab-size-divisible-by 1 ``` @@ -42,7 +49,7 @@ NVLM 1.0 72B starts from [Qwen/Qwen2-72B-Instruct](https://huggingface.co/Qwen/Q Please download it and run the following command to convert it to Megatron format. ``` python tools/checkpoint/convert.py --bf16 --model-type GPT --loader llama_mistral --saver mcore --target-tensor-parallel-size 8 --checkpoint-type hf \ - --load-dir --save-dir --tokenizer-model \ + --load-dir --save-dir --tokenizer-model \ --saver-transformer-impl transformer_engine --model-size qwen2.5-72Bf ``` diff --git a/examples/multimodal/nvlm/internvit.py b/examples/multimodal/nvlm/internvit.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/nvlm/nvlm_prompts.json b/examples/multimodal/nvlm/nvlm_prompts.json old mode 100755 new mode 100644 diff --git a/examples/multimodal/nvlm/pp_checkpoint_converter.py b/examples/multimodal/nvlm/pp_checkpoint_converter.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/nvlm/pretrain_blend.yaml b/examples/multimodal/nvlm/pretrain_blend.yaml old mode 100755 new mode 100644 diff --git a/examples/multimodal/nvlm/pretrain_qwen20_72b_internvit_6b.sh b/examples/multimodal/nvlm/pretrain_qwen20_72b_internvit_6b.sh old mode 100755 new mode 100644 index 320c7ad3f517a10db6556ca28b363059d3a04f6b..008a17ac43d936c79c5cd655e57509fe2abf8ec9 --- a/examples/multimodal/nvlm/pretrain_qwen20_72b_internvit_6b.sh +++ b/examples/multimodal/nvlm/pretrain_qwen20_72b_internvit_6b.sh @@ -62,7 +62,7 @@ OPTIONS=" \ --exit-duration-in-mins 230 \ --disable-bias-linear \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model ${WORKSPACE}/ \ + --tokenizer-model Qwen/Qwen2-72B-Instruct \ --tokenizer-prompt-format qwen2p0 \ --transformer-impl transformer_engine \ --normalization RMSNorm \ diff --git a/examples/multimodal/nvlm/pretrain_yi_34b_internvit_6b.sh b/examples/multimodal/nvlm/pretrain_yi_34b_internvit_6b.sh old mode 100755 new mode 100644 index c36cb05990cf36ddb2c952630eaf9d55afc76f28..00f94352774518b1c8dc478c98808a16a3398b75 --- a/examples/multimodal/nvlm/pretrain_yi_34b_internvit_6b.sh +++ 
b/examples/multimodal/nvlm/pretrain_yi_34b_internvit_6b.sh @@ -75,7 +75,7 @@ OPTIONS=" \ --decoder-seq-length ${DECODER_SEQ_LEN} \ --max-position-embeddings ${MAX_POS_EMBED} \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model ${WORKSPACE}/ \ + --tokenizer-model NousResearch/Nous-Hermes-2-Yi-34B \ --tokenizer-prompt-format nvlm-yi-34b \ --vocab-size 64000 \ --make-vocab-size-divisible-by 1 \ diff --git a/examples/multimodal/nvlm/run_text_generation_qwen20_72b_internvit_6b.sh b/examples/multimodal/nvlm/run_text_generation_qwen20_72b_internvit_6b.sh old mode 100755 new mode 100644 index 35cd90409a98948a9e7287a92431cac9614f4e95..e3b001c7aaee4544fde590ee41a8ae0d01497d36 --- a/examples/multimodal/nvlm/run_text_generation_qwen20_72b_internvit_6b.sh +++ b/examples/multimodal/nvlm/run_text_generation_qwen20_72b_internvit_6b.sh @@ -97,7 +97,7 @@ do --decoder-seq-length ${DECODER_SEQ_LEN} \ --max-position-embeddings ${MAX_POS_EMBED} \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model \ + --tokenizer-model Qwen/Qwen2-72B-Instruct \ --tokenizer-prompt-format qwen2p0 \ --position-embedding-type rope \ --rotary-percent 1.0 \ diff --git a/examples/multimodal/nvlm/run_text_generation_qwen25_7b_siglip.sh b/examples/multimodal/nvlm/run_text_generation_qwen25_7b_siglip.sh new file mode 100644 index 0000000000000000000000000000000000000000..3b6221996c8294790b946f3c453d01eb71b692e7 --- /dev/null +++ b/examples/multimodal/nvlm/run_text_generation_qwen25_7b_siglip.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +export NCCL_IB_SL=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NVTE_APPLY_QK_LAYER_SCALING=0 +export TOKENIZERS_PARALLELISM="false" + +INPUT_IMAGE_PATH="placeholder" +GROUNDTRUTH_PATH="placeholder" + +while [[ $# -gt 0 ]]; do + case $1 in + -i|--input-image-path) + INPUT_IMAGE_PATH="$2" + shift + shift + ;; + -o|--output-path) + OUTPUT_PATH="$2" + shift + shift + ;; + -m|--model-path) + MODEL_PATH="$2" + shift + shift + ;; + -t|--task) + TASK="$2" + shift + shift + ;; + -g|--gt-path) + GROUNDTRUTH_PATH="$2" + shift + shift + ;; + -*|--*) + echo "Invalid option $1" + exit 1 + ;; + esac +done + +# Please modify these as needed. 
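+# NUM_PARTITIONS, START and END control optional sharding of the generation workload: the loop
+# below runs once per PARTITION_ID in {START..END} and forwards --num-partitions/--partition-id
+# to run_text_generation.py (with the 0/0/0 defaults the loop runs a single time).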
+NUM_PARTITIONS=0 +START=0 +END=0 + + +SEQ_LEN=256 +DECODER_SEQ_LEN=8192 +EXTRA_ARGS=" --pixel-shuffle --use-tiling --max-num-tiles 12 --use-thumbnail" + +for PARTITION_ID in $( eval echo {$START..$END} ) +do + torchrun --nproc_per_node 8 examples/multimodal/run_text_generation.py \ + --attention-softmax-in-fp32 \ + --transformer-impl transformer_engine \ + --use-te \ + --use-checkpoint-args \ + --normalization RMSNorm \ + --norm-epsilon 1e-06 \ + --language-model-type=qwen2.5_7B \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --position-embedding-type rope \ + --rotary-percent 1.0 \ + --rotary-base 1000000 \ + --swiglu \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --tensor-model-parallel-size 4 \ + --pipeline-model-parallel-size 1 \ + --group-query-attention \ + --num-query-groups 4 \ + --num-layers 28 \ + --hidden-size 3584 \ + --ffn-hidden-size 18944 \ + --add-qkv-bias \ + --num-attention-heads 28 \ + --max-position-embeddings 32768 \ + --no-masked-softmax-fusion \ + --load ${MODEL_PATH} \ + --tokenizer-type MultimodalTokenizer \ + --tokenizer-model Qwen/Qwen2.5-7B-Instruct \ + --tokenizer-prompt-format qwen2p5 \ + --bf16 \ + --micro-batch-size 1 \ + --seq-length ${SEQ_LEN} \ + --decoder-seq-length ${DECODER_SEQ_LEN} \ + --out-seq-length 128 \ + --temperature 1.0 \ + --img-h 448 \ + --img-w 448 \ + --patch-dim 14 \ + --seed 153 \ + --top_k 1 \ + --no-load-rng \ + --no-load-optim \ + --input-image-path ${INPUT_IMAGE_PATH} \ + --num-partitions ${NUM_PARTITIONS} \ + --partition-id ${PARTITION_ID} \ + --output-path ${OUTPUT_PATH} \ + --gt-path ${GROUNDTRUTH_PATH} \ + --task ${TASK} \ + ${EXTRA_ARGS} \ + --special-tokens "" "" "" \ + --vision-model-type siglip \ + --ckpt-format torch +done diff --git a/examples/multimodal/nvlm/run_text_generation_yi_34b_internvit_6b.sh b/examples/multimodal/nvlm/run_text_generation_yi_34b_internvit_6b.sh old mode 100755 new mode 100644 index 0437e4c16d68378a39b24aa9e7d08cc05b815e5b..341f4e4b0a79b3f212996672c0aee0e1c85f4ef3 --- a/examples/multimodal/nvlm/run_text_generation_yi_34b_internvit_6b.sh +++ b/examples/multimodal/nvlm/run_text_generation_yi_34b_internvit_6b.sh @@ -95,7 +95,7 @@ do --decoder-seq-length ${DECODER_SEQ_LEN} \ --max-position-embeddings ${MAX_POS_EMBED} \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model \ + --tokenizer-model NousResearch/Nous-Hermes-2-Yi-34B \ --tokenizer-prompt-format nvlm-yi-34b \ --vocab-size 64000 \ --make-vocab-size-divisible-by 1 \ @@ -135,6 +135,6 @@ do --gt-path ${GROUNDTRUTH_PATH} \ ${EXTRA_ARGS} \ --task ${TASK} \ - --image-tag-type nlvm \ + --image-tag-type nvlm \ --ckpt-format torch done diff --git a/examples/multimodal/nvlm/sft_34b_internvit.sh b/examples/multimodal/nvlm/sft_34b_internvit.sh old mode 100755 new mode 100644 index 3d585d8d37233a2322ba169f9b6bd86006d35c73..0dff9461dae1f38255093afc893ad1110bc5ad6b --- a/examples/multimodal/nvlm/sft_34b_internvit.sh +++ b/examples/multimodal/nvlm/sft_34b_internvit.sh @@ -80,7 +80,7 @@ OPTIONS=" \ --decoder-seq-length ${DECODER_SEQ_LEN} \ --max-position-embeddings ${MAX_POS_EMBED} \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model ${WORKSPACE}/ \ + --tokenizer-model NousResearch/Nous-Hermes-2-Yi-34B \ --tokenizer-prompt-format nvlm-yi-34b \ --vocab-size 64000 \ --make-vocab-size-divisible-by 1 \ diff --git a/examples/multimodal/nvlm/sft_blend.yaml b/examples/multimodal/nvlm/sft_blend.yaml old mode 100755 new mode 100644 diff --git a/examples/multimodal/nvlm/sft_qwen20_72b_internvit_6b.sh 
b/examples/multimodal/nvlm/sft_qwen20_72b_internvit_6b.sh old mode 100755 new mode 100644 index adb1d1b14c34e7e2774ad8a60cdd6ca5e47f103f..3b472259b94cb8ebe1e29a4695f594247af113a9 --- a/examples/multimodal/nvlm/sft_qwen20_72b_internvit_6b.sh +++ b/examples/multimodal/nvlm/sft_qwen20_72b_internvit_6b.sh @@ -67,7 +67,7 @@ OPTIONS=" \ --exit-duration-in-mins 230 \ --disable-bias-linear \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model ${WORKSPACE}/ \ + --tokenizer-model Qwen/Qwen2-72B-Instruct \ --tokenizer-prompt-format qwen2p0 \ --transformer-impl transformer_engine \ --normalization RMSNorm \ diff --git a/examples/multimodal/pretrain_dataset.yaml b/examples/multimodal/pretrain_dataset.yaml old mode 100755 new mode 100644 diff --git a/examples/multimodal/pretrain_mistral_clip.sh b/examples/multimodal/pretrain_mistral_clip.sh old mode 100755 new mode 100644 index ea1f741aed91493f192e82f78279497c8cf4d535..90b0053d19fd3d556d336093afc3414425eb8664 --- a/examples/multimodal/pretrain_mistral_clip.sh +++ b/examples/multimodal/pretrain_mistral_clip.sh @@ -24,11 +24,6 @@ if [[ -z $LOAD_NAME ]]; then exit 1 fi -if [[ -z $TOKENIZER_MODEL ]]; then - echo "Please set TOKENIZER_MODEL for tokenizer model name." - exit 1 -fi - CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints" DATA_TRAIN="${SOURCE}/examples/multimodal/pretrain_dataset.yaml" @@ -93,7 +88,7 @@ OPTIONS=" \ --eval-iters 10 \ --eval-interval 1000 \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model ${WORKSPACE}/${TOKENIZER_MODEL} \ + --tokenizer-model mistralai/Mistral-7B-Instruct-v0.3 \ --tokenizer-prompt-format mistral \ --data-path ${DATA_TRAIN} \ --prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \ diff --git a/examples/multimodal/run_text_generation.py b/examples/multimodal/run_text_generation.py old mode 100755 new mode 100644 index f4bb5025ff780d0599a9be6fc6a82221f700cbfd..cbde6680cc26bb9b3b7d4592d466906452064e6d --- a/examples/multimodal/run_text_generation.py +++ b/examples/multimodal/run_text_generation.py @@ -14,11 +14,13 @@ sys.path.append( import torch import yaml from config import EvaluationConfig -from evaluation_datasets import get_evaluation_dataset +from evaluation.evaluation_datasets import get_evaluation_dataset from model import model_provider from multimodal_args import add_multimodal_extra_args from megatron.core import parallel_state +from megatron.core.enums import ModelType +from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings from megatron.inference.text_generation.api import generate_and_post_process from megatron.inference.text_generation.forward_step import ForwardStep @@ -36,7 +38,7 @@ def add_text_generation_args(parser): group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.') group.add_argument("--top_k", type=int, default=0, help='Top k sampling.') group.add_argument( - "--out-seq-length", type=int, default=1024, help='Length of the output generated text.' + "--out-seq-length", type=int, default=128, help='Length of the output generated text.' 
) group.add_argument("--output-path", type=str, help='Output file path') group.add_argument('--input-image-path', type=str, help="Input image directory") @@ -206,8 +208,8 @@ def generate_samples(model, config: EvaluationConfig, print_output): if config.task == "VideoMME": output["questions"][0][output_name] = generated else: - output[output_name] = generated output["prompt"] = prompt + output[output_name] = generated if config.task == "captioning": output["ground_truth"] = answers @@ -354,7 +356,7 @@ class VLMForwardStep(ForwardStep): ) def __call__(self, tokens, position_ids, attention_mask): - num_image_tokens = (tokens == self.model.image_token_index).sum().item() + num_image_tokens = (tokens == self.model.module.image_token_index).sum().item() num_tokens = tokens.size(1) recv_buffer_seq_length = None if num_image_tokens > 0: @@ -406,7 +408,7 @@ def get_conversation(task, question): {"role": "system", "content": "Answer the questions."}, { "role": "user", - "content": "\nProvide a one-sentence caption for provided image.", + "content": f"{IMAGE_TOKEN}\nProvide a one-sentence caption for provided image.", }, ] elif task in ("TextVQA", "VQAv2", "ChartQA"): @@ -414,13 +416,13 @@ def get_conversation(task, question): {"role": "system", "content": "Answer the questions."}, { "role": "user", - "content": f"\n{question}\nAnswer the question using a single word or phrase.", + "content": f"{IMAGE_TOKEN}\n{question}\nAnswer the question using a single word or phrase.", }, ] elif task in ("OCRBench", "MathVista", "AI2D"): conversation = [ {"role": "system", "content": "Answer the questions."}, - {"role": "user", "content": f"\n{question}"}, + {"role": "user", "content": f"{IMAGE_TOKEN}\n{question}"}, ] elif task == "MMMU": conversation = [ @@ -441,7 +443,7 @@ def get_conversation(task, question): conversation = [ {"role": "system", "content": "Answer the questions."}, - {"role": "user", "content": f"\n{question}"}, + {"role": "user", "content": f"{IMAGE_TOKEN}\n{question}"}, ] return conversation @@ -464,11 +466,13 @@ def get_prompt_and_generated(prompt_and_generation, prompt_format): prompt = splitted[0] generated = splitted[1] generated = generated.split("<|im_end|>")[0] - elif prompt_format in ("nvlm-yi-34b", "qwen2p0"): + elif prompt_format in ("nvlm-yi-34b", "qwen2p0", "qwen2p5"): splitted = prompt_and_generation.split("<|im_start|>assistant\n") prompt = splitted[0] generated = splitted[1] generated = generated.split("<|im_end|>")[0] + else: + raise ValueError(f"Prompt format {prompt_format} is not supported.") # Remove possible garbage. generated = generated.strip() @@ -489,11 +493,11 @@ def main(): args = get_args() - def wrapped_model_provider(pre_process, post_process): - return model_provider(pre_process, post_process, parallel_output=False) + def wrapped_model_provider(pre_process, post_process, add_encoder, add_decoder): + return model_provider(pre_process, post_process, add_encoder, add_decoder, parallel_output=False) # Set up model and load checkpoint. 
- model = get_model(wrapped_model_provider, wrap_with_ddp=False) + model = get_model(wrapped_model_provider, model_type=ModelType.encoder_and_decoder, wrap_with_ddp=False) if args.load is not None: _ = load_checkpoint(model, None, None) diff --git a/examples/multimodal/sft_dataset.yaml b/examples/multimodal/sft_dataset.yaml old mode 100755 new mode 100644 diff --git a/examples/multimodal/sft_mistral_clip.sh b/examples/multimodal/sft_mistral_clip.sh old mode 100755 new mode 100644 index 8a083cc1f2e81e3c8340957d1d9fd1636bf117f3..94ff208eb4df632b597daa49bc3a1fbff62fe8d1 --- a/examples/multimodal/sft_mistral_clip.sh +++ b/examples/multimodal/sft_mistral_clip.sh @@ -29,11 +29,6 @@ if [[ -z $LOAD_ITER ]]; then exit 1 fi -if [[ -z $TOKENIZER_MODEL ]]; then - echo "Please set TOKENIZER_MODEL for tokenizer model name." - exit 1 -fi - CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints" DATA_TRAIN="${SOURCE}/examples/multimodal/sft_dataset.yaml" @@ -98,7 +93,7 @@ OPTIONS=" \ --eval-iters 10 \ --eval-interval 500 \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model ${WORKSPACE}/${TOKENIZER_MODEL} \ + --tokenizer-model mistralai/Mistral-7B-Instruct-v0.3 \ --tokenizer-prompt-format mistral \ --data-path ${DATA_TRAIN} \ --prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \ diff --git a/examples/multimodal/text_generation_mistral_clip.sh b/examples/multimodal/text_generation_mistral_clip.sh old mode 100755 new mode 100644 index ca98ff277a3729646a63f6de0958f323ed8e2276..c1ef7bcee897812fef976531e2a5bba961141b42 --- a/examples/multimodal/text_generation_mistral_clip.sh +++ b/examples/multimodal/text_generation_mistral_clip.sh @@ -4,12 +4,13 @@ export NCCL_IB_SL=1 export CUDA_DEVICE_MAX_CONNECTIONS=1 export NVTE_APPLY_QK_LAYER_SCALING=0 +INPUT_IMAGE_PATH="placeholder" GROUNDTRUTH_PATH="placeholder" NUM_FRAMES=1 while [[ $# -gt 0 ]]; do case $1 in - --input-image-path) + -i|--input-image-path) INPUT_IMAGE_PATH="$2" shift shift @@ -19,11 +20,6 @@ while [[ $# -gt 0 ]]; do shift shift ;; - -g|--groundtruth-path) - GROUNDTRUTH_PATH="$2" - shift - shift - ;; -o|--output-path) OUTPUT_PATH="$2" shift @@ -34,12 +30,7 @@ while [[ $# -gt 0 ]]; do shift shift ;; - -t|--tokenizer-path) - TOKENIZER_PATH="$2" - shift - shift - ;; - --task) + -t|--task) TASK="$2" shift shift @@ -92,7 +83,7 @@ do --no-masked-softmax-fusion \ --load ${MODEL_PATH} \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model ${TOKENIZER_PATH} \ + --tokenizer-model mistralai/Mistral-7B-Instruct-v0.3 \ --tokenizer-prompt-format mistral \ --bf16 \ --micro-batch-size 1 \ diff --git a/examples/multimodal/train.py b/examples/multimodal/train.py old mode 100755 new mode 100644 index 5ff2121b3d04c1a0f4f0733aac6526a65956c66d..1dc68d1173bfee00dd77d971c2b150b024acf421 --- a/examples/multimodal/train.py +++ b/examples/multimodal/train.py @@ -48,7 +48,7 @@ def get_batch(data_iterator): pp_size = get_pipeline_model_parallel_world_size() if not is_first_or_last_stage(pp_size, args.encoder_pipeline_model_parallel_size): # Note these are all set to None above. - return tokens, labels, loss_mask, attention_mask, position_ids, imgs, num_tiles + return tokens, labels, loss_mask, attention_mask, position_ids, imgs, num_tiles, packed_seq_params # Broadcast data. 
torch.cuda.nvtx.range_push("get_data") @@ -66,7 +66,7 @@ def get_batch(data_iterator): cu_lengths = tensor_parallel.broadcast_data(["cu_lengths"], data, torch.int32)["cu_lengths"] max_lengths = tensor_parallel.broadcast_data(["max_lengths"], data, torch.int32)["max_lengths"] - # Dummy image, no image. + # No image input (text-only sample) if the dataloader produced a dummy image. if imgs.shape == torch.Size([1, 1]): # FIXME: text-only data can cause a hang if the vision model is own its own pipeline rank and --freeze-ViT is enabled. imgs = torch.tensor([], dtype=torch.float32, device=data_text.device) diff --git a/examples/retro/README.md b/examples/retro/README.md old mode 100755 new mode 100644 diff --git a/examples/retro/preprocess_data.sh b/examples/retro/preprocess_data.sh old mode 100755 new mode 100644 diff --git a/examples/retro/train_retro_2b_distributed.sh b/examples/retro/train_retro_2b_distributed.sh old mode 100755 new mode 100644 diff --git a/examples/run_simple_mcore_train_loop.py b/examples/run_simple_mcore_train_loop.py old mode 100755 new mode 100644 diff --git a/examples/t5/README.md b/examples/t5/README.md old mode 100755 new mode 100644 diff --git a/examples/t5/t5_mcore_train_curve.png b/examples/t5/t5_mcore_train_curve.png old mode 100755 new mode 100644 diff --git a/examples/t5/train_t5_220m_distributed.sh b/examples/t5/train_t5_220m_distributed.sh old mode 100755 new mode 100644 diff --git a/hostfile_gptmoe b/hostfile_gptmoe new file mode 100644 index 0000000000000000000000000000000000000000..7dd871e4a7f81126a9ceef3fab50472a07b82f91 --- /dev/null +++ b/hostfile_gptmoe @@ -0,0 +1,16 @@ +node007 slots=8 +node008 slots=8 +node009 slots=8 +node010 slots=8 +node011 slots=8 +node012 slots=8 +node013 slots=8 +node014 slots=8 +node015 slots=8 +node016 slots=8 +node017 slots=8 +node018 slots=8 +node019 slots=8 +node020 slots=8 +node021 slots=8 +node022 slots=8 diff --git a/hostfile_mixtral8x7B b/hostfile_mixtral8x7B new file mode 100644 index 0000000000000000000000000000000000000000..582f84799c0e467438b171420cf9f68a5d726720 --- /dev/null +++ b/hostfile_mixtral8x7B @@ -0,0 +1,2 @@ +node007 slots=8 +node008 slots=8 diff --git a/images/model_table.png b/images/model_table.png old mode 100755 new mode 100644 diff --git a/images/strong_scaling.png b/images/strong_scaling.png old mode 100755 new mode 100644 diff --git a/images/weak_scaling.png b/images/weak_scaling.png old mode 100755 new mode 100644 diff --git a/kill.sh b/kill.sh new file mode 100644 index 0000000000000000000000000000000000000000..688f93a6ec728bbc159d533e050cc5e65d0d00bf --- /dev/null +++ b/kill.sh @@ -0,0 +1,3 @@ +ps -ef | grep python3 | grep -v grep | awk '{print "kill "$2}' >& .tmp +bash .tmp + diff --git a/megatron/core/QuickStart.md b/megatron/core/QuickStart.md old mode 100755 new mode 100644 diff --git a/megatron/core/README.md b/megatron/core/README.md old mode 100755 new mode 100644 diff --git a/megatron/core/README_STRAGGLER.md b/megatron/core/README_STRAGGLER.md old mode 100755 new mode 100644 diff --git a/megatron/core/__init__.py b/megatron/core/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/config_logger.py b/megatron/core/config_logger.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/Makefile b/megatron/core/datasets/Makefile old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/__init__.py b/megatron/core/datasets/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/bert_dataset.py 
b/megatron/core/datasets/bert_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/blended_dataset.py b/megatron/core/datasets/blended_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/blended_megatron_dataset_builder.py b/megatron/core/datasets/blended_megatron_dataset_builder.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/blended_megatron_dataset_config.py b/megatron/core/datasets/blended_megatron_dataset_config.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/gpt_dataset.py b/megatron/core/datasets/gpt_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/helpers.cpp b/megatron/core/datasets/helpers.cpp old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/helpers.py b/megatron/core/datasets/helpers.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/helpers_cpp.cpython-310-x86_64-linux-gnu.so_bak b/megatron/core/datasets/helpers_cpp.cpython-310-x86_64-linux-gnu.so_bak new file mode 100755 index 0000000000000000000000000000000000000000..c398ffc62ef7822e975f481e70ff69ece755f775 Binary files /dev/null and b/megatron/core/datasets/helpers_cpp.cpython-310-x86_64-linux-gnu.so_bak differ diff --git a/megatron/core/datasets/indexed_dataset.py b/megatron/core/datasets/indexed_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/masked_dataset.py b/megatron/core/datasets/masked_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/megatron_dataset.py b/megatron/core/datasets/megatron_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/megatron_tokenizer.py b/megatron/core/datasets/megatron_tokenizer.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/multimodal_dataset.py b/megatron/core/datasets/multimodal_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/readme.md b/megatron/core/datasets/readme.md old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/__init__.py b/megatron/core/datasets/retro/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/config/__init__.py b/megatron/core/datasets/retro/config/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/config/bert_embedders.py b/megatron/core/datasets/retro/config/bert_embedders.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/config/config.py b/megatron/core/datasets/retro/config/config.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/config/gpt_chunk_datasets.py b/megatron/core/datasets/retro/config/gpt_chunk_datasets.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/config/tokenizers.py b/megatron/core/datasets/retro/config/tokenizers.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/db/__init__.py b/megatron/core/datasets/retro/db/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/db/build.py b/megatron/core/datasets/retro/db/build.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/db/dataset.py b/megatron/core/datasets/retro/db/dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/db/utils.py b/megatron/core/datasets/retro/db/utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/external_libs.py b/megatron/core/datasets/retro/external_libs.py old 
mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/__init__.py b/megatron/core/datasets/retro/index/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/build.py b/megatron/core/datasets/retro/index/build.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/factory.py b/megatron/core/datasets/retro/index/factory.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/index.py b/megatron/core/datasets/retro/index/index.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/indexes/__init__.py b/megatron/core/datasets/retro/index/indexes/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/indexes/faiss_base.py b/megatron/core/datasets/retro/index/indexes/faiss_base.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/indexes/faiss_par_add.py b/megatron/core/datasets/retro/index/indexes/faiss_par_add.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/utils.py b/megatron/core/datasets/retro/index/utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/validate.py b/megatron/core/datasets/retro/index/validate.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/query/__init__.py b/megatron/core/datasets/retro/query/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/query/gpt_chunk_dataset.py b/megatron/core/datasets/retro/query/gpt_chunk_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py b/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/query/query.py b/megatron/core/datasets/retro/query/query.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/query/retro_dataset.py b/megatron/core/datasets/retro/query/retro_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/query/utils.py b/megatron/core/datasets/retro/query/utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/utils.py b/megatron/core/datasets/retro/utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/t5_dataset.py b/megatron/core/datasets/t5_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/utils.py b/megatron/core/datasets/utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/utils_s3.py b/megatron/core/datasets/utils_s3.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/__init__.py b/megatron/core/dist_checkpointing/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/core.py b/megatron/core/dist_checkpointing/core.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/dict_utils.py b/megatron/core/dist_checkpointing/dict_utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/exchange_utils.py b/megatron/core/dist_checkpointing/exchange_utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/mapping.py b/megatron/core/dist_checkpointing/mapping.py old mode 100755 new mode 100644 index 2ddfcf3b315a912ef523dc118933644ccbdc97f6..d376c6374baf7053b549f0686713e0e1c672d7b2 --- a/megatron/core/dist_checkpointing/mapping.py +++ 
b/megatron/core/dist_checkpointing/mapping.py @@ -119,7 +119,8 @@ class ShardedTensor(ShardedBase): self.init_data(device='meta') if self.data.shape != real_data.shape: raise CheckpointingException( - f'Data shape doesnt match expected {self.data.shape} for {self}' + f'Data shape {real_data.shape} doesnt match' + f' expected {self.data.shape} for {self}' ) finally: self.data = real_data diff --git a/megatron/core/dist_checkpointing/optimizer.py b/megatron/core/dist_checkpointing/optimizer.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/serialization.py b/megatron/core/dist_checkpointing/serialization.py old mode 100755 new mode 100644 index 3be5777e747b66742a67a9cc6279961a4b516de5..600dd87e5438620a9214d97f2c688f056e5c4aef --- a/megatron/core/dist_checkpointing/serialization.py +++ b/megatron/core/dist_checkpointing/serialization.py @@ -104,8 +104,6 @@ def load( checkpoint_dir = Path(checkpoint_dir) common_state_dict = common_strategy.load_common(checkpoint_dir) - if not sharded_state_dict: - return common_state_dict sharded_state_dict, nonpersistent_state_dict, sh_ten_factories = load_preprocess( sharded_state_dict diff --git a/megatron/core/dist_checkpointing/state_dict_transformation.py b/megatron/core/dist_checkpointing/state_dict_transformation.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/__init__.py b/megatron/core/dist_checkpointing/strategies/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/async_utils.py b/megatron/core/dist_checkpointing/strategies/async_utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/base.py b/megatron/core/dist_checkpointing/strategies/base.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/common.py b/megatron/core/dist_checkpointing/strategies/common.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/filesystem_async.py b/megatron/core/dist_checkpointing/strategies/filesystem_async.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/fully_parallel.py b/megatron/core/dist_checkpointing/strategies/fully_parallel.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/resharding.py b/megatron/core/dist_checkpointing/strategies/resharding.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/state_dict_saver.py b/megatron/core/dist_checkpointing/strategies/state_dict_saver.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/tensorstore.py b/megatron/core/dist_checkpointing/strategies/tensorstore.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/torch.py b/megatron/core/dist_checkpointing/strategies/torch.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/two_stage.py b/megatron/core/dist_checkpointing/strategies/two_stage.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/zarr.py b/megatron/core/dist_checkpointing/strategies/zarr.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/utils.py b/megatron/core/dist_checkpointing/utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/validation.py b/megatron/core/dist_checkpointing/validation.py old mode 100755 new mode 100644 
index 48e023dc3945ff05cd9d009fbe296ce328d0937f..546ec3547f9144bcfaa6c3dcc88e3e7011c10c05 --- a/megatron/core/dist_checkpointing/validation.py +++ b/megatron/core/dist_checkpointing/validation.py @@ -412,7 +412,7 @@ def validate_sharding_integrity( CheckpointingException for invalid access pattern """ - if common_state_dict: + if common_state_dict is not None: _validate_common_state_dict(common_state_dict) if torch.distributed.get_rank() != 0: @@ -461,10 +461,15 @@ def _validate_sharding_for_key(rank_sharding: List[Tuple[int, ShardedTensor]]): lambda x: x[1], _validate_sharding_for_key_flattened, ) - else: - if not torch.all(shard_access_cnt == 1): - logger.error(f'Invalid access pattern for {rank_sharding[0][1]}: {shard_access_cnt}') - raise CheckpointingException(f'Invalid access pattern for {rank_sharding[0][1]}') + # For each shard with at least 1 flattened tensor in it, the above + # `_validate_sharding_for_key_flattened` ensure a correct consistent pattern + # The only thing that can go wrong at this point is that some shard don't have + # *any* representatives which will be checked later by comparing `shard_access_cnt == 1` + shard_access_cnt = torch.minimum(shard_access_cnt, torch.tensor([1])) + if not torch.all(shard_access_cnt == 1): + raise CheckpointingException( + f'Invalid access pattern for {rank_sharding[0][1]}: {shard_access_cnt}' + ) def _compute_shards_access(rank_sharding): @@ -489,16 +494,10 @@ def _validate_sharding_for_key_flattened(tensors_by_shard): all_slices.append((sharding.flattened_range.start, sharding.flattened_range.stop)) starts, stops = map(np.asarray, zip(*sorted(all_slices))) - if ( - starts[0] != 0 - or stops[-1] != np.product(local_shape) - or not np.all(starts[1:] == stops[:-1]) - ): - logger.error( - f'Flattened ranges dont cover the whole shard {tensors_by_shard[0]}. Ranges: {(starts, stops)}' - ) + expected_size = np.product(local_shape) + if starts[0] != 0 or stops[-1] != expected_size or not np.all(starts[1:] == stops[:-1]): raise CheckpointingException( - f'Flattened ranges dont cover the whole shard {tensors_by_shard[0]}. Ranges: {(starts, stops)}' + f'Flattened ranges dont cover the whole shard {tensors_by_shard[0]} of size {expected_size}. Ranges: {(starts, stops)}' ) diff --git a/megatron/core/distributed/README.md b/megatron/core/distributed/README.md old mode 100755 new mode 100644 diff --git a/megatron/core/distributed/__init__.py b/megatron/core/distributed/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/distributed/data_parallel_base.py b/megatron/core/distributed/data_parallel_base.py old mode 100755 new mode 100644 diff --git a/megatron/core/distributed/distributed_data_parallel.py b/megatron/core/distributed/distributed_data_parallel.py old mode 100755 new mode 100644 index 3a23426eca03f97c8ac88b131e3d1f50cde86e62..ea08db6c127e6413a365ea8b51139ac698dd8c27 --- a/megatron/core/distributed/distributed_data_parallel.py +++ b/megatron/core/distributed/distributed_data_parallel.py @@ -7,6 +7,7 @@ import torch from .. 
import parallel_state from ..config_logger import has_config_logger_enabled, log_config_to_disk +from ..transformer.cuda_graphs import is_graph_capturing from ..transformer.transformer_config import TransformerConfig from ..utils import is_float8tensor, log_single_rank from .data_parallel_base import _BaseDataParallel @@ -151,12 +152,20 @@ class DistributedDataParallel(_BaseDataParallel): with_context_parallel=True ) if self.ddp_config.average_in_collective: - # Collective is averaging gradients in collective with data_parallel_group. - assert ( - gradient_scaling_factor - / parallel_state.get_data_parallel_world_size(with_context_parallel=True) - == target_gradient_scaling_factor - ) + if self.ddp_config.num_distributed_optimizer_instances == 1: + # Collective is averaging gradients in collective with data_parallel_group. + assert ( + gradient_scaling_factor + / torch.distributed.get_world_size(group=data_parallel_group) + == target_gradient_scaling_factor + ) + else: + # For non-expert parameters, gradient_scaling_factor is 1. + # For expert parameters, gradient_scaling_factor is 1/ep_size. + assert (gradient_scaling_factor == 1) or ( + gradient_scaling_factor + == (1.0 / parallel_state.get_expert_model_parallel_world_size()) + ) else: assert gradient_scaling_factor == target_gradient_scaling_factor @@ -297,9 +306,10 @@ class DistributedDataParallel(_BaseDataParallel): self._make_forward_pre_hook() ) - def disable_forward_pre_hook(self): + def disable_forward_pre_hook(self, param_sync: bool = True): """ Disable forward pre-hooks needed for param all-gather overlap with forward compute. + Skip synchronous param all-gather if `param_sync` is False. """ assert self.use_forward_hook # De-register forward pre-hook for all sub-modules. @@ -310,7 +320,8 @@ class DistributedDataParallel(_BaseDataParallel): assert len(self.remove_forward_pre_hook_handles) == 0 # Force synchronize parameters. - self.start_param_sync(force_sync=True) + if param_sync: + self.start_param_sync(force_sync=True) def _make_forward_pre_hook(self): """ @@ -323,6 +334,9 @@ class DistributedDataParallel(_BaseDataParallel): self.use_forward_hook ), "Should use pre-hook only when overlap_param_gather is True" + if is_graph_capturing(): + return + # Make sure all parameters in this module have been all-gathered as necessary. 
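The hunk above adds is_graph_capturing() guards so that the param-gather forward pre-hook and the grad-ready hook become no-ops while a CUDA graph is being captured, presumably to keep the hooks' bookkeeping and communication launches out of the captured graph. Below is a minimal, self-contained sketch of that guard pattern in plain PyTorch; toy_is_graph_capturing and the sync_params callback are illustrative stand-ins, not Megatron APIs.

import torch

_CAPTURING = False  # stand-in for megatron.core.transformer.cuda_graphs.is_graph_capturing()

def toy_is_graph_capturing() -> bool:
    return _CAPTURING

def make_forward_pre_hook(sync_params):
    # Mirrors the pattern above: skip the (hypothetical) param sync while capturing.
    def hook(module, args):
        if toy_is_graph_capturing():
            return
        sync_params(module)
    return hook

if __name__ == "__main__":
    layer = torch.nn.Linear(8, 8)
    sync_calls = []
    layer.register_forward_pre_hook(make_forward_pre_hook(lambda m: sync_calls.append(m)))
    layer(torch.randn(2, 8))   # hook fires and "syncs" params
    _CAPTURING = True          # pretend a CUDA graph capture is in progress
    layer(torch.randn(2, 8))   # hook returns early, no sync
    print(len(sync_calls))     # -> 1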
for param in module.parameters(recurse=False): # Skip parameters without an associated buffer (such parameters have a @@ -353,6 +367,9 @@ class DistributedDataParallel(_BaseDataParallel): """ def hook(*unused): + if is_graph_capturing(): + return + if param in self.param_to_bucket_group: assert param.requires_grad if self.ddp_config.overlap_grad_reduce: diff --git a/megatron/core/distributed/distributed_data_parallel_config.py b/megatron/core/distributed/distributed_data_parallel_config.py old mode 100755 new mode 100644 diff --git a/megatron/core/distributed/finalize_model_grads.py b/megatron/core/distributed/finalize_model_grads.py old mode 100755 new mode 100644 diff --git a/megatron/core/distributed/param_and_grad_buffer.py b/megatron/core/distributed/param_and_grad_buffer.py old mode 100755 new mode 100644 index 00c8fdd69db447a71518792d10c926c45a17c795..5095a7c7f3b44f8a7040d3263873aebd2a76b681 --- a/megatron/core/distributed/param_and_grad_buffer.py +++ b/megatron/core/distributed/param_and_grad_buffer.py @@ -270,13 +270,12 @@ class _ParamAndGradBucketGroup: if self.ddp_config.average_in_collective: reduce_op = torch.distributed.ReduceOp.AVG - # Stream synchronization logic of the CUDA streams that is - # implemented below for the gradient reduction within and across - # distributed optimizer instances. + # We use the following stream synchronization for the gradient reduction + # within and across DistOpt instances. - # Compute Stream - -------------Gradient Compute------------------- - # Comm. Stream - ------(wait for nccl)-----(wait for nccl)------- - # NCCL Stream - -------RS------ -------AR------ + # Compute Stream: -------------Gradient compute------------------- + # Comm. Stream: ------(wait for NCCL)-----(wait for NCCL)------- + # NCCL Stream: -------RS------ -------AR------ # Use async communications only when overlap_grad_reduce is True. async_op = ( @@ -287,13 +286,13 @@ class _ParamAndGradBucketGroup: self.ddp_config.num_distributed_optimizer_instances > 1 and self.ddp_config.overlap_grad_reduce ): - # Assign a communication stream if we use partial DP DistOpt and we - # need to overlap communication + # Assign a communication stream if we have multiple DistOpt instances and we + # need to overlap communication. stream_context = torch.cuda.stream(self.communication_stream) # The RS/AR communication stream needs to wait for the default stream # to complete its gradient computation before launching the next - # gradient reduction collective + # gradient reduction collective. self.communication_stream.wait_stream(torch.cuda.default_stream()) else: stream_context = nullcontext() @@ -314,24 +313,21 @@ class _ParamAndGradBucketGroup: local_data_view, bucket.grad_data, op=reduce_op, - group=self.intra_distributed_optimizer_instance_group, + group=communication_group, async_op=async_op, ) else: torch.distributed.all_reduce( - bucket.grad_data, - op=reduce_op, - group=self.data_parallel_group, - async_op=async_op, + bucket.grad_data, op=reduce_op, group=communication_group, async_op=async_op ) - # When enabling partial DP domain DistOpt, we need to All-Reduce across all partial domains + # With multiple DistOpt instances, we need to all-reduce across instances. if ( self.ddp_config.use_distributed_optimizer and self.ddp_config.num_distributed_optimizer_instances > 1 ): - # Create a new coalescing facility for the inter partial DP-AllReduce here + # Create a new coalescing manager for the inter-instance all-reduce. 
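As the comments above describe, with multiple distributed-optimizer instances the intra-instance reduce-scatter is launched on a dedicated communication stream that first waits on the default (compute) stream, and the consumer later waits on the communication stream before touching the reduced gradients (as finish_grad_sync does later in this hunk). The sketch below shows only that wait_stream handshake on a single GPU; the collective is replaced by an in-place op, and the names are illustrative rather than Megatron's.

import torch

def reduce_on_comm_stream(grad_buffer: torch.Tensor, comm_stream: torch.cuda.Stream) -> None:
    # 1) The communication stream must not start before the default stream has
    #    finished producing the gradients.
    comm_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(comm_stream):
        # 2) Stand-in for the reduce-scatter / all-reduce launched above.
        grad_buffer.mul_(0.5)
    # 3) Whoever consumes grad_buffer afterwards must wait on comm_stream.

if __name__ == "__main__" and torch.cuda.is_available():
    buf = torch.ones(1 << 20, device="cuda")
    comm = torch.cuda.Stream()
    buf.add_(1.0)                                   # "gradient compute" on the default stream
    reduce_on_comm_stream(buf, comm)
    torch.cuda.current_stream().wait_stream(comm)   # consumer-side sync
    torch.cuda.synchronize()
    print(buf[0].item())                            # 1.0, i.e. (1 + 1) * 0.5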
with stream_context, _coalescing_manager( self.inter_distributed_optimizer_instance_group, async_ops=async_op ) as cm: @@ -366,13 +362,13 @@ class _ParamAndGradBucketGroup: communication call to complete. When ddp_config.overlap_grad_reduce is set to False, makes synchronous call. """ - # If overlap_grad_reduce is False, start (and finish) synchronous communication call here. self.param_gather_dispatched = False + # If overlap_grad_reduce is False, start (and finish) synchronous communication call here. if not self.ddp_config.overlap_grad_reduce: self.start_grad_sync() return - # When using partial DP DistOpt, we don't need to sync as we launch comms on a separate - # communication stream + # When using multiple DistOpt instances, we don't need to sync here as we launch + # communications on a separate communication stream. if self.ddp_config.num_distributed_optimizer_instances > 1: torch.cuda.default_stream().wait_stream(self.communication_stream) return diff --git a/megatron/core/distributed/torch_fully_sharded_data_parallel.py b/megatron/core/distributed/torch_fully_sharded_data_parallel.py old mode 100755 new mode 100644 diff --git a/megatron/core/enums.py b/megatron/core/enums.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/__init__.py b/megatron/core/export/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/data_type.py b/megatron/core/export/data_type.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/export_config.py b/megatron/core/export/export_config.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/model_type.py b/megatron/core/export/model_type.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/__init__.py b/megatron/core/export/trtllm/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/engine_builder/__init__.py b/megatron/core/export/trtllm/engine_builder/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py b/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py b/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py b/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/trt_model_config.py b/megatron/core/export/trtllm/trt_model_config.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/trt_model_type.py b/megatron/core/export/trtllm/trt_model_type.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/trtllm_helper.py b/megatron/core/export/trtllm/trtllm_helper.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/trtllm_layers.py b/megatron/core/export/trtllm/trtllm_layers.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py b/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py b/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py old mode 100755 new mode 100644 diff --git 
a/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py b/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py old mode 100755 new mode 100644 diff --git a/megatron/core/extensions/__init__.py b/megatron/core/extensions/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py old mode 100755 new mode 100644 index 62336cdb034919241112baa10c6407cb00506892..a89e272e51ef9a12b1421c3054f40f95f3c53d15 --- a/megatron/core/extensions/transformer_engine.py +++ b/megatron/core/extensions/transformer_engine.py @@ -13,8 +13,8 @@ from packaging.version import Version as PkgVersion from torch import Tensor from torch.nn.parameter import Parameter -from megatron.core import ModelParallelConfig from megatron.core.dist_checkpointing.utils import replace_prefix_for_sharding +from megatron.core.model_parallel_config import ModelParallelConfig from megatron.core.packed_seq_params import PackedSeqParams from megatron.core.parallel_state import ( get_context_parallel_global_ranks, @@ -654,6 +654,23 @@ class TEDotProductAttention(te.pytorch.DotProductAttention): else: kv_channels = self.config.kv_channels + self.kept_packed_seq_params = set( + field.name for field in dataclasses.fields(PackedSeqParams) + ) + if get_te_version() < PkgVersion("1.3.0"): + # TE 1.3.0 introduces precomputing max_seqlen to remove unnecessary kernels and D2H + # copies (#555) + # These two arguments did not exist prior to 1.3.0 + self.kept_packed_seq_params.discard("max_seqlen_q") + self.kept_packed_seq_params.discard("max_seqlen_kv") + + if get_te_version() < PkgVersion("1.10.0"): + # TE 1.8.0 introduces cu_seqlens_padded which is the cu_seqlens with paddings counted + # in each individual sequence in THD format dataset + # These two arguments did not exist prior to 1.8.0. Full support added in 1.10.0 (#1012) + self.kept_packed_seq_params.discard("cu_seqlens_q_padded") + self.kept_packed_seq_params.discard("cu_seqlens_kv_padded") + super().__init__( num_attention_heads=self.config.num_attention_heads, kv_channels=kv_channels, @@ -683,7 +700,9 @@ class TEDotProductAttention(te.pytorch.DotProductAttention): ): """Forward.""" packed_seq_kwargs = ( - dataclasses.asdict(packed_seq_params) if packed_seq_params is not None else {} + {key: getattr(packed_seq_params, key) for key in self.kept_packed_seq_params} + if packed_seq_params is not None + else {} ) # overwrite self.qkv_format depending on self.config.apply_rope_fusion, which can be set # after init @@ -692,24 +711,10 @@ class TEDotProductAttention(te.pytorch.DotProductAttention): qkv_format = packed_seq_kwargs.get('qkv_format', self.qkv_format) - if get_te_version() < PkgVersion("1.3.0"): - # TE 1.3.0 introduces precomputing max_seqlen to remove unnecessary kernels and D2H - # copies (#555) - # These two arguments did not exist prior to 1.3.0 - packed_seq_kwargs.pop("max_seqlen_q", None) - packed_seq_kwargs.pop("max_seqlen_kv", None) - - if get_te_version() < PkgVersion("1.10.0"): - # TE 1.8.0 introduces cu_seqlens_padded which is the cu_seqlens with paddings counted - # in each individual sequence in THD format dataset - # These two arguments did not exist prior to 1.8.0.Full support added in 1.10.0 (#1012) - packed_seq_kwargs.pop("cu_seqlens_q_padded", None) - packed_seq_kwargs.pop("cu_seqlens_kv_padded", None) - # WAR for peak memory usage. 
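The TEDotProductAttention change above moves the Transformer Engine version check out of forward(): the set of PackedSeqParams fields the installed TE accepts is computed once in __init__ (kept_packed_seq_params) and then used to build the kwargs dict on every call. The following standalone sketch shows the same "filter dataclass fields by library version" idea; ToyPackedSeqParams and TOY_TE_VERSION are made-up stand-ins so it runs without Transformer Engine installed.

import dataclasses
from packaging.version import Version as PkgVersion

@dataclasses.dataclass
class ToyPackedSeqParams:              # invented stand-in for Megatron's PackedSeqParams
    qkv_format: str = "thd"
    cu_seqlens_q: object = None
    cu_seqlens_kv: object = None
    max_seqlen_q: object = None
    max_seqlen_kv: object = None
    cu_seqlens_q_padded: object = None
    cu_seqlens_kv_padded: object = None

TOY_TE_VERSION = PkgVersion("1.7.0")   # pretend this came from get_te_version()

# Decide once which fields the installed library can accept ...
kept = {f.name for f in dataclasses.fields(ToyPackedSeqParams)}
if TOY_TE_VERSION < PkgVersion("1.3.0"):
    kept.discard("max_seqlen_q")
    kept.discard("max_seqlen_kv")
if TOY_TE_VERSION < PkgVersion("1.10.0"):
    kept.discard("cu_seqlens_q_padded")
    kept.discard("cu_seqlens_kv_padded")

# ... and on every call just project the dataclass onto that set of fields.
params = ToyPackedSeqParams()
packed_seq_kwargs = {name: getattr(params, name) for name in kept}
print(sorted(packed_seq_kwargs))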
# See https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/2388 if self.config.apply_rope_fusion and qkv_format == 'bshd': - query, key, value = [x.contiguous().transpose(0, 1) for x in (query, key, value)] + query, key, value = [x.transpose(0, 1).contiguous() for x in (query, key, value)] # In PyTorch, the following two tensors are in fact the same: # Tensor with shape (1, S, H, D) and stride (S*H*D, H*D, D, 1) # Tensor with shape (1, S, H, D) and stride (H*D, H*D, D, 1) @@ -760,7 +765,7 @@ class TEDotProductAttention(te.pytorch.DotProductAttention): if is_te_min_version("1.9.0.dev0"): - class TEGroupedLinear(te.pytorch.GroupedLinear): + class TEGroupedLinear(te.pytorch.BatchLinear if int(os.getenv("GROUPED_GEMM_BatchLinear", '0')) else te.pytorch.GroupedLinear): """ Wrapper for the Transformer-Engine's `GroupedLinear` layer. @@ -1229,8 +1234,14 @@ try: from transformer_engine.pytorch.attention import FusedRoPEFunc - def fused_apply_rotary_pos_emb(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor: + def fused_apply_rotary_pos_emb( + t: torch.Tensor, freqs: torch.Tensor, transpose_output_memory: bool = False + ) -> torch.Tensor: """Apply rotary positional embedding to input tensor T in `sbhd` format.""" + if transpose_output_memory: + warnings.warn( + "transpose_output_memory is not supported by TE's fused RoPE and will be ignored." + ) return FusedRoPEFunc.apply(t, freqs, "sbhd") def fused_apply_rotary_pos_emb_thd( diff --git a/megatron/core/fusions/__init__.py b/megatron/core/fusions/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/fusions/fused_bias_dropout.py b/megatron/core/fusions/fused_bias_dropout.py old mode 100755 new mode 100644 diff --git a/megatron/core/fusions/fused_bias_geglu.py b/megatron/core/fusions/fused_bias_geglu.py old mode 100755 new mode 100644 diff --git a/megatron/core/fusions/fused_bias_gelu.py b/megatron/core/fusions/fused_bias_gelu.py old mode 100755 new mode 100644 diff --git a/megatron/core/fusions/fused_bias_swiglu.py b/megatron/core/fusions/fused_bias_swiglu.py old mode 100755 new mode 100644 diff --git a/megatron/core/fusions/fused_cross_entropy.py b/megatron/core/fusions/fused_cross_entropy.py old mode 100755 new mode 100644 diff --git a/megatron/core/fusions/fused_layer_norm.py b/megatron/core/fusions/fused_layer_norm.py old mode 100755 new mode 100644 diff --git a/megatron/core/fusions/fused_softmax.py b/megatron/core/fusions/fused_softmax.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/__init__.py b/megatron/core/inference/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/ammo_support/__init__.py b/megatron/core/inference/ammo_support/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/ammo_support/gpt/model_specs.py b/megatron/core/inference/ammo_support/gpt/model_specs.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/ammo_support/gpt/state_dict_hooks.py b/megatron/core/inference/ammo_support/gpt/state_dict_hooks.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/common_inference_params.py b/megatron/core/inference/common_inference_params.py old mode 100755 new mode 100644 index 22353088f8adbda6f331cce26a7a1297aa7faccd..7955bb6fc143bf5daf754d20817b23e1d0ab5d72 --- a/megatron/core/inference/common_inference_params.py +++ b/megatron/core/inference/common_inference_params.py @@ -1,29 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
-from dataclasses import dataclass - - -@dataclass -class CommonInferenceParams: - """Inference parameters sent along with the prompts - - For an explanation of these parameters refer to this blog https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-temperature-parameters-ed6a31313910 - """ - - temperature: float = 1.0 - top_k: int = 0 - top_p: float = 0.0 - return_log_probs: bool = False - num_tokens_to_generate: int = 30 - - def add_attributes(self, attribute_value_pair: dict): - """Utility to add more attributes to inference params - - Use this method to pass in a custom dictonary to add more inference parameter attributes to the instance you created. Use as follows - c = CommonInferenceParams - c.add_attributes({'min_length':4, 'eod_id':153}) - - Args: - attribute_value_pair (dict): A dictionary containing attributes as the key names and their values as the values. - """ - for key, value in attribute_value_pair.items(): - setattr(self, key, value) +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from megatron.core.inference.sampling_params import ( # noqa: F401 # pylint: disable=unused-import + SamplingParams as CommonInferenceParams, +) diff --git a/megatron/core/inference/communication_utils.py b/megatron/core/inference/communication_utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/engines/__init__.py b/megatron/core/inference/engines/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/engines/abstract_engine.py b/megatron/core/inference/engines/abstract_engine.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/engines/mcore_engine.py b/megatron/core/inference/engines/mcore_engine.py old mode 100755 new mode 100644 index fe8160228bdb6dd7e1bc966273c677ca50e7ae4f..28ef46bf9292999ee68028d98c41ff8e8aa74f43 --- a/megatron/core/inference/engines/mcore_engine.py +++ b/megatron/core/inference/engines/mcore_engine.py @@ -3,12 +3,12 @@ from typing import Dict, List import torch -from megatron.core.inference.common_inference_params import CommonInferenceParams from megatron.core.inference.engines.abstract_engine import AbstractEngine from megatron.core.inference.inference_request import InferenceRequest +from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.scheduler import Scheduler -from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import ( - SimpleTextGenerationController, +from megatron.core.inference.text_generation_controllers.text_generation_controller import ( + TextGenerationController, ) @@ -19,7 +19,7 @@ class MCoreEngine(AbstractEngine): Supports any model that is callable (Accepts the inputs and outputs the tensor) Args: - text_generation_controller (SimpleTextGenerationController): A text generation + text_generation_controller (TextGenerationController): A text generation controller that will be used to define how to preprocess prompts, generate outputs and detokenizer the output tokens. 
max_batch_size : The maxinum number of requests to process at once @@ -29,7 +29,7 @@ class MCoreEngine(AbstractEngine): def __init__( self, - text_generation_controller: SimpleTextGenerationController, + text_generation_controller: TextGenerationController, max_batch_size, random_seed: int = None, ): @@ -42,7 +42,8 @@ class MCoreEngine(AbstractEngine): prompts: List[str], add_BOS: bool = False, encoder_prompts: List[str] = None, - common_inference_params: CommonInferenceParams = None, + common_inference_params: SamplingParams = None, + sampling_params: SamplingParams = None, ) -> dict: """The megatron core inference backend generate function @@ -54,13 +55,19 @@ class MCoreEngine(AbstractEngine): prompts (List[str]): All the prompts as a list of strings add_BOS (bool): Whether to add BOS token to beginning of prompts encoder_prompts (List[dict]): All the encoder prompts as a list of strings - common_inference_params (CommonInferenceParams): The inference parameters + common_inference_params: Deprecated. Only used for backward compatibility with + MCore <= 0.9.0. Use `sampling_params` going forward. + sampling_params (SamplingParams): The request-level sampling parameters Returns: List[InferenceRequest]: The output is list of inference requests containing the generated tokens, texts and log probs if required """ # TODO :M core- get rng state tracker + + if common_inference_params: + sampling_params = common_inference_params + if self.random_seed: torch.random.manual_seed(self.random_seed) @@ -73,7 +80,7 @@ class MCoreEngine(AbstractEngine): prompt=prompt, prompt_tokens=prompt_tokens, encoder_prompt=encoder_prompt, - inference_parameters=common_inference_params, + inference_parameters=sampling_params, ) self.run_engine() diff --git a/megatron/core/inference/inference_request.py b/megatron/core/inference/inference_request.py old mode 100755 new mode 100644 index 4825dfd3661d8b26e0cec5f003fdf1486886a2d5..ea0d67bfea26112db6219f53a3fcbec244e58ca3 --- a/megatron/core/inference/inference_request.py +++ b/megatron/core/inference/inference_request.py @@ -5,7 +5,7 @@ from typing import List import torch -from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.sampling_params import SamplingParams # class syntax @@ -28,7 +28,7 @@ class InferenceRequest: request_id: str prompt: str - inference_parameters: CommonInferenceParams + inference_parameters: SamplingParams prompt_tokens: List[int] arrival_time: float status: Status diff --git a/megatron/core/inference/model_inference_wrappers/__init__.py b/megatron/core/inference/model_inference_wrappers/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/model_inference_wrappers/gpt/__init__.py b/megatron/core/inference/model_inference_wrappers/gpt/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py b/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/model_inference_wrappers/t5/__init__.py 
b/megatron/core/inference/model_inference_wrappers/t5/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/modelopt_support/__init__.py b/megatron/core/inference/modelopt_support/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/modelopt_support/gpt/__init__.py b/megatron/core/inference/modelopt_support/gpt/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/modelopt_support/gpt/model_specs.py b/megatron/core/inference/modelopt_support/gpt/model_specs.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/modelopt_support/gpt/state_dict_hooks.py b/megatron/core/inference/modelopt_support/gpt/state_dict_hooks.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/sampling_params.py b/megatron/core/inference/sampling_params.py new file mode 100644 index 0000000000000000000000000000000000000000..8ffcb6321dc13cb04574accaa0636987454222fa --- /dev/null +++ b/megatron/core/inference/sampling_params.py @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from dataclasses import dataclass + + +@dataclass +class SamplingParams: + """Inference parameters sent along with the prompts. + This class contains request-level attributes that control the sampling techniques used when + generating text. This is distinct from megatron.core.InferenceParams, which sets model-level + inference attributes such as the maximum sequence length, and contains the KV cache. + + For an explanation of these parameters refer to this blog + https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and- + temperature-parameters-ed6a31313910 + """ + + temperature: float = 1.0 + top_k: int = 0 + top_p: float = 0.0 + return_log_probs: bool = False + num_tokens_to_generate: int = 30 + + def add_attributes(self, attribute_value_pair: dict): + """Utility to add more attributes to sampling params + + Use this method to pass in a custom dictionary to add more sampling parameter attributes. + c = SamplingParams() + c.add_attributes({'min_length':4, 'eod_id':153}) + + Args: + attribute_value_pair (dict): A dictionary containing attributes as the key names and + their values as the values.
+ """ + for key, value in attribute_value_pair.items(): + setattr(self, key, value) diff --git a/megatron/core/inference/scheduler.py b/megatron/core/inference/scheduler.py old mode 100755 new mode 100644 index 00ab81b4abdb055e38e61b967c58509f9d773536..ef177232b42419fdae7284155701e403368561a6 --- a/megatron/core/inference/scheduler.py +++ b/megatron/core/inference/scheduler.py @@ -6,8 +6,8 @@ from typing import Dict import torch -from megatron.core.inference.common_inference_params import CommonInferenceParams from megatron.core.inference.inference_request import InferenceRequest, Status +from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.utils import Counter @@ -33,7 +33,7 @@ class Scheduler: prompt: str, prompt_tokens: torch.Tensor, encoder_prompt: str = None, - inference_parameters: CommonInferenceParams = None, + inference_parameters: SamplingParams = None, arrival_time: float = None, ): """Add an incoming request @@ -45,7 +45,7 @@ class Scheduler: prompt (str): Input prompt string prompt_tokens (torch.Tensor): A torch tensor having the input prompts tokenized encoder_prompt (str): Encoder input string - inference_parameters (CommonInferenceParams): The inference parameters + inference_parameters (SamplingParams): The inference parameters arrival_time (float, optional): The incoming request time. Defaults to None. """ request_id = str(next(self.request_counter)) diff --git a/megatron/core/inference/text_generation_controllers/__init__.py b/megatron/core/inference/text_generation_controllers/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py b/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py old mode 100755 new mode 100644 index 61beff0211ce25bfa5c207d7cf95c170ae0956e4..0c2a41be44a10228dce66541e0f93559e691f288 --- a/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +++ b/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py @@ -4,15 +4,15 @@ from typing import OrderedDict import torch from megatron.core.inference.inference_request import InferenceRequest -from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import ( - SimpleTextGenerationController, +from megatron.core.inference.text_generation_controllers.text_generation_controller import ( + TextGenerationController, ) -class EncoderDecoderTextGenerationController(SimpleTextGenerationController): +class EncoderDecoderTextGenerationController(TextGenerationController): """The text generation controller for encoder-decoder architecture - This class ingherits from SimpleTextGenerationController, adding features + This class inherits from TextGenerationController, adding features relating to encoder input encoder_prompt """ diff --git a/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py b/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py old mode 100755 new mode 100644 index 1103089935ab01ee885f2264790a58c5a93bac64..f97df132493416b0f53c267b0c9088ef7f668a0d --- a/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +++ b/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py @@ -1,400 +1,5 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
-from typing import List, OrderedDict, Tuple +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -import torch -import torch.nn.functional as F - -from megatron.core import parallel_state -from megatron.core.inference.common_inference_params import CommonInferenceParams -from megatron.core.inference.communication_utils import broadcast_from_last_pipeline_stage -from megatron.core.inference.inference_request import InferenceRequest, Status -from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import ( - AbstractModelInferenceWrapper, +from megatron.core.inference.text_generation_controllers.text_generation_controller import ( # noqa: F401 # pylint: disable=unused-import + TextGenerationController as SimpleTextGenerationController, ) - - -class SimpleTextGenerationController: - """The basic text generation controller - - This class is responsible for tokenizing the input , running the inference, sampling - and also detokenizing the output - - Args: - inference_wrapped_model (AbstractModelInferenceWrapper): A model that - is wrapped using the specs given in the abstract_model_inference_wrapper.py - tokenizer (_type_): Tokenizer used for tokenizing and detokenizing the prompts - """ - - def __init__(self, inference_wrapped_model: AbstractModelInferenceWrapper, tokenizer): - self.inference_wrapped_model = inference_wrapped_model - self.tokenizer = tokenizer - - # For models without pipeline parallelism, is_first_stage and is_last_stage returns True - self.model_is_pipeline_parallel = not ( - parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage() - ) - - def tokenize_prompt( - self, prompt: str, add_BOS: bool = False - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Utility to tokenize the input prompts - - Args: - prompt (str): The input prompt - - Returns: - torch.Tensor: Returns the tokenized prompt - """ - prompt_tokens = self.tokenizer.tokenize(prompt) - - if add_BOS: - prompt_tokens = [self.tokenizer.bos] + prompt_tokens - - return prompt_tokens - - def detokenize_generations(self, prompt_tokens_with_generated_tokens: torch.Tensor) -> str: - """Detokenize the output generations - - Args: - prompt_tokens_with_generated_tokens (torch.Tensor): The input prompt - tokens plus the generated tokens - - Returns: - str: The detokenized output - """ - tokens = prompt_tokens_with_generated_tokens.cpu().numpy().tolist() - return self.tokenizer.detokenize(tokens) - - def sample_from_logits( - self, - last_token_logits: torch.Tensor, - common_inference_params: CommonInferenceParams, - vocab_size: int = None, - ) -> torch.Tensor: - """Samples the logits to generate outputs - - Given the logits of the last token, this function samples it - according to the parameters defined in common_inference_params - and returns the samples - - Args: - last_token_logits (torch.Tensor): The last token logits. A tensor of - size [batch_size, vocab_size] - common_inference_params (CommonInferenceParams): The paramters to use - for inference - vocab_size (int): Obtained from the tokenizer. 
Defaults to None - - Returns: - torch.Tensor: 1D tensor of the sampled logits with [batch_size] elements - """ - - top_p = common_inference_params.top_p - top_k = common_inference_params.top_k - temperature = common_inference_params.temperature - - assert not (top_k > 0 and top_p > 0), 'Cannot have top-p and top-k both greater than zero' - assert top_p <= 1.0, 'top-p should be in (0,1]' - - def modify_logits_for_top_k_filtering(logits, top_k): - """Set the logits for none top-k values to -inf.""" - filter_ = logits < torch.topk(logits, top_k)[0][..., -1, None] - logits.masked_fill_(filter_, float('-Inf')) - - def modify_logits_for_top_p_filtering(logits, top_p): - """Set the logits for none top-p values to -inf.""" - # First sort and calculate cumulative sum of probabilities. - sorted_logits, sorted_indices = torch.sort(logits, descending=True) - cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1) - - # Filteration based on the cumulative sum. - filter_ = cumulative_probs > top_p - # This shift by 1 is weird and I cannot justify it. This existed - # in the original implementation: - # https://github.com/ari-holtzman/degen/blob/master/gen.py - # and I guess it is needed so keeping it for now. - filter_[:, 1:] = filter_[:, :-1].clone() - # Make sure we at least have one token to select from. - filter_[..., 0] = 0 - - # Fill in the filtered part - filter_ = filter_.scatter(1, sorted_indices, filter_) - logits.masked_fill_(filter_, float('-Inf')) - - # Greedy sampling - if top_k == 1: - sampled_logits = torch.argmax(last_token_logits, dim=-1) - else: - last_token_logits = last_token_logits.clone() - if temperature != 1.0: - last_token_logits.div_(temperature) - - if top_k > 1: - assert top_k <= last_token_logits.size(1), 'top-k is larger than logit size.' - if vocab_size: - assert top_k < vocab_size, 'top-k is larger than vocab size.' - modify_logits_for_top_k_filtering(last_token_logits, top_k) - - elif top_p > 0.0: - modify_logits_for_top_p_filtering(last_token_logits, top_p) - - # After filtering, we need to recalculate the distribution. - probabilities = last_token_logits.softmax(dim=-1) - sampled_logits = torch.multinomial(probabilities, num_samples=1).view(-1) - - # If vocab size is provided, make sure the samples are in in the range [0, vocab-size). - if vocab_size: - sampled_logits = torch.clamp(sampled_logits, min=0, max=(vocab_size - 1)) - return sampled_logits - - def update_generation_status( - self, - updated_prompts_tokens: torch.Tensor, - generation_started: torch.Tensor, - current_context_end_position: int, - is_generation_done_tensor: torch.Tensor, - generated_sequence_lengths: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Checks which prompts have reached an end condition - - We check which prompts have reached an end condition and set the corresponding - flags of the is_generation_done_tensor to True. The generated sequence lengths - increase as we keep generating, until that prompts hits an end condition. The - generation_started tensor determines which prompts have started generating. - - Args: - updated_prompts_tokens (torch.Tensor): The prompts tokens updated with the latest - generated tokens. A tensor of shape [batch_size, max_seq_len] - (i.e max_seq_len = max_prompt_len + tokens_to_generate) - generation_started (torch.Tensor): A boolean tensor of shape [batch_size]. True - indicates the prompt at that index has started generating tokens. 
- current_context_end_position (int): An integer indicating which position to - extract from the prompts tokens to get the latest generated tokens. - is_generation_done_tensor (torch.Tensor): A boolean tensor of shape [batch_size]. - True indicates the prompt at that index has reached end condition. - generated_sequence_lengths (torch.Tensor): A int tensor of shape [batch_size]. - Each value represents the generated sequence lengths for that prompt. - - Returns: - Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Returns the boolean - is_generation_done_tensor and the generated_sequence_lengths after updating it - """ - latest_samples = updated_prompts_tokens[:, current_context_end_position] - # Make sure we are checking eod criterion only for prompts that have started generating - # (i.e) We only look at the generated tokenns and not the input tokens. - reached_eod = (latest_samples == self.tokenizer.eod) & generation_started - is_generation_done_tensor = is_generation_done_tensor | reached_eod - # We increment generated sequence lengths when that prompt has not hit the - # EOD and generation has started - generated_sequence_lengths += ~is_generation_done_tensor & generation_started - - return is_generation_done_tensor, generated_sequence_lengths - - def pad_input_prompt_tokens( - self, - batch_prompt_tokens_list: List[List[int]], - max_prompt_length_in_batch: int, - num_tokens_to_generate: int, - ) -> torch.Tensor: - """Method to pad input prompts - - Given a list of prompts, pad them all to uniform length - - Args: - batch_prompt_tokens_list (List[List[int]]): A list containing the prompt tokens - max_prompt_length_in_batch (int): Maximum of the length of the input prompt tokens - num_tokens_togenerate (int): The number of tokens to generate for each prompt - - Returns: - torch.Tensor: A torch tensor of shape [bs, max_seq_len] (i.e) - max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate, - with extra indices for each tensor padded with mask id. - """ - max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate - - for prompt_tokens in batch_prompt_tokens_list: - padding_size = max_seq_len - len(prompt_tokens) - prompt_tokens.extend([self.tokenizer.eod] * padding_size) - - return torch.tensor(batch_prompt_tokens_list).cuda() - - def generate_output_tokens_dynamic_batch( - self, active_requests: OrderedDict[int, InferenceRequest] - ) -> OrderedDict[int, InferenceRequest]: - """Utility to generate the output tokens and probabilities for the prompts - - This utility generates the output tokens for a dynamic batch. It will run one forward step - at a time, and pass control back to the engine, which will update the request pool and call - this method again. - - Args: - active_requests (OrderedDict[int, InferenceRequest]): The input active requests. - - Returns: - OrderedDict[int, InferenceRequest]: The result for each of the incoming requests - after running one forward step. - """ - raise Exception("Not implemented yet") - - def generate_all_output_tokens_static_batch( - self, active_requests: OrderedDict[int, InferenceRequest] - ) -> OrderedDict[int, InferenceRequest]: - """Utility to generate the all the output tokens and probabilities for the prompts . - - This utility generates the output tokens for a static batch. 
It runs the forward steps till - all prompts complete generation, updates the status of these requests to completed, adds - the generated result and returns these requests - - Args: - active_requests (OrderedDict[int, InferenceRequest]): The input active requests. - - Returns: - OrderedDict[int, InferenceRequest]: The result for each of the incoming requests - """ - batch_prompt_tokens_list = list( - map(lambda request: request.prompt_tokens, active_requests.values()) - ) - prompt_lengths_in_batch = torch.tensor( - [len(prompt_tokens) for prompt_tokens in batch_prompt_tokens_list] - ).cuda() - max_prompt_length_in_batch = max(prompt_lengths_in_batch) - min_prompt_length_in_batch = min(prompt_lengths_in_batch) - - # For batch inference the inference params are the same for all request - common_inference_params: CommonInferenceParams = list(active_requests.values())[ - 0 - ].inference_parameters - - # max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate - batch_prompt_tokens = self.pad_input_prompt_tokens( - batch_prompt_tokens_list, - max_prompt_length_in_batch=max_prompt_length_in_batch, - num_tokens_to_generate=common_inference_params.num_tokens_to_generate, - ) - batch_size, max_sequence_length = batch_prompt_tokens.shape - - # Pre allocate log probs tensor - output_log_probs = None - if common_inference_params.return_log_probs: - output_log_probs = torch.empty( - (batch_size, max_sequence_length - 1), dtype=torch.float32 - ).cuda() - - # An array to check which of the prompts have reached end of generation condition - is_generation_done_tensor = torch.zeros(batch_size, dtype=torch.bool).cuda() - - # An array to act as a counter to keep track of generated sequence lengths - generated_sequence_lengths = torch.zeros(batch_size).cuda() - - with torch.no_grad(): - - self.prep_model_for_inference( - prompts_tokens=batch_prompt_tokens, active_requests=active_requests - ) - - context_start_position = 0 - # Pick the context window that we need to pass through the network. - for context_end_position in range(min_prompt_length_in_batch, max_sequence_length): - - inference_input = self.inference_wrapped_model.get_batch_for_context_window( - context_start_position, context_end_position - ) - - # Returns the final logits of shape [batch_size, context_length, vocab_size] - # Note: This is returned in all TP ranks or last PP stage in PP models - logits = self.inference_wrapped_model.run_one_forward_step(inference_input) - if self.model_is_pipeline_parallel: - context_length = context_end_position - context_start_position - logits = broadcast_from_last_pipeline_stage( - [batch_size, context_length, self.tokenizer.vocab_size], - dtype=self.inference_wrapped_model.inference_wrapper_config.params_dtype, - tensor=logits, - ) - - # Indicates which of the input prompts have started generating tokens. 
- # A 1D boolean tensor with [batch_size] elements (i.e) The shortest - # prompts will start generating first and so on - generation_started = prompt_lengths_in_batch <= context_end_position - last_token_logits = logits[:, -1, :] - sampled_logits = self.sample_from_logits( - last_token_logits, common_inference_params, self.tokenizer.vocab_size - ) - - # Substitute the sampled logits only for only the prompts that - # have started generating tokens - batch_prompt_tokens[generation_started, context_end_position] = sampled_logits[ - generation_started - ] - - if common_inference_params.return_log_probs: - log_probs = F.log_softmax(logits, dim=2) - indices = torch.unsqueeze( - batch_prompt_tokens[ - :, (context_start_position + 1) : (context_end_position + 1) - ], - 2, - ) - # Get the log probabilities for only the prompt tokens - output_log_probs[:, context_start_position:context_end_position] = torch.gather( - log_probs, 2, indices - ).squeeze(2) - - context_start_position = context_end_position - - # Check end of generation status for each tensor - # and update generated sequence lengths - (is_generation_done_tensor, generated_sequence_lengths) = ( - self.update_generation_status( - updated_prompts_tokens=batch_prompt_tokens, - generation_started=generation_started, - current_context_end_position=context_end_position, - is_generation_done_tensor=is_generation_done_tensor, - generated_sequence_lengths=generated_sequence_lengths, - ) - ) - # Boolean flag indicating if all prompts are finished - all_prompts_done = torch.all(is_generation_done_tensor) - if all_prompts_done: - break - - # Include all the generated tokens - batch_prompt_tokens_with_generations = batch_prompt_tokens[:, : (context_end_position + 1)] - if common_inference_params.return_log_probs: - output_log_probs = output_log_probs[:, :context_end_position] - - generated_sequence_lengths[ - generated_sequence_lengths > common_inference_params.num_tokens_to_generate - ] = common_inference_params.num_tokens_to_generate - - for idx, request in enumerate(active_requests.values()): - input_prompt_length = int(prompt_lengths_in_batch[idx]) - # Shorter prompts might have generated more than required tokens. 
So we trim them down - required_sequence_length = int( - min(generated_sequence_lengths[idx], common_inference_params.num_tokens_to_generate) - ) - # Extract only the generated tokens - required_result_tokens = batch_prompt_tokens_with_generations[ - idx, input_prompt_length : (input_prompt_length + required_sequence_length) - ] - - request.generated_length = required_sequence_length - request.generated_tokens = required_result_tokens - request.generated_log_probs = ( - None - if output_log_probs is None - else output_log_probs[idx, input_prompt_length:required_sequence_length] - ) - request.status = Status.COMPLETED - request.generated_text = self.detokenize_generations(required_result_tokens) - - return active_requests - - def prep_model_for_inference( - self, prompts_tokens: torch.Tensor, active_requests: OrderedDict[int, InferenceRequest] - ): - """Preparing batch for inference, using respective wrapper's prep_model_for_inference method - - Args: - prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_sequence_length] - active_requests (OrderedDict[int, InferenceRequest]): The input active requests - """ - self.inference_wrapped_model.prep_model_for_inference(prompts_tokens=prompts_tokens) diff --git a/megatron/core/inference/text_generation_controllers/text_generation_controller.py b/megatron/core/inference/text_generation_controllers/text_generation_controller.py new file mode 100644 index 0000000000000000000000000000000000000000..f15c819c43a5f824224b958e78d2359260a18640 --- /dev/null +++ b/megatron/core/inference/text_generation_controllers/text_generation_controller.py @@ -0,0 +1,400 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from typing import List, OrderedDict, Tuple + +import torch +import torch.nn.functional as F + +from megatron.core import parallel_state +from megatron.core.inference.communication_utils import broadcast_from_last_pipeline_stage +from megatron.core.inference.inference_request import InferenceRequest, Status +from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import ( + AbstractModelInferenceWrapper, +) +from megatron.core.inference.sampling_params import SamplingParams + + +class TextGenerationController: + """The text generation controller (the main sampling loop) + + This class tokenizes the input, runs inference, samples from logits, and detokenizes the output. 
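The sample_from_logits method defined a little further below applies temperature scaling and then either greedy decoding (top_k == 1), top-k masking, or nucleus (top-p) masking before drawing from the resulting distribution. Here is a compact, standalone rendering of that filtering over a toy logits tensor, using plain PyTorch and none of the controller's state; treat it as an illustration of the logic, not the controller's API.

import torch

def sample(last_token_logits: torch.Tensor, temperature=1.0, top_k=0, top_p=0.0) -> torch.Tensor:
    assert not (top_k > 0 and top_p > 0), "top-k and top-p are mutually exclusive"
    if top_k == 1:
        return torch.argmax(last_token_logits, dim=-1)            # greedy
    logits = last_token_logits.clone() / temperature
    if top_k > 1:
        kth = torch.topk(logits, top_k)[0][..., -1, None]
        logits.masked_fill_(logits < kth, float("-inf"))          # keep only the k best
    elif top_p > 0.0:
        sorted_logits, sorted_idx = torch.sort(logits, descending=True)
        cumprobs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
        drop = cumprobs > top_p
        drop[:, 1:] = drop[:, :-1].clone()                        # keep the first token past the threshold
        drop[..., 0] = False                                      # always keep at least one token
        logits.masked_fill_(drop.scatter(1, sorted_idx, drop), float("-inf"))
    return torch.multinomial(logits.softmax(dim=-1), num_samples=1).view(-1)

print(sample(torch.randn(2, 10), top_k=1))                        # greedy token ids, shape [2]
print(sample(torch.randn(2, 10), temperature=0.7, top_p=0.9))     # nucleus sampling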
+ + Args: + inference_wrapped_model (AbstractModelInferenceWrapper): A model that + is wrapped using the specs given in the abstract_model_inference_wrapper.py + tokenizer (_type_): Tokenizer used for tokenizing and detokenizing the prompts + """ + + def __init__(self, inference_wrapped_model: AbstractModelInferenceWrapper, tokenizer): + self.inference_wrapped_model = inference_wrapped_model + self.tokenizer = tokenizer + + # For models without pipeline parallelism, is_first_stage and is_last_stage returns True + self.model_is_pipeline_parallel = not ( + parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage() + ) + + def tokenize_prompt( + self, prompt: str, add_BOS: bool = False + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Utility to tokenize the input prompts + + Args: + prompt (str): The input prompt + + Returns: + torch.Tensor: Returns the tokenized prompt + """ + prompt_tokens = self.tokenizer.tokenize(prompt) + + if add_BOS: + prompt_tokens = [self.tokenizer.bos] + prompt_tokens + + return prompt_tokens + + def detokenize_generations(self, prompt_tokens_with_generated_tokens: torch.Tensor) -> str: + """Detokenize the output generations + + Args: + prompt_tokens_with_generated_tokens (torch.Tensor): The input prompt + tokens plus the generated tokens + + Returns: + str: The detokenized output + """ + tokens = prompt_tokens_with_generated_tokens.cpu().numpy().tolist() + return self.tokenizer.detokenize(tokens) + + def sample_from_logits( + self, + last_token_logits: torch.Tensor, + sampling_params: SamplingParams = None, + vocab_size: int = None, + **kwargs + ) -> torch.Tensor: + """Samples the logits to generate outputs + + Given the logits of the last token, this function samples it + according to the parameters defined in sampling_params + and returns the samples + + Args: + last_token_logits (torch.Tensor): The last token logits. A tensor of + size [batch_size, vocab_size] + sampling_params (SamplingParams): The parameters to use for inference. + vocab_size (int): Obtained from the tokenizer. Defaults to None + + Returns: + torch.Tensor: 1D tensor of the sampled logits with [batch_size] elements + """ + + if kwargs.get('common_inference_params'): + sampling_params = kwargs['common_inference_params'] + + top_p = sampling_params.top_p + top_k = sampling_params.top_k + temperature = sampling_params.temperature + + assert not (top_k > 0 and top_p > 0), 'Cannot have top-p and top-k both greater than zero' + assert top_p <= 1.0, 'top-p should be in (0,1]' + + def modify_logits_for_top_k_filtering(logits, top_k): + """Set the logits for none top-k values to -inf.""" + filter_ = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits.masked_fill_(filter_, float('-Inf')) + + def modify_logits_for_top_p_filtering(logits, top_p): + """Set the logits for none top-p values to -inf.""" + # First sort and calculate cumulative sum of probabilities. + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1) + + # Filteration based on the cumulative sum. + filter_ = cumulative_probs > top_p + # This shift by 1 is weird and I cannot justify it. This existed + # in the original implementation: + # https://github.com/ari-holtzman/degen/blob/master/gen.py + # and I guess it is needed so keeping it for now. + filter_[:, 1:] = filter_[:, :-1].clone() + # Make sure we at least have one token to select from. 
+ filter_[..., 0] = 0 + + # Fill in the filtered part + filter_ = filter_.scatter(1, sorted_indices, filter_) + logits.masked_fill_(filter_, float('-Inf')) + + # Greedy sampling + if top_k == 1: + sampled_logits = torch.argmax(last_token_logits, dim=-1) + else: + last_token_logits = last_token_logits.clone() + if temperature != 1.0: + last_token_logits.div_(temperature) + + if top_k > 1: + assert top_k <= last_token_logits.size(1), 'top-k is larger than logit size.' + if vocab_size: + assert top_k < vocab_size, 'top-k is larger than vocab size.' + modify_logits_for_top_k_filtering(last_token_logits, top_k) + + elif top_p > 0.0: + modify_logits_for_top_p_filtering(last_token_logits, top_p) + + # After filtering, we need to recalculate the distribution. + probabilities = last_token_logits.softmax(dim=-1) + sampled_logits = torch.multinomial(probabilities, num_samples=1).view(-1) + + # If vocab size is provided, make sure the samples are in in the range [0, vocab-size). + if vocab_size: + sampled_logits = torch.clamp(sampled_logits, min=0, max=(vocab_size - 1)) + return sampled_logits + + def update_generation_status( + self, + updated_prompts_tokens: torch.Tensor, + generation_started: torch.Tensor, + current_context_end_position: int, + is_generation_done_tensor: torch.Tensor, + generated_sequence_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Checks which prompts have reached an end condition + + We check which prompts have reached an end condition and set the corresponding + flags of the is_generation_done_tensor to True. The generated sequence lengths + increase as we keep generating, until that prompts hits an end condition. The + generation_started tensor determines which prompts have started generating. + + Args: + updated_prompts_tokens (torch.Tensor): The prompts tokens updated with the latest + generated tokens. A tensor of shape [batch_size, max_seq_len] + (i.e max_seq_len = max_prompt_len + tokens_to_generate) + generation_started (torch.Tensor): A boolean tensor of shape [batch_size]. True + indicates the prompt at that index has started generating tokens. + current_context_end_position (int): An integer indicating which position to + extract from the prompts tokens to get the latest generated tokens. + is_generation_done_tensor (torch.Tensor): A boolean tensor of shape [batch_size]. + True indicates the prompt at that index has reached end condition. + generated_sequence_lengths (torch.Tensor): A int tensor of shape [batch_size]. + Each value represents the generated sequence lengths for that prompt. + + Returns: + Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Returns the boolean + is_generation_done_tensor and the generated_sequence_lengths after updating it + """ + latest_samples = updated_prompts_tokens[:, current_context_end_position] + # Make sure we are checking eod criterion only for prompts that have started generating + # (i.e) We only look at the generated tokenns and not the input tokens. 
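update_generation_status, continued just below, marks a prompt as done once it has started generating and its newest token equals the EOD id, and it stops growing that prompt's generated length from then on. The toy walk-through below reproduces that bookkeeping on two hand-made sequences; eod_id and the tensors are invented for illustration.

import torch

eod_id = 2
tokens = torch.tensor([[5, 7, 2, 0],     # hits EOD at position 2
                       [5, 7, 9, 4]])    # still generating
generation_started = torch.tensor([True, True])
done = torch.zeros(2, dtype=torch.bool)
lengths = torch.zeros(2)

for pos in range(1, tokens.shape[1]):
    latest = tokens[:, pos]
    done = done | ((latest == eod_id) & generation_started)
    lengths += ~done & generation_started   # only count while not finished

print(done)     # tensor([ True, False])
print(lengths)  # tensor([1., 3.])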
+ reached_eod = (latest_samples == self.tokenizer.eod) & generation_started + is_generation_done_tensor = is_generation_done_tensor | reached_eod + # We increment generated sequence lengths when that prompt has not hit the + # EOD and generation has started + generated_sequence_lengths += ~is_generation_done_tensor & generation_started + + return is_generation_done_tensor, generated_sequence_lengths + + def pad_input_prompt_tokens( + self, + batch_prompt_tokens_list: List[List[int]], + max_prompt_length_in_batch: int, + num_tokens_to_generate: int, + ) -> torch.Tensor: + """Method to pad input prompts + + Given a list of prompts, pad them all to uniform length + + Args: + batch_prompt_tokens_list (List[List[int]]): A list containing the prompt tokens + max_prompt_length_in_batch (int): Maximum of the length of the input prompt tokens + num_tokens_togenerate (int): The number of tokens to generate for each prompt + + Returns: + torch.Tensor: A torch tensor of shape [bs, max_seq_len] (i.e) + max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate, + with extra indices for each tensor padded with mask id. + """ + max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate + + for prompt_tokens in batch_prompt_tokens_list: + padding_size = max_seq_len - len(prompt_tokens) + prompt_tokens.extend([self.tokenizer.eod] * padding_size) + + return torch.tensor(batch_prompt_tokens_list).cuda() + + def generate_output_tokens_dynamic_batch( + self, active_requests: OrderedDict[int, InferenceRequest] + ) -> OrderedDict[int, InferenceRequest]: + """Utility to generate the output tokens and probabilities for the prompts + + This utility generates the output tokens for a dynamic batch. It will run one forward step + at a time, and pass control back to the engine, which will update the request pool and call + this method again. + + Args: + active_requests (OrderedDict[int, InferenceRequest]): The input active requests. + + Returns: + OrderedDict[int, InferenceRequest]: The result for each of the incoming requests + after running one forward step. + """ + raise Exception("Not implemented yet") + + def generate_all_output_tokens_static_batch( + self, active_requests: OrderedDict[int, InferenceRequest] + ) -> OrderedDict[int, InferenceRequest]: + """Utility to generate the all the output tokens and probabilities for the prompts . + + This utility generates the output tokens for a static batch. It runs the forward steps till + all prompts complete generation, updates the status of these requests to completed, adds + the generated result and returns these requests + + Args: + active_requests (OrderedDict[int, InferenceRequest]): The input active requests. 
+ + Returns: + OrderedDict[int, InferenceRequest]: The result for each of the incoming requests + """ + batch_prompt_tokens_list = list( + map(lambda request: request.prompt_tokens, active_requests.values()) + ) + prompt_lengths_in_batch = torch.tensor( + [len(prompt_tokens) for prompt_tokens in batch_prompt_tokens_list] + ).cuda() + max_prompt_length_in_batch = max(prompt_lengths_in_batch) + min_prompt_length_in_batch = min(prompt_lengths_in_batch) + + # For batch inference the inference params are the same for all request + sampling_params: SamplingParams = list(active_requests.values())[0].inference_parameters + + # max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate + batch_prompt_tokens = self.pad_input_prompt_tokens( + batch_prompt_tokens_list, + max_prompt_length_in_batch=max_prompt_length_in_batch, + num_tokens_to_generate=sampling_params.num_tokens_to_generate, + ) + batch_size, max_sequence_length = batch_prompt_tokens.shape + + # Pre allocate log probs tensor + output_log_probs = None + if sampling_params.return_log_probs: + output_log_probs = torch.empty( + (batch_size, max_sequence_length - 1), dtype=torch.float32 + ).cuda() + + # An array to check which of the prompts have reached end of generation condition + is_generation_done_tensor = torch.zeros(batch_size, dtype=torch.bool).cuda() + + # An array to act as a counter to keep track of generated sequence lengths + generated_sequence_lengths = torch.zeros(batch_size).cuda() + + with torch.no_grad(): + + self.prep_model_for_inference( + prompts_tokens=batch_prompt_tokens, active_requests=active_requests + ) + + context_start_position = 0 + # Pick the context window that we need to pass through the network. + for context_end_position in range(min_prompt_length_in_batch, max_sequence_length): + + inference_input = self.inference_wrapped_model.get_batch_for_context_window( + context_start_position, context_end_position + ) + + # Returns the final logits of shape [batch_size, context_length, vocab_size] + # Note: This is returned in all TP ranks or last PP stage in PP models + logits = self.inference_wrapped_model.run_one_forward_step(inference_input) + if self.model_is_pipeline_parallel: + context_length = context_end_position - context_start_position + logits = broadcast_from_last_pipeline_stage( + [batch_size, context_length, self.tokenizer.vocab_size], + dtype=self.inference_wrapped_model.inference_wrapper_config.params_dtype, + tensor=logits, + ) + + # Indicates which of the input prompts have started generating tokens. 
+                # A 1D boolean tensor with [batch_size] elements, i.e. the shortest
+                # prompts will start generating first, and so on
+                generation_started = prompt_lengths_in_batch <= context_end_position
+                last_token_logits = logits[:, -1, :]
+                sampled_logits = self.sample_from_logits(
+                    last_token_logits, sampling_params, self.tokenizer.vocab_size
+                )
+
+                # Substitute the sampled logits only for the prompts that
+                # have started generating tokens
+                batch_prompt_tokens[generation_started, context_end_position] = sampled_logits[
+                    generation_started
+                ]
+
+                if sampling_params.return_log_probs:
+                    log_probs = F.log_softmax(logits, dim=2)
+                    indices = torch.unsqueeze(
+                        batch_prompt_tokens[
+                            :, (context_start_position + 1) : (context_end_position + 1)
+                        ],
+                        2,
+                    )
+                    # Get the log probabilities for only the prompt tokens
+                    output_log_probs[:, context_start_position:context_end_position] = torch.gather(
+                        log_probs, 2, indices
+                    ).squeeze(2)
+
+                context_start_position = context_end_position
+
+                # Check the end-of-generation status for each tensor
+                # and update generated sequence lengths
+                (is_generation_done_tensor, generated_sequence_lengths) = (
+                    self.update_generation_status(
+                        updated_prompts_tokens=batch_prompt_tokens,
+                        generation_started=generation_started,
+                        current_context_end_position=context_end_position,
+                        is_generation_done_tensor=is_generation_done_tensor,
+                        generated_sequence_lengths=generated_sequence_lengths,
+                    )
+                )
+                # Boolean flag indicating if all prompts are finished
+                all_prompts_done = torch.all(is_generation_done_tensor)
+                if all_prompts_done:
+                    break
+
+        # Include all the generated tokens
+        batch_prompt_tokens_with_generations = batch_prompt_tokens[:, : (context_end_position + 1)]
+        if sampling_params.return_log_probs:
+            output_log_probs = output_log_probs[:, :context_end_position]
+
+        generated_sequence_lengths[
+            generated_sequence_lengths > sampling_params.num_tokens_to_generate
+        ] = sampling_params.num_tokens_to_generate
+
+        for idx, request in enumerate(active_requests.values()):
+            input_prompt_length = int(prompt_lengths_in_batch[idx])
+            # Shorter prompts might have generated more than required tokens.
So we trim them down + required_sequence_length = int( + min(generated_sequence_lengths[idx], sampling_params.num_tokens_to_generate) + ) + # Extract only the generated tokens + required_result_tokens = batch_prompt_tokens_with_generations[ + idx, input_prompt_length : (input_prompt_length + required_sequence_length) + ] + + request.generated_length = required_sequence_length + request.generated_tokens = required_result_tokens + request.generated_log_probs = ( + None + if output_log_probs is None + else output_log_probs[idx, input_prompt_length:required_sequence_length] + ) + request.status = Status.COMPLETED + request.generated_text = self.detokenize_generations(required_result_tokens) + + return active_requests + + def prep_model_for_inference( + self, prompts_tokens: torch.Tensor, active_requests: OrderedDict[int, InferenceRequest] + ): + """Preparing batch for inference, using respective wrapper's prep_model_for_inference method + + Args: + prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_sequence_length] + active_requests (OrderedDict[int, InferenceRequest]): The input active requests + """ + self.inference_wrapped_model.prep_model_for_inference(prompts_tokens=prompts_tokens) diff --git a/megatron/core/inference/utils.py b/megatron/core/inference/utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference_params.py b/megatron/core/inference_params.py old mode 100755 new mode 100644 diff --git a/megatron/core/jit.py b/megatron/core/jit.py old mode 100755 new mode 100644 index 5b1dfff3e7786af920e99bff9b3491793e5a0c91..c35c41b9fa226b928e7dc35d5dcec95f2b6a6c2c --- a/megatron/core/jit.py +++ b/megatron/core/jit.py @@ -7,4 +7,18 @@ from megatron.core.utils import is_torch_min_version jit_fuser = torch.jit.script # nvFuser is deprecated in PyTorch JIT starting from 2.2 if is_torch_min_version("2.2.0a0"): - jit_fuser = torch.compile + jit_fuser = torch.compile(mode='max-autotune-no-cudagraphs') + +# Decorator to disable Torch Dynamo +# See: https://github.com/NVIDIA/TransformerEngine/issues/308 +no_torch_dynamo = lambda recursive=True: lambda func: func +if torch.__version__ >= "2": + import torch._dynamo + + if torch.__version__ >= "2.1": + no_torch_dynamo = lambda recursive=True: lambda f: torch._dynamo.disable( + f, recursive=recursive + ) + else: + # no "recursive" option in pyTorch 2.0 - it acts as if recursive was True + no_torch_dynamo = lambda recursive=True: torch._dynamo.disable diff --git a/megatron/core/model_parallel_config.py b/megatron/core/model_parallel_config.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/T5/__init__.py b/megatron/core/models/T5/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/T5/t5_model.py b/megatron/core/models/T5/t5_model.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/T5/t5_spec.py b/megatron/core/models/T5/t5_spec.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/__init__.py b/megatron/core/models/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/bert/__init__.py b/megatron/core/models/bert/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/bert/bert_layer_specs.py b/megatron/core/models/bert/bert_layer_specs.py old mode 100755 new mode 100644 index 80893d54aca05467442705bf9a5b7c650985bec0..4edc2ed6285f3ad7b0f4ecc5c66121f804335cc8 --- a/megatron/core/models/bert/bert_layer_specs.py +++ b/megatron/core/models/bert/bert_layer_specs.py @@ -1,4 +1,6 @@ # Copyright (c) 2024, 
NVIDIA CORPORATION. All rights reserved. +import warnings + from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules @@ -28,38 +30,60 @@ try: HAVE_APEX = True LNImpl = FusedLayerNorm except ImportError: - import warnings from megatron.core.transformer.torch_norm import WrappedTorchNorm - warnings.warn(f'Apex is not installed. Falling back to Torch Norm') + warnings.warn('Apex is not installed. Falling back to Torch Norm') LNImpl = WrappedTorchNorm -# Use this spec to use lower level Transformer Engine modules (required for fp8 training) -bert_layer_with_transformer_engine_spec = ModuleSpec( - module=TransformerLayer, - submodules=TransformerLayerSubmodules( - self_attention=ModuleSpec( - module=SelfAttention, - params={"attn_mask_type": AttnMaskType.padding}, - submodules=SelfAttentionSubmodules( - linear_qkv=TELayerNormColumnParallelLinear, - core_attention=TEDotProductAttention, - linear_proj=TERowParallelLinear, - q_layernorm=IdentityOp, - k_layernorm=IdentityOp, + +def get_bert_layer_with_transformer_engine_spec(): + """Use this spec to use lower-level Transformer Engine modules (required for fp8 training). + + Returns: + ModuleSpec: Module specification with TE modules + """ + if not HAVE_TE: + raise ImportError( + "Transformer Engine is not installed. Please use local Bert layer spec instead." + ) + + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.padding}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, + ), ), - ), - self_attn_bda=get_bias_dropout_add, - mlp=ModuleSpec( - module=MLP, - submodules=MLPSubmodules( - linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear + self_attn_bda=get_bias_dropout_add, + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear + ), ), + mlp_bda=get_bias_dropout_add, ), - mlp_bda=get_bias_dropout_add, - ), -) + ) + + +def __getattr__(name): + if name == 'bert_layer_with_transformer_engine_spec': + warnings.warn( + """Attribute bert_layer_specs.bert_layer_with_transformer_engine_spec is on a + deprecation track and will be removed in future releases. 
Please migrate to + bert_layer_specs.get_bert_layer_with_transformer_engine_spec().""" + ) + + return get_bert_layer_with_transformer_engine_spec() + # Use this spec for an implementation using only modules in megatron core bert_layer_local_spec = ModuleSpec( diff --git a/megatron/core/models/bert/bert_lm_head.py b/megatron/core/models/bert/bert_lm_head.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/bert/bert_model.py b/megatron/core/models/bert/bert_model.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/bert/pooler.py b/megatron/core/models/bert/pooler.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/__init__.py b/megatron/core/models/common/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/embeddings/__init__.py b/megatron/core/models/common/embeddings/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/embeddings/language_model_embedding.py b/megatron/core/models/common/embeddings/language_model_embedding.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/embeddings/rope_utils.py b/megatron/core/models/common/embeddings/rope_utils.py old mode 100755 new mode 100644 index f1d7ad48d2b9620fce14e15de512c309cbb02da4..3dd5193ca246d3b648917fa8d33f673a2039ee33 --- a/megatron/core/models/common/embeddings/rope_utils.py +++ b/megatron/core/models/common/embeddings/rope_utils.py @@ -17,23 +17,24 @@ from megatron.core.utils import is_te_min_version logger = logging.getLogger(__name__) +# Prefer fused RoPE from Apex as we need the `transpose_output_memory` argument for the bshd trick. +# See https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/2469. try: - from megatron.core.extensions.transformer_engine import ( - fused_apply_rotary_pos_emb, - fused_apply_rotary_pos_emb_thd, - ) - - HAVE_APPLY_ROPE_FUSION = True + from apex.transformer.functional import fused_apply_rotary_pos_emb except ImportError: try: - from apex.transformer.functional import ( - fused_apply_rotary_pos_emb, - fused_apply_rotary_pos_emb_thd, - ) + from megatron.core.extensions.transformer_engine import fused_apply_rotary_pos_emb + except: + fused_apply_rotary_pos_emb = None + - HAVE_APPLY_ROPE_FUSION = True +try: + from megatron.core.extensions.transformer_engine import fused_apply_rotary_pos_emb_thd +except ImportError: + try: + from apex.transformer.functional import fused_apply_rotary_pos_emb_thd except ImportError: - HAVE_APPLY_ROPE_FUSION = False + fused_apply_rotary_pos_emb_thd = None try: @@ -188,8 +189,10 @@ def apply_rotary_pos_emb( if config.apply_rope_fusion: if cu_seqlens is None: - return fused_apply_rotary_pos_emb(t, freqs) + assert fused_apply_rotary_pos_emb is not None, "apply_rope_fusion is not available." + return fused_apply_rotary_pos_emb(t, freqs, transpose_output_memory=True) else: + assert fused_apply_rotary_pos_emb_thd is not None, "apply_rope_fusion is not available." 
cp_size = parallel_state.get_context_parallel_world_size() if cp_size > 1: if not is_te_min_version("1.11.0", check_equality=False): diff --git a/megatron/core/models/common/embeddings/rotary_pos_embedding.py b/megatron/core/models/common/embeddings/rotary_pos_embedding.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/language_module/__init__.py b/megatron/core/models/common/language_module/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/language_module/language_module.py b/megatron/core/models/common/language_module/language_module.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/vision_module/__init__.py b/megatron/core/models/common/vision_module/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/vision_module/vision_module.py b/megatron/core/models/common/vision_module/vision_module.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/gpt/__init__.py b/megatron/core/models/gpt/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/gpt/gpt_layer_specs.py b/megatron/core/models/gpt/gpt_layer_specs.py old mode 100755 new mode 100644 index 749be324ed62fe9c96efcf126902d448d8313881..d0e48c190cacc27a944e9a4bc3a748e3c4570eb7 --- a/megatron/core/models/gpt/gpt_layer_specs.py +++ b/megatron/core/models/gpt/gpt_layer_specs.py @@ -1,16 +1,16 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +import warnings from typing import Optional from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.models.gpt.moe_module_specs import get_moe_module_spec from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules from megatron.core.transformer.dot_product_attention import DotProductAttention from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.mlp import MLP, MLPSubmodules -from megatron.core.transformer.moe.moe_layer import MoELayer, MoESubmodules -from megatron.core.transformer.moe.shared_experts import SharedExpertMLP from megatron.core.transformer.multi_latent_attention import ( MLASelfAttention, MLASelfAttentionSubmodules, @@ -26,12 +26,10 @@ from megatron.core.utils import is_te_min_version try: from megatron.core.extensions.transformer_engine import ( - TEColumnParallelGroupedLinear, TEColumnParallelLinear, TEDotProductAttention, TELayerNormColumnParallelLinear, TENorm, - TERowParallelGroupedLinear, TERowParallelLinear, ) @@ -47,8 +45,6 @@ try: HAVE_APEX = True LNImpl = FusedLayerNorm except ImportError: - import warnings - from megatron.core.transformer.torch_norm import WrappedTorchNorm warnings.warn('Apex is not installed. Falling back to Torch Norm') @@ -60,7 +56,8 @@ def get_gpt_layer_with_transformer_engine_spec( moe_grouped_gemm: Optional[bool] = False, qk_layernorm: Optional[bool] = False, multi_latent_attention: Optional[bool] = False, - fp8: Optional[str] = None, + fp8: Optional[str] = None, # pylint: disable=unused-arguments + moe_use_legacy_grouped_gemm: Optional[bool] = False, ) -> ModuleSpec: """Use this spec to use lower-level Transformer Engine modules (required for fp8 training). 
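Illustrative aside, not part of the patch: with the signature above, callers that previously selected the MoE linear-layer spec via `fp8` now pass `moe_use_legacy_grouped_gemm`; the deprecated `fp8` argument is still accepted but only emits a warning. A minimal usage sketch, assuming Transformer Engine is installed and using a purely illustrative expert count of 8:

    from megatron.core.models.gpt.gpt_layer_specs import (
        get_gpt_layer_with_transformer_engine_spec,
    )

    # Dense GPT layer spec (num_experts defaults to None).
    dense_spec = get_gpt_layer_with_transformer_engine_spec(qk_layernorm=True)

    # MoE layer spec; grouped GEMM uses TEGroupedMLP unless the legacy path is forced.
    moe_spec = get_gpt_layer_with_transformer_engine_spec(
        num_experts=8,
        moe_grouped_gemm=True,
        moe_use_legacy_grouped_gemm=False,  # set True to force the legacy GroupedMLP
    )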
@@ -69,13 +66,24 @@ def get_gpt_layer_with_transformer_engine_spec( num_experts (int, optional): Number of experts. Defaults to None. moe_grouped_gemm (bool, optional): To use Grouped GEMM. Defaults to False. qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False. - fp8 (str, optional): Flag to decide the linear layer spec for MoE. Defaults to None. + fp8 (str, optional): Deprecated. For temporary Nemo compatibility. + moe_use_legacy_grouped_gemm (bool, optional): Force use the legacy GroupedMLP. + Defaults to False. Returns: ModuleSpec: Module specification with TE modules """ + if fp8 is not None: + warnings.warn( + 'The fp8 argument in "get_gpt_layer_with_transformer_engine_spec" has been deprecated' + ' and will be removed soon. Please update your code accordingly.' + ) + mlp = _get_mlp_module_spec( - use_te=True, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm, fp8=fp8 + use_te=True, + num_experts=num_experts, + moe_grouped_gemm=moe_grouped_gemm, + moe_use_legacy_grouped_gemm=moe_use_legacy_grouped_gemm, ) if multi_latent_attention: @@ -138,6 +146,8 @@ def get_gpt_layer_local_spec( moe_grouped_gemm: Optional[bool] = False, qk_layernorm: Optional[bool] = False, multi_latent_attention: Optional[bool] = False, + fp8: Optional[str] = None, # pylint: disable=unused-arguments + moe_use_legacy_grouped_gemm: Optional[bool] = False, ) -> ModuleSpec: """Use this spec for an implementation using only modules in Megatron-Core. @@ -146,13 +156,24 @@ def get_gpt_layer_local_spec( num_experts (int, optional): Number of experts. Defaults to None. moe_grouped_gemm (bool, optional): To use Grouped GEMM. Defaults to False. qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False. + fp8 (str, optional): Deprecated. For temporary Nemo compatibility. + moe_use_legacy_grouped_gemm (bool, optional): Force use the legacy GroupedMLP. + Defaults to False. Returns: ModuleSpec: Module specification with Megatron-Core modules """ + if fp8 is not None: + warnings.warn( + 'The fp8 argument in "get_gpt_layer_local_spec" has been deprecated' + ' and will be removed soon. Please update your code accordingly.' + ) mlp = _get_mlp_module_spec( - use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm + use_te=False, + num_experts=num_experts, + moe_grouped_gemm=moe_grouped_gemm, + moe_use_legacy_grouped_gemm=moe_use_legacy_grouped_gemm, ) if multi_latent_attention: @@ -213,63 +234,33 @@ def _get_mlp_module_spec( use_te: Optional[bool] = True, num_experts: Optional[int] = None, moe_grouped_gemm: Optional[bool] = False, - fp8: Optional[str] = None, + fp8: Optional[str] = None, # pylint: disable=unused-arguments + moe_use_legacy_grouped_gemm: Optional[bool] = False, ) -> ModuleSpec: - """Helper function to get module spec for MLP""" - if num_experts is not None: - moe_spec = _get_moe_module_spec( - use_te=True, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm, fp8=fp8 + """Helper function to get module spec for MLP/MoE""" + if fp8 is not None: + warnings.warn( + 'The fp8 argument in "_get_mlp_module_spec" has been deprecated' + ' and will be removed soon. Please update your code accordingly.' 
) - return moe_spec - - return ModuleSpec( - module=MLP, - submodules=MLPSubmodules( - linear_fc1=TELayerNormColumnParallelLinear if use_te else ColumnParallelLinear, - linear_fc2=TERowParallelLinear if use_te else RowParallelLinear, - ), - ) - -def _get_moe_module_spec( - use_te: Optional[bool] = True, - num_experts: Optional[int] = None, - moe_grouped_gemm: Optional[bool] = False, - fp8: Optional[str] = None, -) -> ModuleSpec: - """Helper function to get module spec for MoE""" if num_experts is None: - return None - if use_te and moe_grouped_gemm: - linear_fc1 = TEColumnParallelGroupedLinear - linear_fc2 = TERowParallelGroupedLinear - elif use_te and fp8: - linear_fc1 = TEColumnParallelLinear - linear_fc2 = TERowParallelLinear - else: - linear_fc1 = ColumnParallelLinear - linear_fc2 = RowParallelLinear - - use_te_grouped_gemm = use_te and TEColumnParallelGroupedLinear is not None - - return ModuleSpec( - module=MoELayer, - submodules=MoESubmodules( - experts=( - MLPSubmodules(linear_fc1=linear_fc1, linear_fc2=linear_fc2) - if not moe_grouped_gemm or use_te_grouped_gemm - else None - ), - shared_experts=ModuleSpec( - module=SharedExpertMLP, - params={"gate": False}, - submodules=MLPSubmodules( - linear_fc1=TEColumnParallelLinear if use_te else ColumnParallelLinear, - linear_fc2=TERowParallelLinear if use_te else RowParallelLinear, - ), + # Dense MLP w/ or w/o TE modules. + return ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TELayerNormColumnParallelLinear if use_te else ColumnParallelLinear, + linear_fc2=TERowParallelLinear if use_te else RowParallelLinear, ), - ), - ) + ) + else: + # Mixture of experts with modules in megatron core. + return get_moe_module_spec( + use_te=use_te, + num_experts=num_experts, + moe_grouped_gemm=moe_grouped_gemm, + moe_use_legacy_grouped_gemm=moe_use_legacy_grouped_gemm, + ) def get_gpt_decoder_block_spec( @@ -288,7 +279,7 @@ def get_gpt_decoder_block_spec( moe_grouped_gemm=False, qk_layernorm=config.qk_layernorm, multi_latent_attention=config.multi_latent_attention, - fp8=config.fp8, + moe_use_legacy_grouped_gemm=config.moe_use_legacy_grouped_gemm, ) if use_transformer_engine else get_gpt_layer_local_spec( @@ -296,6 +287,7 @@ def get_gpt_decoder_block_spec( moe_grouped_gemm=False, qk_layernorm=config.qk_layernorm, multi_latent_attention=config.multi_latent_attention, + moe_use_legacy_grouped_gemm=config.moe_use_legacy_grouped_gemm, ) ) moe_layer_spec = ( @@ -304,7 +296,7 @@ def get_gpt_decoder_block_spec( moe_grouped_gemm=config.moe_grouped_gemm, qk_layernorm=config.qk_layernorm, multi_latent_attention=config.multi_latent_attention, - fp8=config.fp8, + moe_use_legacy_grouped_gemm=config.moe_use_legacy_grouped_gemm, ) if use_transformer_engine else get_gpt_layer_local_spec( @@ -312,6 +304,7 @@ def get_gpt_decoder_block_spec( moe_grouped_gemm=config.moe_grouped_gemm, qk_layernorm=config.qk_layernorm, multi_latent_attention=config.multi_latent_attention, + moe_use_legacy_grouped_gemm=config.moe_use_legacy_grouped_gemm, ) ) diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/gpt/moe_module_specs.py b/megatron/core/models/gpt/moe_module_specs.py new file mode 100644 index 0000000000000000000000000000000000000000..513eeddc7e3a12824d97fd12b3b66a644c3ecee7 --- /dev/null +++ b/megatron/core/models/gpt/moe_module_specs.py @@ -0,0 +1,81 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
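Illustrative aside, not part of the patch: `_get_mlp_module_spec` above now returns a plain MLP spec when `num_experts` is None and otherwise delegates to `get_moe_module_spec`, defined in the new `moe_module_specs.py` whose contents follow. A minimal sketch of calling the new helper directly, assuming Transformer Engine is installed and an illustrative 8-expert configuration:

    from megatron.core.models.gpt.moe_module_specs import get_moe_module_spec

    # MoE spec with grouped-GEMM experts plus a shared-expert MLP.
    moe_spec = get_moe_module_spec(
        use_te=True,
        num_experts=8,
        moe_grouped_gemm=True,
        moe_use_legacy_grouped_gemm=False,
    )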
+ +import warnings +from typing import Optional + +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer.mlp import MLPSubmodules +from megatron.core.transformer.moe.experts import GroupedMLP, SequentialMLP, TEGroupedMLP +from megatron.core.transformer.moe.moe_layer import MoELayer, MoESubmodules +from megatron.core.transformer.moe.shared_experts import SharedExpertMLP +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.utils import get_te_version, is_te_min_version + +try: + from megatron.core.extensions.transformer_engine import ( + TEColumnParallelGroupedLinear, + TEColumnParallelLinear, + TERowParallelGroupedLinear, + TERowParallelLinear, + ) + + HAVE_TE = True +except ImportError: + HAVE_TE = False + + +def get_moe_module_spec( + use_te: Optional[bool] = True, + num_experts: Optional[int] = None, + moe_grouped_gemm: Optional[bool] = False, + moe_use_legacy_grouped_gemm: Optional[bool] = False, +) -> ModuleSpec: + """Helper function to get module spec for MoE""" + assert num_experts is not None + + mlp = MLPSubmodules( + linear_fc1=TEColumnParallelLinear if use_te else ColumnParallelLinear, + linear_fc2=TERowParallelLinear if use_te else RowParallelLinear, + ) + + # experts spec + if moe_grouped_gemm: + ## use GroupedMLP + if use_te and TEColumnParallelGroupedLinear is not None and not moe_use_legacy_grouped_gemm: + ## use TEGroupedLinear + expert_module = TEGroupedMLP + expert_submodule = MLPSubmodules( + linear_fc1=TEColumnParallelGroupedLinear, linear_fc2=TERowParallelGroupedLinear + ) + else: + ## use legacy GroupedMLP + expert_module = GroupedMLP + expert_submodule = None + warnings.warn( + 'The legacy GroupedMLP will be deprecated in Megatron-Core v0.12.0. ' + 'Please update the TransformerEngine to version>=1.7.0 and use TEGroupedMLP.' + ) + else: + ## use SequentialMLP + expert_module = SequentialMLP + if use_te and not is_te_min_version("1.7.0.dev0"): + warnings.warn( + "Only transformer-engine>=1.7.0 supports MoE experts, " + f"but your version is {get_te_version()}. Use local linear implementation instead." 
+ ) + expert_submodule = MLPSubmodules( + linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear + ) + else: + expert_submodule = mlp + + experts = ModuleSpec(module=expert_module, submodules=expert_submodule) + + # shared experts spec + shared_experts = ModuleSpec(module=SharedExpertMLP, params={"gate": False}, submodules=mlp) + + # MoE module spec + moe_module_spec = ModuleSpec( + module=MoELayer, submodules=MoESubmodules(experts=experts, shared_experts=shared_experts) + ) + return moe_module_spec diff --git a/megatron/core/models/mamba/__init__.py b/megatron/core/models/mamba/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/mamba/mamba_layer_specs.py b/megatron/core/models/mamba/mamba_layer_specs.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/mamba/mamba_model.py b/megatron/core/models/mamba/mamba_model.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/multimodal/__init__.py b/megatron/core/models/multimodal/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/multimodal/llava_model.py b/megatron/core/models/multimodal/llava_model.py old mode 100755 new mode 100644 index dafe377456d81eea1973328de8dfe60da6761d5c..3de68b5091719cfd3638cf38ea37ab1963be2fda --- a/megatron/core/models/multimodal/llava_model.py +++ b/megatron/core/models/multimodal/llava_model.py @@ -36,6 +36,7 @@ IGNORE_INDEX = -100 # ID for labels that should be ignored. # Image token index can be tokenizer dependent so the default value does not work in all cases. DEFAULT_IMAGE_TOKEN_INDEX = -200 IMAGE_TOKEN = "" +VIDEO_TOKEN = "