Unverified commit ea5f903f, authored by Wei Zhao and committed by GitHub
Browse files

Bump Flashinfer Version and Re-enable DeepSeek NVFP4 AR+Norm Fusion (#34899)


Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
parent 0632ed87
...@@ -582,7 +582,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -582,7 +582,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# This is ~1.1GB and only changes when FlashInfer version bumps # This is ~1.1GB and only changes when FlashInfer version bumps
# https://docs.flashinfer.ai/installation.html # https://docs.flashinfer.ai/installation.html
# From versions.json: .flashinfer.version # From versions.json: .flashinfer.version
ARG FLASHINFER_VERSION=0.6.3 ARG FLASHINFER_VERSION=0.6.4
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system flashinfer-cubin==${FLASHINFER_VERSION} \ uv pip install --system flashinfer-cubin==${FLASHINFER_VERSION} \
&& uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \ && uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \
......
...@@ -217,13 +217,13 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2. ...@@ -217,13 +217,13 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
# build flashinfer for torch nightly from source around 10 mins # build flashinfer for torch nightly from source around 10 mins
# release version: v0.6.3 # release version: v0.6.4
# todo(elainewy): cache flashinfer build result for faster build # todo(elainewy): cache flashinfer build result for faster build
ENV CCACHE_DIR=/root/.cache/ccache ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \ RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/uv \ --mount=type=cache,target=/root/.cache/uv \
echo "git clone flashinfer..." \ echo "git clone flashinfer..." \
&& git clone --depth 1 --branch v0.6.3 --recursive https://github.com/flashinfer-ai/flashinfer.git \ && git clone --depth 1 --branch v0.6.4 --recursive https://github.com/flashinfer-ai/flashinfer.git \
&& cd flashinfer \ && cd flashinfer \
&& git submodule update --init --recursive \ && git submodule update --init --recursive \
&& echo "finish git clone flashinfer..." \ && echo "finish git clone flashinfer..." \
......
...@@ -68,7 +68,7 @@ ...@@ -68,7 +68,7 @@
"default": "true" "default": "true"
}, },
"FLASHINFER_VERSION": { "FLASHINFER_VERSION": {
"default": "0.6.3" "default": "0.6.4"
}, },
"GDRCOPY_CUDA_VERSION": { "GDRCOPY_CUDA_VERSION": {
"default": "12.8" "default": "12.8"
......
...@@ -10,4 +10,4 @@ torchaudio==2.10.0 ...@@ -10,4 +10,4 @@ torchaudio==2.10.0
# These must be updated alongside torch # These must be updated alongside torch
torchvision==0.25.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version torchvision==0.25.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
# FlashInfer should be updated together with the Dockerfile # FlashInfer should be updated together with the Dockerfile
flashinfer-python==0.6.3 flashinfer-python==0.6.4
...@@ -536,34 +536,12 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): ...@@ -536,34 +536,12 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
) )
class DeepseekV3ForCausalLM(VerifyAndUpdateConfig): class DeepseekV32ForCausalLM(VerifyAndUpdateConfig):
@classmethod
def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
"""Disable AR-RMS-Quant fusion for DeepSeekV3 in NVFP4"""
# TODO: https://github.com/vllm-project/vllm/issues/34395
# disable AR-rms-fp4 fusion for DSv3+
ar_rms_enabled = vllm_config.compilation_config.pass_config.fuse_allreduce_rms
nvfp4 = vllm_config.model_config.is_nvfp4_quantized()
# Disable by default, warn if manually enabled:
if ar_rms_enabled is None and nvfp4:
vllm_config.compilation_config.pass_config.fuse_allreduce_rms = False
if ar_rms_enabled and nvfp4:
logger.warning(
"Allreduce-rms fusion broken for DeepSeekV3 with NVFP4 quant,"
"see https://github.com/vllm-project/vllm/issues/34395."
)
class DeepseekV32ForCausalLM(DeepseekV3ForCausalLM):
@classmethod @classmethod
def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None: def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
""" """
Updated fp8 cache to custom "fp8_ds_mla" format for DeepSeekV32 Updated fp8 cache to custom "fp8_ds_mla" format for DeepSeekV32
""" """
super().verify_and_update_config(vllm_config)
hf_config = vllm_config.model_config.hf_config hf_config = vllm_config.model_config.hf_config
# Mirror the check in vllm/model_executor/models/deepseek_v2.py # Mirror the check in vllm/model_executor/models/deepseek_v2.py
...@@ -654,7 +632,6 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = { ...@@ -654,7 +632,6 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
"MambaForCausalLM": MambaModelConfig, "MambaForCausalLM": MambaModelConfig,
"Mamba2ForCausalLM": MambaModelConfig, "Mamba2ForCausalLM": MambaModelConfig,
"FalconMambaForCausalLM": MambaModelConfig, "FalconMambaForCausalLM": MambaModelConfig,
"DeepseekV3ForCausalLM": DeepseekV3ForCausalLM,
"DeepseekV32ForCausalLM": DeepseekV32ForCausalLM, "DeepseekV32ForCausalLM": DeepseekV32ForCausalLM,
"NemotronHForCausalLM": NemotronHForCausalLMConfig, "NemotronHForCausalLM": NemotronHForCausalLMConfig,
"NemotronHPuzzleForCausalLM": NemotronHForCausalLMConfig, "NemotronHPuzzleForCausalLM": NemotronHForCausalLMConfig,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment