diff --git a/docker/Dockerfile b/docker/Dockerfile index 71cef521b..cc2ccc11c 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -582,7 +582,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # This is ~1.1GB and only changes when FlashInfer version bumps # https://docs.flashinfer.ai/installation.html # From versions.json: .flashinfer.version -ARG FLASHINFER_VERSION=0.6.3 +ARG FLASHINFER_VERSION=0.6.4 RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system flashinfer-cubin==${FLASHINFER_VERSION} \ && uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \ diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch index b4d590016..6f6f147c4 100644 --- a/docker/Dockerfile.nightly_torch +++ b/docker/Dockerfile.nightly_torch @@ -217,13 +217,13 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2. # build flashinfer for torch nightly from source around 10 mins -# release version: v0.6.3 +# release version: v0.6.4 # todo(elainewy): cache flashinfer build result for faster build ENV CCACHE_DIR=/root/.cache/ccache RUN --mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/uv \ echo "git clone flashinfer..." \ - && git clone --depth 1 --branch v0.6.3 --recursive https://github.com/flashinfer-ai/flashinfer.git \ + && git clone --depth 1 --branch v0.6.4 --recursive https://github.com/flashinfer-ai/flashinfer.git \ && cd flashinfer \ && git submodule update --init --recursive \ && echo "finish git clone flashinfer..." \ diff --git a/docker/versions.json b/docker/versions.json index 6277e0b6f..24f4b6e7d 100644 --- a/docker/versions.json +++ b/docker/versions.json @@ -68,7 +68,7 @@ "default": "true" }, "FLASHINFER_VERSION": { - "default": "0.6.3" + "default": "0.6.4" }, "GDRCOPY_CUDA_VERSION": { "default": "12.8" diff --git a/requirements/cuda.txt b/requirements/cuda.txt index 15e4ebbf4..84fe34730 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -10,4 +10,4 @@ torchaudio==2.10.0 # These must be updated alongside torch torchvision==0.25.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version # FlashInfer should be updated together with the Dockerfile -flashinfer-python==0.6.3 +flashinfer-python==0.6.4 diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index e67a77005..27cf3a792 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -536,34 +536,12 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): ) -class DeepseekV3ForCausalLM(VerifyAndUpdateConfig): - @classmethod - def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None: - """Disable AR-RMS-Quant fusion for DeepSeekV3 in NVFP4""" - # TODO: https://github.com/vllm-project/vllm/issues/34395 - - # disable AR-rms-fp4 fusion for DSv3+ - ar_rms_enabled = vllm_config.compilation_config.pass_config.fuse_allreduce_rms - nvfp4 = vllm_config.model_config.is_nvfp4_quantized() - - # Disable by default, warn if manually enabled: - if ar_rms_enabled is None and nvfp4: - vllm_config.compilation_config.pass_config.fuse_allreduce_rms = False - if ar_rms_enabled and nvfp4: - logger.warning( - "Allreduce-rms fusion broken for DeepSeekV3 with NVFP4 quant," - "see https://github.com/vllm-project/vllm/issues/34395." - ) - - -class DeepseekV32ForCausalLM(DeepseekV3ForCausalLM): +class DeepseekV32ForCausalLM(VerifyAndUpdateConfig): @classmethod def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None: """ Updated fp8 cache to custom "fp8_ds_mla" format for DeepSeekV32 """ - super().verify_and_update_config(vllm_config) - hf_config = vllm_config.model_config.hf_config # Mirror the check in vllm/model_executor/models/deepseek_v2.py @@ -654,7 +632,6 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = { "MambaForCausalLM": MambaModelConfig, "Mamba2ForCausalLM": MambaModelConfig, "FalconMambaForCausalLM": MambaModelConfig, - "DeepseekV3ForCausalLM": DeepseekV3ForCausalLM, "DeepseekV32ForCausalLM": DeepseekV32ForCausalLM, "NemotronHForCausalLM": NemotronHForCausalLMConfig, "NemotronHPuzzleForCausalLM": NemotronHForCausalLMConfig,