Patch vLLM in place via COPY, register DeepseekV4ForCausalLM in registry.py,
and replace `--no-cache` builds with a single cache-bust point.

Review changes relative to the first version of this patch:
  * The cache buster is passed with `--build-arg` instead of rewriting the
    tracked Dockerfile with sed, so a failed build cannot leave it modified.
  * The registry.py edit is verified with grep, because sed exits 0 even when
    the anchor line is not found.
  * The hard-coded vLLM layers directory is checked against the installed package.
  * `index` lines are omitted for the two files whose post-image changed.

diff --git a/Dockerfile b/Dockerfile
--- a/Dockerfile
+++ b/Dockerfile
@@ -15,6 +15,10 @@ ENV TORCH_CUDA_ARCH_LIST="10.0"
 # Clone latest CUTLASS (has NVFP4 block-scaled MMA support)
 RUN git clone --depth 1 https://github.com/NVIDIA/cutlass.git /root/cutlass
 
+# Cache-bust point: build_and_run.sh passes --build-arg CACHE_BUSTER=<timestamp>,
+# so RUN steps below this line re-execute on every scripted build.
+ARG CACHE_BUSTER=0
+
 # Copy and install the NVFP4 mega_moe kernel (from this repo)
 COPY src/ /root/nvfp4-megamoe-kernel/src/
 COPY pyproject.toml /root/nvfp4-megamoe-kernel/pyproject.toml
@@ -32,18 +36,24 @@ RUN pip install tilelang
 
 ENV PYTHONPATH="/root/nvfp4-megamoe-kernel/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm:/root/nvfp4-megamoe-kernel:${PYTHONPATH}"
 
-# Copy vLLM patches
-COPY vllm/patches/deepseek_v4.py /tmp/patches/deepseek_v4.py
-COPY vllm/patches/staging_kernel.py /tmp/patches/staging_kernel.py
-COPY vllm/patches/deepseek_v4_attention.py /tmp/patches/deepseek_v4_attention.py
+# Patch vLLM — overwrite model files and register architecture.
+# COPY cannot resolve the install location at build time, so the vLLM package
+# directories are ARGs; override them with --build-arg if the base image differs.
+ARG VLLM_MODELS_DIR=/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models
+ARG VLLM_LAYERS_DIR=/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers
 
-# Apply patches
-RUN VLLM_MODELS_DIR=$(python3 -c "import vllm.model_executor.models; import os; print(os.path.dirname(vllm.model_executor.models.__file__))") && \
-    VLLM_LAYERS_DIR=$(python3 -c "import vllm.model_executor.layers; import os; print(os.path.dirname(vllm.model_executor.layers.__file__))") && \
-    cp /tmp/patches/deepseek_v4.py "$VLLM_MODELS_DIR/deepseek_v4.py" && \
-    cp /tmp/patches/staging_kernel.py "$VLLM_MODELS_DIR/staging_kernel.py" && \
-    cp /tmp/patches/deepseek_v4_attention.py "$VLLM_LAYERS_DIR/deepseek_v4_attention.py" && \
-    rm -rf /tmp/patches
+COPY vllm/patches/deepseek_v4.py ${VLLM_MODELS_DIR}/deepseek_v4.py
+COPY vllm/patches/staging_kernel.py ${VLLM_MODELS_DIR}/staging_kernel.py
+COPY vllm/patches/deepseek_v4_attention.py ${VLLM_LAYERS_DIR}/deepseek_v4_attention.py
+
+# Register DeepseekV4ForCausalLM next to the DeepseekV32 entry in vLLM's registry.
+# Fail the build if the ARG paths do not point at the real vLLM package or if the
+# anchor line is missing (sed exits 0 even when nothing was substituted).
+RUN test -f "${VLLM_LAYERS_DIR}/__init__.py" && \
+    sed -i 's/"DeepseekV32ForCausalLM": ("deepseek_v2", "DeepseekV3ForCausalLM"),/&\n    "DeepseekV4ForCausalLM": ("deepseek_v4", "DeepseekV4ForCausalLM"),/' \
+        "${VLLM_MODELS_DIR}/registry.py" && \
+    grep -qF '"DeepseekV4ForCausalLM": ("deepseek_v4", "DeepseekV4ForCausalLM"),' \
+        "${VLLM_MODELS_DIR}/registry.py"
 
 # Verify
 RUN python3 -c "import torch; import cutlass_nvfp4_gemm._C; print('CUTLASS NVFP4 OK')" && \
diff --git a/build_and_run.sh b/build_and_run.sh
--- a/build_and_run.sh
+++ b/build_and_run.sh
@@ -3,17 +3,18 @@ set -euo pipefail
 
 cd "$(dirname "$0")"
 
-# Bust any ARG cache busters in Dockerfile by replacing with timestamp
+# Bust the Docker build cache from the CACHE_BUSTER ARG onward by passing a
+# fresh timestamp as a build arg. The tracked Dockerfile is never edited, so
+# there is nothing to restore if the build fails under `set -e`.
 TIMESTAMP=$(date +%s)
-sed -i -E "s/ARG [A-Z_]+CACHE_BUSTER=.*/ARG CACHE_BUSTER=${TIMESTAMP}/" Dockerfile
 
 echo "=== Stopping existing container ==="
 docker compose down 2>/dev/null || true
 
-echo "=== Building (no cache) ==="
-docker compose build --no-cache
+echo "=== Building ==="
+docker compose build --build-arg "CACHE_BUSTER=${TIMESTAMP}"
 
 echo "=== Starting ==="
 docker compose up -d
 
 echo "=== Done. Container: $(docker compose ps -q) ==="
diff --git a/vllm/patches/deepseek_v4.py b/vllm/patches/deepseek_v4.py
index 2d76d299..6db66b6b 100644
--- a/vllm/patches/deepseek_v4.py
+++ b/vllm/patches/deepseek_v4.py
@@ -2214,7 +2214,3 @@ class DeepseekV4ForCausalLM(nn.Module):
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
         return self.model.get_expert_mapping()
-
-# Register model architecture with vLLM
-from vllm.model_executor.models import ModelRegistry
-ModelRegistry.register_model("DeepseekV4ForCausalLM", DeepseekV4ForCausalLM)