more fixes

2026-05-14 19:35:39 +00:00
parent 3be051e140
commit 1ceff541b0
3 changed files with 18 additions and 19 deletions
--- a/22
+++ b/22
@@ -15,6 +15,8 @@ ENV TORCH_CUDA_ARCH_LIST="10.0"
 # Clone latest CUTLASS (has NVFP4 block-scaled MMA support)
 RUN git clone --depth 1 https://github.com/NVIDIA/cutlass.git /root/cutlass

+ARG CACHE_BUSTER=${TIMESTAMP}
+
 # Copy and install the NVFP4 mega_moe kernel (from this repo)
 COPY src/ /root/nvfp4-megamoe-kernel/src/
 COPY pyproject.toml /root/nvfp4-megamoe-kernel/pyproject.toml
@@ -32,18 +34,16 @@ RUN pip install tilelang

 ENV PYTHONPATH="/root/nvfp4-megamoe-kernel/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm:/root/nvfp4-megamoe-kernel:${PYTHONPATH}"

-# Copy vLLM patches
-COPY vllm/patches/deepseek_v4.py /tmp/patches/deepseek_v4.py
-COPY vllm/patches/staging_kernel.py /tmp/patches/staging_kernel.py
-COPY vllm/patches/deepseek_v4_attention.py /tmp/patches/deepseek_v4_attention.py
+# Patch vLLM — overwrite model files and register architecture
+ARG VLLM_MODELS_DIR=/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models
+ARG VLLM_LAYERS_DIR=/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers

-# Apply patches
-RUN VLLM_MODELS_DIR=$(python3 -c "import vllm.model_executor.models; import os; print(os.path.dirname(vllm.model_executor.models.__file__))") && \
-    VLLM_LAYERS_DIR=$(python3 -c "import vllm.model_executor.layers; import os; print(os.path.dirname(vllm.model_executor.layers.__file__))") && \
-    cp /tmp/patches/deepseek_v4.py "$VLLM_MODELS_DIR/deepseek_v4.py" && \
-    cp /tmp/patches/staging_kernel.py "$VLLM_MODELS_DIR/staging_kernel.py" && \
-    cp /tmp/patches/deepseek_v4_attention.py "$VLLM_LAYERS_DIR/deepseek_v4_attention.py" && \
-    rm -rf /tmp/patches
+COPY vllm/patches/deepseek_v4.py ${VLLM_MODELS_DIR}/deepseek_v4.py
+COPY vllm/patches/staging_kernel.py ${VLLM_MODELS_DIR}/staging_kernel.py
+COPY vllm/patches/deepseek_v4_attention.py ${VLLM_LAYERS_DIR}/deepseek_v4_attention.py
+# Register DeepseekV4ForCausalLM after the V3.2 entry; grep fails the build if sed matched nothing
+RUN sed -i 's/"DeepseekV32ForCausalLM": ("deepseek_v2", "DeepseekV3ForCausalLM"),/"DeepseekV32ForCausalLM": ("deepseek_v2", "DeepseekV3ForCausalLM"),\n    "DeepseekV4ForCausalLM": ("deepseek_v4", "DeepseekV4ForCausalLM"),/' "${VLLM_MODELS_DIR}/registry.py" && \
+    grep -qF '"DeepseekV4ForCausalLM": ("deepseek_v4", "DeepseekV4ForCausalLM"),' "${VLLM_MODELS_DIR}/registry.py"

 # Verify
 RUN python3 -c "import torch; import cutlass_nvfp4_gemm._C; print('CUTLASS NVFP4 OK')" && \
--- a/build_and_run.sh
+++ b/build_and_run.sh
@@ -3,17 +3,20 @@ set -euo pipefail

 cd "$(dirname "$0")"

-# Bust any ARG cache busters in Dockerfile by replacing with timestamp
+# Restore the Dockerfile placeholder on any exit (even a failed build) so git diff stays clean
+trap 'sed -i -E "s/ARG CACHE_BUSTER=.*/ARG CACHE_BUSTER=\${TIMESTAMP}/" Dockerfile' EXIT
+
+# Bust the Docker build cache by injecting a timestamp
 TIMESTAMP=$(date +%s)
-sed -i -E "s/ARG [A-Z_]+CACHE_BUSTER=.*/ARG CACHE_BUSTER=${TIMESTAMP}/" Dockerfile
+sed -i -E "s/ARG CACHE_BUSTER=.*/ARG CACHE_BUSTER=${TIMESTAMP}/" Dockerfile

 echo "=== Stopping existing container ==="
 docker compose down 2>/dev/null || true

-echo "=== Building (no cache) ==="
-docker compose build --no-cache
+echo "=== Building ==="
+docker compose build

 echo "=== Starting ==="
 docker compose up -d

 echo "=== Done. Container: $(docker compose ps -q) ==="
--- a/vllm/patches/deepseek_v4.py
+++ b/vllm/patches/deepseek_v4.py
@@ -2214,7 +2214,3 @@ class DeepseekV4ForCausalLM(nn.Module):
    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
        return self.model.get_expert_mapping()

-
-# Register model architecture with vLLM
-from vllm.model_executor.models import ModelRegistry
-ModelRegistry.register_model("DeepseekV4ForCausalLM", DeepseekV4ForCausalLM)