diff --git a/Dockerfile b/Dockerfile
index 2c47495e..fe4962ff 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -15,6 +15,10 @@ ENV TORCH_CUDA_ARCH_LIST="10.0"
 # Clone latest CUTLASS (has NVFP4 block-scaled MMA support)
 RUN git clone --depth 1 https://github.com/NVIDIA/cutlass.git /root/cutlass
 
+# Cache-buster placeholder: build_and_run.sh rewrites the value below with the
+# current timestamp before building and puts the literal ${TIMESTAMP} back afterwards.
+ARG CACHE_BUSTER=${TIMESTAMP}
+
 # Copy and install the NVFP4 mega_moe kernel (from this repo)
 COPY src/ /root/nvfp4-megamoe-kernel/src/
 COPY pyproject.toml /root/nvfp4-megamoe-kernel/pyproject.toml
@@ -32,18 +36,23 @@ RUN pip install tilelang
 
 ENV PYTHONPATH="/root/nvfp4-megamoe-kernel/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm:/root/nvfp4-megamoe-kernel:${PYTHONPATH}"
 
-# Copy vLLM patches
-COPY vllm/patches/deepseek_v4.py /tmp/patches/deepseek_v4.py
-COPY vllm/patches/staging_kernel.py /tmp/patches/staging_kernel.py
-COPY vllm/patches/deepseek_v4_attention.py /tmp/patches/deepseek_v4_attention.py
+# Patch vLLM — overwrite model files and register architecture
+# These defaults assume the Python 3.12 dist-packages layout; override them
+# with --build-arg when the base image installs vLLM elsewhere.
+ARG VLLM_MODELS_DIR=/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models
+ARG VLLM_LAYERS_DIR=/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers
 
-# Apply patches
-RUN VLLM_MODELS_DIR=$(python3 -c "import vllm.model_executor.models; import os; print(os.path.dirname(vllm.model_executor.models.__file__))") && \
-    VLLM_LAYERS_DIR=$(python3 -c "import vllm.model_executor.layers; import os; print(os.path.dirname(vllm.model_executor.layers.__file__))") && \
-    cp /tmp/patches/deepseek_v4.py "$VLLM_MODELS_DIR/deepseek_v4.py" && \
-    cp /tmp/patches/staging_kernel.py "$VLLM_MODELS_DIR/staging_kernel.py" && \
-    cp /tmp/patches/deepseek_v4_attention.py "$VLLM_LAYERS_DIR/deepseek_v4_attention.py" && \
-    rm -rf /tmp/patches
+COPY vllm/patches/deepseek_v4.py ${VLLM_MODELS_DIR}/deepseek_v4.py
+COPY vllm/patches/staging_kernel.py ${VLLM_MODELS_DIR}/staging_kernel.py
+COPY vllm/patches/deepseek_v4_attention.py ${VLLM_LAYERS_DIR}/deepseek_v4_attention.py
+
+# Insert the DeepseekV4ForCausalLM entry right after the DeepseekV32ForCausalLM
+# line in registry.py. sed exits 0 even when nothing matches, so grep afterwards
+# to fail the build if the entry was not actually added.
+RUN sed -i 's/"DeepseekV32ForCausalLM": ("deepseek_v2", "DeepseekV3ForCausalLM"),/"DeepseekV32ForCausalLM": ("deepseek_v2", "DeepseekV3ForCausalLM"),\n    "DeepseekV4ForCausalLM": ("deepseek_v4", "DeepseekV4ForCausalLM"),/' \
+    "${VLLM_MODELS_DIR}/registry.py" && \
+    grep -qF '"DeepseekV4ForCausalLM": ("deepseek_v4", "DeepseekV4ForCausalLM")' \
+    "${VLLM_MODELS_DIR}/registry.py"
 
 # Verify
 RUN python3 -c "import torch; import cutlass_nvfp4_gemm._C; print('CUTLASS NVFP4 OK')" && \
diff --git a/build_and_run.sh b/build_and_run.sh
index ab4fbd71..cf593fb1 100755
--- a/build_and_run.sh
+++ b/build_and_run.sh
@@ -3,17 +3,25 @@ set -euo pipefail
 
 cd "$(dirname "$0")"
 
-# Bust any ARG cache busters in Dockerfile by replacing with timestamp
+# Bust the Docker build cache by injecting a timestamp
 TIMESTAMP=$(date +%s)
-sed -i -E "s/ARG [A-Z_]+CACHE_BUSTER=.*/ARG CACHE_BUSTER=${TIMESTAMP}/" Dockerfile
+
+# Restore the Dockerfile placeholder on every exit path so git diff stays clean
+# even when `set -e` aborts the script because the build or startup failed.
+restore_dockerfile() {
+    sed -i -E 's/ARG CACHE_BUSTER=.*/ARG CACHE_BUSTER=${TIMESTAMP}/' Dockerfile
+}
+trap restore_dockerfile EXIT
+
+sed -i -E "s/ARG CACHE_BUSTER=.*/ARG CACHE_BUSTER=${TIMESTAMP}/" Dockerfile
 
 echo "=== Stopping existing container ==="
 docker compose down 2>/dev/null || true
 
-echo "=== Building (no cache) ==="
-docker compose build --no-cache
+echo "=== Building ==="
+docker compose build
 
 echo "=== Starting ==="
 docker compose up -d
 
 echo "=== Done. Container: $(docker compose ps -q) ==="
diff --git a/vllm/patches/deepseek_v4.py b/vllm/patches/deepseek_v4.py
index 2d76d299..6db66b6b 100644
--- a/vllm/patches/deepseek_v4.py
+++ b/vllm/patches/deepseek_v4.py
@@ -2214,7 +2214,3 @@ class DeepseekV4ForCausalLM(nn.Module):
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
         return self.model.get_expert_mapping()
 
-
-# Register model architecture with vLLM
-from vllm.model_executor.models import ModelRegistry
-ModelRegistry.register_model("DeepseekV4ForCausalLM", DeepseekV4ForCausalLM)