more fixes

This commit is contained in:
2026-05-14 19:35:39 +00:00
parent 3be051e140
commit 1ceff541b0
3 changed files with 18 additions and 19 deletions

View File

@@ -15,6 +15,8 @@ ENV TORCH_CUDA_ARCH_LIST="10.0"
# Clone latest CUTLASS (has NVFP4 block-scaled MMA support)
# NOTE(review): shallow clone with no tag or commit pinned, so the CUTLASS
# revision that ends up in the image can differ from one build to the next.
RUN git clone --depth 1 https://github.com/NVIDIA/cutlass.git /root/cutlass
# Cache-buster hook. The ${TIMESTAMP} text is a placeholder: the build script's
# sed rewrites this line to `ARG CACHE_BUSTER=<epoch seconds>` before building
# and writes the placeholder back afterwards.
ARG CACHE_BUSTER=${TIMESTAMP}
# Copy and install the NVFP4 mega_moe kernel (from this repo)
COPY src/ /root/nvfp4-megamoe-kernel/src/
COPY pyproject.toml /root/nvfp4-megamoe-kernel/pyproject.toml
@@ -32,18 +34,16 @@ RUN pip install tilelang
# Prepend the cutlass_nvfp4_gemm directory and the kernel repo root to any
# PYTHONPATH value that is already set.
ENV PYTHONPATH="/root/nvfp4-megamoe-kernel/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm:/root/nvfp4-megamoe-kernel:${PYTHONPATH}"
# Copy vLLM patches
# The three patch files are staged under /tmp/patches.
COPY vllm/patches/deepseek_v4.py /tmp/patches/deepseek_v4.py
COPY vllm/patches/staging_kernel.py /tmp/patches/staging_kernel.py
COPY vllm/patches/deepseek_v4_attention.py /tmp/patches/deepseek_v4_attention.py
# Patch vLLM — overwrite model files and register architecture
# Default vLLM install locations, overridable with --build-arg.
# NOTE(review): both defaults hard-code a python3.12 dist-packages layout —
# confirm this matches the Python version shipped in the base image.
ARG VLLM_MODELS_DIR=/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models
ARG VLLM_LAYERS_DIR=/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers
# Apply patches
# Resolves the real package directories by importing vllm; the shell
# assignments below replace the ARG values for the duration of this RUN only.
# The staged files are then copied into those directories and /tmp/patches is
# removed.
RUN VLLM_MODELS_DIR=$(python3 -c "import vllm.model_executor.models; import os; print(os.path.dirname(vllm.model_executor.models.__file__))") && \
VLLM_LAYERS_DIR=$(python3 -c "import vllm.model_executor.layers; import os; print(os.path.dirname(vllm.model_executor.layers.__file__))") && \
cp /tmp/patches/deepseek_v4.py "$VLLM_MODELS_DIR/deepseek_v4.py" && \
cp /tmp/patches/staging_kernel.py "$VLLM_MODELS_DIR/staging_kernel.py" && \
cp /tmp/patches/deepseek_v4_attention.py "$VLLM_LAYERS_DIR/deepseek_v4_attention.py" && \
rm -rf /tmp/patches
# Write the patch files straight into the directories named by the
# VLLM_MODELS_DIR / VLLM_LAYERS_DIR build args.
COPY vllm/patches/deepseek_v4.py ${VLLM_MODELS_DIR}/deepseek_v4.py
COPY vllm/patches/staging_kernel.py ${VLLM_MODELS_DIR}/staging_kernel.py
COPY vllm/patches/deepseek_v4_attention.py ${VLLM_LAYERS_DIR}/deepseek_v4_attention.py
# Insert a "DeepseekV4ForCausalLM" entry on a new line right after the
# existing "DeepseekV32ForCausalLM" entry in registry.py.
# NOTE(review): sed exits 0 even when the pattern is not found, so if the
# registry line ever changes this step will silently add nothing; consider a
# follow-up grep to fail the build in that case. The `\n` in the replacement
# relies on GNU sed behaviour.
RUN sed -i 's/"DeepseekV32ForCausalLM": ("deepseek_v2", "DeepseekV3ForCausalLM"),/"DeepseekV32ForCausalLM": ("deepseek_v2", "DeepseekV3ForCausalLM"),\n "DeepseekV4ForCausalLM": ("deepseek_v4", "DeepseekV4ForCausalLM"),/' \
${VLLM_MODELS_DIR}/registry.py
# Verify
RUN python3 -c "import torch; import cutlass_nvfp4_gemm._C; print('CUTLASS NVFP4 OK')" && \

View File

@@ -3,17 +3,20 @@ set -euo pipefail
# Work from the script's own directory so the relative `Dockerfile` path below
# resolves regardless of where the script is invoked from.
cd "$(dirname "$0")"
# Bust any ARG cache busters in Dockerfile by replacing with timestamp
# Bust the Docker build cache by injecting a timestamp
TIMESTAMP=$(date +%s)
# `[A-Z_]+` requires at least one character between "ARG " and "CACHE_BUSTER",
# so this pattern matches only prefixed names (e.g. ARG FOO_CACHE_BUSTER=...)
# and never a bare `ARG CACHE_BUSTER=` line.
sed -i -E "s/ARG [A-Z_]+CACHE_BUSTER=.*/ARG CACHE_BUSTER=${TIMESTAMP}/" Dockerfile
# Matches the bare `ARG CACHE_BUSTER=` line and sets it to the epoch seconds
# captured above.
sed -i -E "s/ARG CACHE_BUSTER=.*/ARG CACHE_BUSTER=${TIMESTAMP}/" Dockerfile
echo "=== Stopping existing container ==="
# Best effort: stderr is discarded and a failing `down` does not abort the script.
docker compose down 2>/dev/null || true
echo "=== Building (no cache) ==="
# Rebuilds every layer, ignoring the build cache.
docker compose build --no-cache
echo "=== Building ==="
# Cached build; this relies on the rewritten CACHE_BUSTER line to invalidate layers.
docker compose build
echo "=== Starting ==="
docker compose up -d
# Restore Dockerfile so git diff stays clean
# The `\$` keeps ${TIMESTAMP} literal so the placeholder text is written back.
# NOTE(review): if the script runs under `set -e`, a failed build or `up` exits
# before this line and leaves the timestamp in Dockerfile; an EXIT trap would
# make the restore unconditional.
sed -i -E "s/ARG CACHE_BUSTER=.*/ARG CACHE_BUSTER=\${TIMESTAMP}/" Dockerfile
echo "=== Done. Container: $(docker compose ps -q) ==="

View File

@@ -2214,7 +2214,3 @@ class DeepseekV4ForCausalLM(nn.Module):
# Thin delegate: returns whatever self.model.get_expert_mapping() returns.
def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
return self.model.get_expert_mapping()
# Register model architecture with vLLM
# Module-level statements: importing this module registers the class under the
# architecture name "DeepseekV4ForCausalLM".
from vllm.model_executor.models import ModelRegistry
ModelRegistry.register_model("DeepseekV4ForCausalLM", DeepseekV4ForCausalLM)