more fixes
This commit is contained in:
22
Dockerfile
22
Dockerfile
@@ -15,6 +15,8 @@ ENV TORCH_CUDA_ARCH_LIST="10.0"
|
||||
# Clone latest CUTLASS (has NVFP4 block-scaled MMA support)
|
||||
RUN git clone --depth 1 https://github.com/NVIDIA/cutlass.git /root/cutlass
|
||||
|
||||
# Cache-buster placeholder. The build script rewrites this line to
# `ARG CACHE_BUSTER=<epoch seconds>` before `docker compose build` and puts the
# literal `${TIMESTAMP}` back afterwards, so the committed value is never a real timestamp.
# NOTE(review): TIMESTAMP is not declared as an ARG in the visible part of this file,
# so a build that skips the script would presumably see an empty default — confirm.
ARG CACHE_BUSTER=${TIMESTAMP}
|
||||
|
||||
# Copy and install the NVFP4 mega_moe kernel (from this repo)
|
||||
COPY src/ /root/nvfp4-megamoe-kernel/src/
|
||||
COPY pyproject.toml /root/nvfp4-megamoe-kernel/pyproject.toml
|
||||
@@ -32,18 +34,16 @@ RUN pip install tilelang
|
||||
|
||||
ENV PYTHONPATH="/root/nvfp4-megamoe-kernel/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm:/root/nvfp4-megamoe-kernel:${PYTHONPATH}"
|
||||
|
||||
# Copy vLLM patches
|
||||
COPY vllm/patches/deepseek_v4.py /tmp/patches/deepseek_v4.py
|
||||
COPY vllm/patches/staging_kernel.py /tmp/patches/staging_kernel.py
|
||||
COPY vllm/patches/deepseek_v4_attention.py /tmp/patches/deepseek_v4_attention.py
|
||||
# Patch vLLM — overwrite model files and register architecture
|
||||
# Destination directories for the patched vLLM files, used by the COPY/RUN steps below.
# NOTE(review): the python3.12 dist-packages location is hard-coded here and in
# VLLM_LAYERS_DIR; confirm it matches where the base image installs vLLM
# (both values can be overridden with --build-arg).
ARG VLLM_MODELS_DIR=/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models
|
||||
ARG VLLM_LAYERS_DIR=/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers
|
||||
|
||||
# Apply patches
|
||||
RUN VLLM_MODELS_DIR=$(python3 -c "import vllm.model_executor.models; import os; print(os.path.dirname(vllm.model_executor.models.__file__))") && \
|
||||
VLLM_LAYERS_DIR=$(python3 -c "import vllm.model_executor.layers; import os; print(os.path.dirname(vllm.model_executor.layers.__file__))") && \
|
||||
cp /tmp/patches/deepseek_v4.py "$VLLM_MODELS_DIR/deepseek_v4.py" && \
|
||||
cp /tmp/patches/staging_kernel.py "$VLLM_MODELS_DIR/staging_kernel.py" && \
|
||||
cp /tmp/patches/deepseek_v4_attention.py "$VLLM_LAYERS_DIR/deepseek_v4_attention.py" && \
|
||||
rm -rf /tmp/patches
|
||||
COPY vllm/patches/deepseek_v4.py ${VLLM_MODELS_DIR}/deepseek_v4.py
|
||||
COPY vllm/patches/staging_kernel.py ${VLLM_MODELS_DIR}/staging_kernel.py
|
||||
COPY vllm/patches/deepseek_v4_attention.py ${VLLM_LAYERS_DIR}/deepseek_v4_attention.py
|
||||
|
||||
# Register the new architecture: edit vLLM's registry.py in place, appending a
# "DeepseekV4ForCausalLM" entry directly after the existing "DeepseekV32ForCausalLM" entry.
# NOTE(review): sed exits 0 even when the pattern does not match, so a changed upstream
# registry line would leave the model unregistered without failing this step —
# confirm that a later verification step catches that case.
RUN sed -i 's/"DeepseekV32ForCausalLM": ("deepseek_v2", "DeepseekV3ForCausalLM"),/"DeepseekV32ForCausalLM": ("deepseek_v2", "DeepseekV3ForCausalLM"),\n "DeepseekV4ForCausalLM": ("deepseek_v4", "DeepseekV4ForCausalLM"),/' \
|
||||
${VLLM_MODELS_DIR}/registry.py
|
||||
|
||||
# Verify
|
||||
RUN python3 -c "import torch; import cutlass_nvfp4_gemm._C; print('CUTLASS NVFP4 OK')" && \
|
||||
|
||||
@@ -3,17 +3,20 @@ set -euo pipefail
|
||||
|
||||
cd "$(dirname "$0")"
|
||||
|
||||
# Bust any ARG cache busters in Dockerfile by replacing with timestamp
|
||||
# Bust the Docker build cache by injecting a timestamp
|
||||
TIMESTAMP=$(date +%s)
|
||||
sed -i -E "s/ARG [A-Z_]+CACHE_BUSTER=.*/ARG CACHE_BUSTER=${TIMESTAMP}/" Dockerfile
|
||||
# Always put the `${TIMESTAMP}` placeholder back when the script exits, including when a
# later step fails under `set -e`; otherwise a failed build leaves Dockerfile modified.
# The trap body is single-quoted so `\${TIMESTAMP}` stays a literal placeholder at restore time.
trap 'sed -i -E "s/ARG CACHE_BUSTER=.*/ARG CACHE_BUSTER=\${TIMESTAMP}/" Dockerfile' EXIT
# Stamp the current epoch seconds (TIMESTAMP) into the Dockerfile's CACHE_BUSTER ARG line.
sed -i -E "s/ARG CACHE_BUSTER=.*/ARG CACHE_BUSTER=${TIMESTAMP}/" Dockerfile
|
||||
|
||||
echo "=== Stopping existing container ==="
|
||||
docker compose down 2>/dev/null || true
|
||||
|
||||
echo "=== Building (no cache) ==="
|
||||
docker compose build --no-cache
|
||||
echo "=== Building ==="
|
||||
docker compose build
|
||||
|
||||
echo "=== Starting ==="
|
||||
docker compose up -d
|
||||
|
||||
# Restore Dockerfile so git diff stays clean
|
||||
sed -i -E "s/ARG CACHE_BUSTER=.*/ARG CACHE_BUSTER=\${TIMESTAMP}/" Dockerfile
|
||||
|
||||
echo "=== Done. Container: $(docker compose ps -q) ==="
|
||||
|
||||
@@ -2214,7 +2214,3 @@ class DeepseekV4ForCausalLM(nn.Module):
|
||||
# Return the expert mapping by delegating to self.model.get_expert_mapping();
# the wrapped model's result is passed through unchanged.
def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
|
||||
return self.model.get_expert_mapping()
|
||||
|
||||
|
||||
# Register model architecture with vLLM
|
||||
from vllm.model_executor.models import ModelRegistry
|
||||
ModelRegistry.register_model("DeepseekV4ForCausalLM", DeepseekV4ForCausalLM)
|
||||
|
||||
Reference in New Issue
Block a user