55 lines
2.5 KiB
Docker
55 lines
2.5 KiB
Docker
# DeepSeek V4 NVFP4 vLLM + CUTLASS NVFP4 Mega MoE Kernel
|
|
# Base image. The default follows the mutable nightly tag, so two builds may
# start from different images. For a reproducible build, override it with a
# pinned reference, e.g.
#   docker build --build-arg BASE_IMAGE=vllm/vllm-openai@sha256:<digest> .
ARG BASE_IMAGE=vllm/vllm-openai:nightly-x86_64
FROM ${BASE_IMAGE}
|
|
|
|
# nixl_ep is broken in this image (built against CUDA 12, image is CUDA 13):
# drop the pip package and any leftover module directory.
# The uninstall result is intentionally ignored; only the rm decides whether
# this step succeeds.
RUN { pip uninstall -y nixl-ep || true; } && \
    rm -rf /usr/local/lib/python3.12/dist-packages/nixl_ep
|
|
|
|
# Build tooling (cmake, git, screen) plus the CUDA 13.0 math-library development
# packages. The apt index is refreshed and removed again in the same layer.
RUN apt-get update && \
    apt-get install -y \
        cmake \
        git \
        libcublas-dev-13-0 \
        libcufft-dev-13-0 \
        libcurand-dev-13-0 \
        libcusolver-dev-13-0 \
        libcusparse-dev-13-0 \
        libnvjitlink-dev-13-0 \
        screen && \
    rm -rf /var/lib/apt/lists/*
|
|
|
|
# Delete the broken libcudart.so.12 symlink; -f keeps the step successful when
# the link is not there.
RUN ["rm", "-f", "/usr/local/cuda/lib64/libcudart.so.12"]
|
|
|
|
# CUDA toolkit location and the CUDA architecture list exported to every later
# build step and to the running container.
ENV CUDA_HOME=/usr/local/cuda \
    TORCH_CUDA_ARCH_LIST="10.0"
|
|
|
|
# Clone CUTLASS (has NVFP4 block-scaled MMA support).
# CUTLASS_REF selects the branch or tag to check out. The default `main` keeps
# the "latest" behaviour; pass a release tag for a reproducible build.
# Bump CUTLASS_CACHE_BUSTER to force a fresh clone when this layer is cached.
ARG CUTLASS_CACHE_BUSTER=1
ARG CUTLASS_REF=main
RUN git clone --depth 1 --branch "${CUTLASS_REF}" https://github.com/NVIDIA/cutlass.git /root/cutlass
|
|
|
|
# Clone our NVFP4 mega_moe kernel and install it in editable mode.
# Bump KERNEL_CACHE_BUSTER to force a fresh clone when this layer is cached.
# --no-cache-dir keeps pip's download/wheel cache out of the image layer.
ARG KERNEL_CACHE_BUSTER=24
RUN git clone https://sweetapi.com/biondizzle/nvfp4-megamoe-kernel.git /root/nvfp4-megamoe-kernel && \
    cd /root/nvfp4-megamoe-kernel && \
    pip install --no-cache-dir -e .
|
|
|
|
# Compile the CUTLASS NVFP4 block-scaled GEMM extension in place.
# The build runs from the extension's source directory; the CUTLASS include
# path and the CUDA architecture list are handed to setup.py through the
# environment of that single command.
RUN set -e; \
    cd /root/nvfp4-megamoe-kernel/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm; \
    mkdir -p cutlass_nvfp4_gemm; \
    env CUTLASS_INCLUDE_DIR=/root/cutlass/include TORCH_CUDA_ARCH_LIST=10.0 \
        python3 setup.py build_ext --inplace
|
|
|
|
# Install TileLang (for potential future use).
# --no-cache-dir keeps pip's download/wheel cache out of the image layer.
RUN pip install --no-cache-dir tilelang
|
|
|
|
# Put the built extension directory and the kernel checkout on the import path.
# Any existing PYTHONPATH is appended only when it is set and non-empty, so an
# unset value does not leave a trailing ':' (an empty entry, which Python
# resolves to the current working directory).
ENV PYTHONPATH="/root/nvfp4-megamoe-kernel/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm:/root/nvfp4-megamoe-kernel${PYTHONPATH:+:${PYTHONPATH}}"
|
|
|
|
# Stage the patched vLLM sources under /tmp/patches.
# Bump PATCH_CACHE_BUSTER to force the steps after this point to re-run.
ARG PATCH_CACHE_BUSTER=82
COPY patches/deepseek_v4.py \
     patches/staging_kernel.py \
     patches/deepseek_v4_attention.py \
     /tmp/patches/
|
|
|
|
# Overlay the staged patches onto the installed vLLM package, then delete the
# staging directory. Target directories are looked up from the imported
# packages instead of hard-coding the install location.
RUN set -e; \
    models_dir=$(python3 -c "import vllm.model_executor.models; import os; print(os.path.dirname(vllm.model_executor.models.__file__))"); \
    layers_dir=$(python3 -c "import vllm.model_executor.layers; import os; print(os.path.dirname(vllm.model_executor.layers.__file__))"); \
    cp /tmp/patches/deepseek_v4.py /tmp/patches/staging_kernel.py "${models_dir}/"; \
    cp /tmp/patches/deepseek_v4_attention.py "${layers_dir}/deepseek_v4_attention.py"; \
    rm -rf /tmp/patches
|
|
|
|
# Smoke test: the build fails unless the compiled extension, vLLM and the
# kernel package can each be imported.
RUN set -e; \
    python3 -c "import torch; import cutlass_nvfp4_gemm._C; print('CUTLASS NVFP4 OK')"; \
    python3 -c "import vllm; print('vLLM OK')"; \
    python3 -c "import nvfp4_megamoe_kernel; print('NVFP4 kernel OK')"
|