# DeepSeek V4 NVFP4 vLLM + CuTeDSL NVFP4 MoE Kernel
#
# Builds on the vLLM nightly image, installs CuTeDSL and the NVFP4 mega-MoE
# Python kernel, then overlays / patches vLLM sources in place.
#
# Build args:
#   TIMESTAMP / CACHE_BUSTER  change either value to force the RUN steps
#                             after the ARG to be rebuilt
#   VLLM_*_DIR                locations of the vLLM packages being patched
FROM vllm/vllm-openai:nightly-x86_64

# Remove broken nixl_ep (built against CUDA 12, image is CUDA 13).
# ';' (not '&&') is intentional: the directory is removed even if pip
# reports that the package is not installed.
RUN pip uninstall -y nixl-ep; rm -rf /usr/local/lib/python3.12/dist-packages/nixl_ep

# Build tooling plus the CUDA 13.0 math-library development packages.
RUN apt-get update && apt-get install -y \
        git \
        screen \
        cmake \
        libcusolver-dev-13-0 \
        libcusparse-dev-13-0 \
        libcublas-dev-13-0 \
        libcurand-dev-13-0 \
        libcufft-dev-13-0 \
        libnvjitlink-dev-13-0 \
    && rm -rf /var/lib/apt/lists/*

# Remove the broken symlink if it exists
# NOTE(review): "libcudrt" looks like a typo for "libcudart"; because of
# 'rm -f' a wrong name is silently a no-op. Confirm the intended file name.
RUN rm -f /usr/local/cuda/lib64/libcudrt.so.12

ENV CUDA_HOME=/usr/local/cuda
ENV TORCH_CUDA_ARCH_LIST="10.0"

# Install CuTeDSL (NVFP4 block-scaled GEMM kernel framework)
RUN pip install nvidia-cutlass-dsl==4.5.0 nvidia-cutlass-dsl-libs-base==4.5.0

# TIMESTAMP must be declared before it is referenced; an undeclared build arg
# expands to the empty string, which made CACHE_BUSTER a constant and
# '--build-arg TIMESTAMP=...' a no-op.
ARG TIMESTAMP
ARG CACHE_BUSTER=${TIMESTAMP}

# Copy the NVFP4 mega_moe Python kernel (no C++ build needed)
COPY src/ /root/nvfp4-megamoe-kernel/src/
COPY pyproject.toml /root/nvfp4-megamoe-kernel/pyproject.toml
RUN cd /root/nvfp4-megamoe-kernel && pip install -e .

# Copy the CuTeDSL kernel and bridge layer
COPY cutedsl/ /root/nvfp4-megamoe-kernel/cutedsl/
# Append the previous PYTHONPATH only when it is set, so no trailing ':'
# (an empty entry, i.e. the current directory) ends up on the import path.
ENV PYTHONPATH="/root/nvfp4-megamoe-kernel${PYTHONPATH:+:${PYTHONPATH}}"

# Patch vLLM — overwrite model files and register architecture
ARG VLLM_MODELS_DIR=/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models
ARG VLLM_LAYERS_DIR=/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers
ARG VLLM_QUANT_DIR=/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization
ARG VLLM_FUSED_MOE_DIR=/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe
ARG VLLM_LOADER_DIR=/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader

# Core model patches
COPY vllm/patches/deepseek_v4.py ${VLLM_MODELS_DIR}/deepseek_v4.py
COPY vllm/patches/deepseek_v4_attention.py ${VLLM_LAYERS_DIR}/deepseek_v4_attention.py
COPY vllm/patches/layers/deepseek_compressor.py ${VLLM_LAYERS_DIR}/deepseek_compressor.py

# Replace MHC TileLang kernels with pure PyTorch (avoids TileLang JIT on Blackwell)
# The nightly image has all MHC in layers/mhc.py (imports tilelang at top level).
# Our replacement is pure PyTorch — no tilelang dependency at all.
COPY vllm/patches/layers/mhc.py ${VLLM_LAYERS_DIR}/mhc.py

# CSA/HCA attention kernel (replaces FlashMLA on Blackwell)
COPY vllm/patches/layers/csa_attention.py ${VLLM_LAYERS_DIR}/csa_attention.py

# CuTeDSL NVFP4 linear kernel (registered as NvFp4LinearKernel)
ARG VLLM_NVFP4_DIR=/usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/nvfp4
COPY vllm/kernels/linear/nvfp4/cutedsl.py ${VLLM_NVFP4_DIR}/cutedsl.py

# Patch KV cache utils to handle DeepseekV4 SWA page sizes > MLA page sizes
# (SWA layers have larger page sizes than compressed MLA layers on Blackwell)
ARG VLLM_CORE_DIR=/usr/local/lib/python3.12/dist-packages/vllm/v1/core
COPY vllm/patches/patch_kv_cache_utils.py /tmp/patch_kv_cache_utils.py
RUN python3 /tmp/patch_kv_cache_utils.py "${VLLM_CORE_DIR}/kv_cache_utils.py" \
    && rm /tmp/patch_kv_cache_utils.py

# Patch SWA cache and Indexer cache for Blackwell (no FlashMLA alignment)
# VLLM_LAYERS_DIR2 defaults to the same path as VLLM_LAYERS_DIR; it is kept as
# a separate build arg so existing '--build-arg' invocations keep working.
ARG VLLM_SPARSE_SWA_DIR=/usr/local/lib/python3.12/dist-packages/vllm/v1/attention/backends/mla
ARG VLLM_LAYERS_DIR2=/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers
COPY vllm/patches/patch_swa_cache.py /tmp/patch_swa_cache.py
RUN python3 /tmp/patch_swa_cache.py "${VLLM_SPARSE_SWA_DIR}/sparse_swa.py" \
    && rm /tmp/patch_swa_cache.py
COPY vllm/patches/patch_indexer_cache.py /tmp/patch_indexer_cache.py
RUN python3 /tmp/patch_indexer_cache.py "${VLLM_LAYERS_DIR2}/deepseek_v4_attention.py" \
    && rm /tmp/patch_indexer_cache.py
COPY vllm/patches/patch_compressor_cache.py /tmp/patch_compressor_cache.py
RUN python3 /tmp/patch_compressor_cache.py "${VLLM_LAYERS_DIR2}/deepseek_compressor.py" \
    && rm /tmp/patch_compressor_cache.py

# Debug: print layer name mismatch
ARG VLLM_WORKER_DIR=/usr/local/lib/python3.12/dist-packages/vllm/v1/worker
COPY vllm/patches/patch_debug_layers.py /tmp/patch_debug_layers.py
RUN python3 /tmp/patch_debug_layers.py "${VLLM_WORKER_DIR}/gpu_model_runner.py" \
    && rm /tmp/patch_debug_layers.py

# Register CuTeDSL kernel in vLLM's linear kernel selection
ARG VLLM_LINEAR_DIR=/usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear
COPY vllm/patches/register_cutedsl_kernel.py /tmp/register_cutedsl_kernel.py
RUN python3 /tmp/register_cutedsl_kernel.py "${VLLM_LINEAR_DIR}/__init__.py" \
    && rm /tmp/register_cutedsl_kernel.py

# Config patches (add cutedsl to MoEBackend)
ARG VLLM_CONFIG_DIR=/usr/local/lib/python3.12/dist-packages/vllm/config
COPY vllm/patches/kernel.py ${VLLM_CONFIG_DIR}/kernel.py

# NVFP4 MoE backend registration
COPY vllm/patches/fused_moe/oracle/nvfp4.py ${VLLM_FUSED_MOE_DIR}/oracle/nvfp4.py
COPY vllm/patches/fused_moe/experts/cutedsl_moe.py ${VLLM_FUSED_MOE_DIR}/experts/cutedsl_moe.py

# Register DeepseekV4ForCausalLM model architecture (if not already in upstream).
# The entry is inserted right after the DeepseekV32ForCausalLM line. sed exits 0
# even when that anchor line is absent, so the result is re-checked and the
# build fails loudly instead of shipping an image without the registration.
RUN set -e; \
    registry="${VLLM_MODELS_DIR}/registry.py"; \
    if ! grep -q '"DeepseekV4ForCausalLM"' "$registry"; then \
        sed -i 's/"DeepseekV32ForCausalLM": ("deepseek_v2", "DeepseekV3ForCausalLM"),/"DeepseekV32ForCausalLM": ("deepseek_v2", "DeepseekV3ForCausalLM"),\n "DeepseekV4ForCausalLM": ("deepseek_v4", "DeepseekV4ForCausalLM"),/' \
            "$registry"; \
        if ! grep -q '"DeepseekV4ForCausalLM"' "$registry"; then \
            echo "ERROR: could not register DeepseekV4ForCausalLM in $registry (anchor line not found)" >&2; \
            exit 1; \
        fi; \
    fi

# Verify that the key packages import cleanly in the final image
RUN python3 -c "import torch; print(f'PyTorch {torch.__version__} OK')" && \
    python3 -c "import vllm; print('vLLM OK')" && \
    python3 -c "import nvfp4_megamoe_kernel; print('NVFP4 kernel OK')" && \
    python3 -c "import cutlass; print('CuTeDSL OK')"