# DeepSeek V4 NVFP4 vLLM + CuTeDSL NVFP4 MoE Kernel
#
# Extends the vLLM nightly image with the CUDA 13.0 math-library development
# packages, the CuTeDSL Python packages, and the NVFP4 mega-MoE kernel sources.
# The build context must provide: src/, pyproject.toml and cutedsl/.
FROM vllm/vllm-openai:nightly-x86_64

# Remove broken nixl_ep (built against CUDA 12, image is CUDA 13).
# The `;` means the leftover package directory is deleted whether or not
# `pip uninstall` succeeds (e.g. when the package is not installed).
RUN pip uninstall -y nixl-ep; \
    rm -rf /usr/local/lib/python3.12/dist-packages/nixl_ep

# Build tooling plus the CUDA 13.0 library development packages.
# The apt package index is deleted in the same layer to keep the image small.
RUN apt-get update && apt-get install -y \
        git \
        screen \
        cmake \
        libcusolver-dev-13-0 \
        libcusparse-dev-13-0 \
        libcublas-dev-13-0 \
        libcurand-dev-13-0 \
        libcufft-dev-13-0 \
        libnvjitlink-dev-13-0 \
    && rm -rf /var/lib/apt/lists/*

# Remove the broken symlink if it exists.
# NOTE(review): "libcudrt" looks like a misspelling of "libcudart"; the path is
# kept exactly as written -- confirm which file is actually meant.
RUN rm -f /usr/local/cuda/lib64/libcudrt.so.12

ENV CUDA_HOME=/usr/local/cuda
ENV TORCH_CUDA_ARCH_LIST="10.0"

# Install CuTeDSL (NVFP4 block-scaled GEMM kernel framework).
RUN pip install nvidia-cutlass-dsl==4.5.0 nvidia-cutlass-dsl-libs-base==4.5.0

# Cache buster: a changed value forces the RUN instructions below to re-execute
# instead of being served from the build cache. TIMESTAMP must be declared for
# `--build-arg TIMESTAMP=$(date +%s)` to be consumed; passing
# `--build-arg CACHE_BUSTER=<value>` directly keeps working as before.
ARG TIMESTAMP
ARG CACHE_BUSTER=${TIMESTAMP}

# Copy the NVFP4 mega_moe Python kernel (no C++ build needed) and install it
# in editable mode so the package resolves to the copied sources.
COPY src/ /root/nvfp4-megamoe-kernel/src/
COPY pyproject.toml /root/nvfp4-megamoe-kernel/pyproject.toml
RUN cd /root/nvfp4-megamoe-kernel && pip install -e .

# Copy the CuTeDSL kernel and bridge layer.
COPY cutedsl/ /root/nvfp4-megamoe-kernel/cutedsl/

# Prepend the kernel root to PYTHONPATH. The `:+` form appends the previous
# value only when one is set, so an unset PYTHONPATH does not leave a trailing
# ":" (an empty entry, which Python resolves to the current working directory).
ENV PYTHONPATH="/root/nvfp4-megamoe-kernel${PYTHONPATH:+:${PYTHONPATH}}"