- CuTeDSLNvfp4Method: custom quant method that creates CuTeDSL runners during process_weights_after_loading, then swaps to CuTeDSLNvfp4LinearMethod for forward dispatch
- Attention projections (fused_wqa_wkv, wq_b, wo_b) now route through CuTeDSLNvfp4Linear (cosine 0.992-0.996 vs BF16 reference)
- Shared expert now uses CuTeDSLSharedExpertRunner (cosine 0.992 vs BF16) with monkey-patched forward for fused L1+SiLU+L2 pipeline
- Deleted all BF16 dequant code (_dequant_nvfp4_to_bf16, _post_quant_fix, input_scale fixes)
- Deleted _post_quant_fix hook from utils.py
- Fixed SwiGLU clamp: gate clamped BEFORE SiLU (matching SiluAndMulWithClamp)
- Cleaned up all debug prints
- Updated Dockerfile with new kernel files
51 lines
2.5 KiB
Docker
51 lines
2.5 KiB
Docker
# DeepSeek V4 NVFP4 vLLM + CuTeDSL NVFP4 MoE Kernel

# The base image is overridable so a build can be pinned to a fixed tag or
# digest, e.g. --build-arg BASE_IMAGE=vllm/vllm-openai@sha256:<digest>.
# The default is unchanged: the moving nightly tag, which is not reproducible.
ARG BASE_IMAGE=vllm/vllm-openai:nightly-x86_64
FROM ${BASE_IMAGE}

# nixl_ep in the base image was built against CUDA 12 while the image ships
# CUDA 13, so drop it. The uninstall is best-effort; the directory removal
# runs regardless of whether pip succeeded.
RUN pip uninstall -y nixl-ep || true; \
    rm -rf /usr/local/lib/python3.12/dist-packages/nixl_ep

# OS tooling (cmake, git, screen) plus the CUDA 13.0 library development
# packages. The apt package lists are deleted in the same layer so they do not
# persist in the image.
RUN apt-get update && \
    apt-get install -y \
        cmake \
        git \
        libcublas-dev-13-0 \
        libcufft-dev-13-0 \
        libcurand-dev-13-0 \
        libcusolver-dev-13-0 \
        libcusparse-dev-13-0 \
        libnvjitlink-dev-13-0 \
        screen \
    && rm -rf /var/lib/apt/lists/*

# Remove the broken symlink if it exists.
# NOTE(review): the original path is spelled "libcudrt"; the CUDA runtime
# library is named "libcudart", so the original rm was most likely a no-op.
# The original path is still removed for compatibility. The correctly spelled
# path is removed only when it is a dangling symlink (-L true, -e false), so a
# valid library is never deleted.
RUN rm -f /usr/local/cuda/lib64/libcudrt.so.12 && \
    if [ -L /usr/local/cuda/lib64/libcudart.so.12 ] && \
       [ ! -e /usr/local/cuda/lib64/libcudart.so.12 ]; then \
        rm -f /usr/local/cuda/lib64/libcudart.so.12; \
    fi

# CUDA toolkit location and the CUDA architecture list exported for the build
# steps below and for the running container.
ENV CUDA_HOME=/usr/local/cuda \
    TORCH_CUDA_ARCH_LIST="10.0"

# Install CuTeDSL (NVFP4 block-scaled GEMM kernel framework), both packages
# pinned to 4.5.0. --no-cache-dir keeps pip's download cache out of the layer.
RUN pip install --no-cache-dir \
        nvidia-cutlass-dsl==4.5.0 \
        nvidia-cutlass-dsl-libs-base==4.5.0

# Cache buster: build with --build-arg TIMESTAMP=$(date +%s) (or set
# CACHE_BUSTER directly) to change the value seen by the RUN steps below.
# TIMESTAMP has to be declared as an ARG first; without the declaration
# ${TIMESTAMP} always expands to an empty string and the build arg is ignored.
ARG TIMESTAMP
ARG CACHE_BUSTER=${TIMESTAMP}

# Stage the NVFP4 mega_moe Python kernel sources and install the project in
# editable mode (pure Python, no C++ build needed).
COPY src/ /root/nvfp4-megamoe-kernel/src/
COPY pyproject.toml /root/nvfp4-megamoe-kernel/
RUN pip install -e /root/nvfp4-megamoe-kernel

# Copy the CuTeDSL kernel and bridge layer
COPY cutedsl/ /root/nvfp4-megamoe-kernel/cutedsl/

# Put the kernel checkout on Python's import path. The ${VAR:+...} form appends
# ":<old value>" only when PYTHONPATH is already set and non-empty; a bare
# "dir:${PYTHONPATH}" leaves a trailing ":" when it is unset, which Python
# treats as an empty entry (the current working directory) on sys.path.
ENV PYTHONPATH="/root/nvfp4-megamoe-kernel${PYTHONPATH:+:${PYTHONPATH}}"

# Patch vLLM: overwrite model files inside the installed package.
# Target directories within the vLLM installation (overridable at build time).
ARG VLLM_MODELS_DIR=/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models
ARG VLLM_LAYERS_DIR=/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers
ARG VLLM_LOADER_DIR=/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader

# Files that land in the models directory (basename is kept because the
# destination ends with "/").
COPY vllm/patches/deepseek_v4.py \
     vllm/nvfp4_cutedsl.py \
     vllm/cutedsl_quant_method.py \
     ${VLLM_MODELS_DIR}/

# Attention layer patch.
COPY vllm/patches/deepseek_v4_attention.py ${VLLM_LAYERS_DIR}/

# CuTeDSL linear and shared-expert pipeline modules in the kernel checkout.
COPY cutedsl/nvfp4_linear.py \
     cutedsl/shared_expert_pipeline.py \
     /root/nvfp4-megamoe-kernel/cutedsl/

# Model loader utils patch.
COPY vllm/patches/utils.py ${VLLM_LOADER_DIR}/

# Register the DeepseekV4ForCausalLM architecture by inserting its entry right
# after the existing DeepseekV32ForCausalLM line in vLLM's registry.py.
# sed exits 0 even when the anchor line is not found, so the grep afterwards
# fails the build instead of producing an image without the registration.
RUN sed -i 's/"DeepseekV32ForCausalLM": ("deepseek_v2", "DeepseekV3ForCausalLM"),/"DeepseekV32ForCausalLM": ("deepseek_v2", "DeepseekV3ForCausalLM"),\n "DeepseekV4ForCausalLM": ("deepseek_v4", "DeepseekV4ForCausalLM"),/' \
        "${VLLM_MODELS_DIR}/registry.py" && \
    grep -q '"DeepseekV4ForCausalLM": ("deepseek_v4", "DeepseekV4ForCausalLM")' \
        "${VLLM_MODELS_DIR}/registry.py"

# Smoke test: each package must import in a fresh interpreter. "set -e" aborts
# the step (and therefore the build) at the first failing import.
RUN set -e; \
    python3 -c "import torch; print(f'PyTorch {torch.__version__} OK')"; \
    python3 -c "import vllm; print('vLLM OK')"; \
    python3 -c "import nvfp4_megamoe_kernel; print('NVFP4 kernel OK')"; \
    python3 -c "import cutlass; print('CuTeDSL OK')"