1. DeepseekV4MLAAttention.__init__ had a hard assertion that the attention backend MUST be FlashMLA. FlashMLA doesn't work on Blackwell, so there we bypass it via _attention_impl_blackwell(). Added an _is_blackwell flag to skip the FlashMLA-specific init (fp8_ds_mla cache format conversion).
2. Added the VLLM_NVFP4_GEMM_BACKEND=cutedsl env var to docker-compose.yml to force CuTeDSL kernel selection for NVFP4 linear layers.
3. Updated register_cutedsl_kernel.py to also register CuTeDSL in the _NVFP4_BACKEND_TO_KERNEL dict (for the env var override path).
39 lines
1.0 KiB
YAML
39 lines
1.0 KiB
YAML
# Compose definition for a single `vllm` service built from the local
# Dockerfile and served on port 8000.
services:
  vllm:
    build:
      context: .
      dockerfile: Dockerfile
    ports:
      # host:container — the container side matches --port below.
      - "8000:8000"
    environment:
      - OMP_NUM_THREADS=128
      - CUDA_LAUNCH_BLOCKING=0
      - PYTHONUNBUFFERED=1
      - VLLM_RPC_TIMEOUT_MS=600000
      - CLAWMINE_DEBUG=1
      # Forces CuTeDSL kernel selection for NVFP4 linear layers.
      - VLLM_NVFP4_GEMM_BACKEND=cutedsl
    # Arguments handed to the image's entrypoint.
    # NOTE(review): presumably the entrypoint is `vllm serve` — confirm
    # against the Dockerfile. The first argument is the model path mounted
    # under `volumes` below.
    command:
      - /model
      - --trust-remote-code
      - --enable-expert-parallel
      # NOTE(review): presumably needs 8 GPUs visible in the container —
      # confirm against the host.
      - --tensor-parallel-size=8
      # Single-quoted so the embedded JSON braces/brackets/quotes are always
      # taken as a literal string, never as YAML flow syntax.
      - '--compilation-config={"cudagraph_mode":"NONE","custom_ops":["all"]}'
      - --tokenizer-mode=deepseek_v4
      - --tool-call-parser=deepseek_v4
      - --enable-auto-tool-choice
      - --reasoning-parser=deepseek_v4
      - --moe-backend=cutedsl
      - --gpu-memory-utilization=0.9
      - --max-model-len=876544
      - --host=0.0.0.0
      - --port=8000
    deploy:
      resources:
        reservations:
          # Reserve every NVIDIA GPU on the host for this container.
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    volumes:
      # Host model directory, bind-mounted read-only at /model.
      - /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4:/model:ro