# Docker Compose definition for a single `vllm` service.
# The image is built from the Dockerfile in this directory, port 8000 is
# published on the host, and a host model directory is mounted read-only
# at /model, which is also the first argument passed in `command`.
services:
  vllm:
    build:
      context: .
      dockerfile: Dockerfile
    ports:
      # Quoted so the HOST:CONTAINER mapping is always read as a string.
      - "8000:8000"
    # NOTE(review): neither `ipc: host` nor `shm_size` is set. vLLM's Docker
    # documentation recommends one of them for tensor-parallel inference;
    # confirm whether this deployment needs it.
    environment:
      - OMP_NUM_THREADS=128
      - CUDA_LAUNCH_BLOCKING=0
      - PYTHONUNBUFFERED=1
      - VLLM_RPC_TIMEOUT_MS=600000
      - CLAWMINE_DEBUG=1
      # Don't set VLLM_NVFP4_GEMM_BACKEND - our CuTeDSL kernel auto-selects
      # on Blackwell.
    # Overrides the image's default command. Presumably these are arguments
    # to a vLLM server entrypoint defined in the Dockerfile - verify there.
    command:
      # Matches the container path of the read-only volume mounted below.
      - /model
      - --trust-remote-code
      - --enable-expert-parallel
      - --tensor-parallel-size=8
      # Single-quoted: the value embeds JSON ({ [ " : ,) and must reach the
      # process as one literal string, not be re-read as YAML flow syntax.
      - '--compilation-config={"cudagraph_mode":"NONE","custom_ops":["all"]}'
      - --tokenizer-mode=deepseek_v4
      - --tool-call-parser=deepseek_v4
      - --enable-auto-tool-choice
      - --reasoning-parser=deepseek_v4
      - --moe-backend=cutedsl
      - --gpu-memory-utilization=0.9
      # NOTE(review): 256 looks very small for a serving configuration;
      # confirm it is intentional (e.g. a debug/smoke-test setting).
      - --max-model-len=256
      - --host=0.0.0.0
      # Same port as the container side of the `ports` mapping above.
      - --port=8000
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every GPU exposed by the NVIDIA driver on the host.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    volumes:
      # Host model directory, mounted read-only (`:ro`) at /model.
      - /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4:/model:ro