# Files
# nvfp4-megamoe-kernel/docker-compose.yml
# 2026-05-17 07:14:58 +00:00
#
# 39 lines
# 1.1 KiB
# YAML

# Compose definition for a single `vllm` service that is built from the local
# Dockerfile, serves the model mounted at /model on port 8000, and reserves
# the host's NVIDIA GPUs.
services:
  vllm:
    # Image is built from the Dockerfile in the directory holding this file.
    build:
      context: .
      dockerfile: Dockerfile
    # Quoted so the HOST:CONTAINER mapping is always read as a string.
    ports:
      - "8000:8000"
    environment:
      - OMP_NUM_THREADS=128
      - CUDA_LAUNCH_BLOCKING=0
      - PYTHONUNBUFFERED=1
      # 600000 ms = 10 minutes.
      - VLLM_RPC_TIMEOUT_MS=600000
    # Arguments handed to the image's entrypoint (presumably `vllm serve`;
    # confirm against the Dockerfile). Commented-out entries are disabled
    # alternatives kept for reference.
    command:
      # Model location: the container path of the bind mount under `volumes`.
      - /model
      - --trust-remote-code
      - --enable-expert-parallel
      # NOTE(review): presumably requires 8 visible GPUs - confirm on the host.
      - --tensor-parallel-size=8
      # - --enforce-eager
      # `--compilation-config` has no inline value, so the next ACTIVE list
      # item is consumed as its JSON argument. Keep exactly one of the two
      # JSON strings below uncommented.
      - --compilation-config
      # - '{"cudagraph_mode": "FULL_DECODE_ONLY", "custom_ops": ["all"], "cudagraph_capture_sizes": [1, 2, 4, 8], "max_cudagraph_capture_size": 8}'
      - '{"cudagraph_mode":"FULL_AND_PIECEWISE", "custom_ops":["all"]}'
      # - --moe-backend=deep_gemm_mega_moe
      - --tokenizer-mode=deepseek_v4
      # - --attention_config.use_fp4_indexer_cache=True
      - --tool-call-parser=deepseek_v4
      - --enable-auto-tool-choice
      - --reasoning-parser=deepseek_v4
      # Listen on all interfaces, on the container port published above.
      - --host=0.0.0.0
      - --port=8000
    # Reserve every NVIDIA GPU on the host for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Host model directory, mounted read-only at /model.
    volumes:
      - /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4:/model:ro