Files
deepseek-v4-quant/docker-compose.yml
2026-05-12 21:28:50 +00:00

34 lines
976 B
YAML

# Compose definition for a single vLLM service: it runs the image below with
# the model directory bind-mounted at /model and port 8000 published.
services:
  vllm:
    image: atl.vultrcr.com/vllm/vllm-dsv4-nvfp4:latest
    # Re-pull the image on every `up` so the mutable `latest` tag stays fresh.
    pull_policy: always
    ports:
      # host:container — quoted so the mapping is always read as a string.
      - "8000:8000"
    environment:
      # SECURITY: never commit an access token to this file. The value is
      # interpolated from the shell environment or an untracked `.env` file
      # next to this compose file. The token that was previously hard-coded
      # here must be treated as leaked: revoke it and issue a new one.
      - HF_TOKEN=${HF_TOKEN}
      # NOTE(review): 128 presumably matches the host's CPU core count — confirm.
      - OMP_NUM_THREADS=128
      - VLLM_USE_FLASHINFER_MOE_FP4=1
    # Arguments handed to the image's entrypoint.
    # NOTE(review): presumably the entrypoint is the vLLM server and the first
    # argument is the model path — confirm against the image.
    command:
      # Matches the container-side path of the bind mount under `volumes`.
      - /model
      - --trust-remote-code
      - --kv-cache-dtype=fp8
      # - --block-size=256
      - --enable-expert-parallel
      # NOTE(review): presumably the host exposes 8 GPUs — confirm.
      - --tensor-parallel-size=8
      # - --compilation-config={"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}
      # - --attention_config.use_fp4_indexer_cache=True
      - --tokenizer-mode=deepseek_v4
      # - --speculative_config={"method":"mtp","num_speculative_tokens":2}
      # Listen on all interfaces, on the same port that `ports` publishes.
      - --host=0.0.0.0
      - --port=8000
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every NVIDIA GPU on the host for this container.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    volumes:
      # Host model directory mounted read-only at /model.
      - /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4:/model:ro