# File-viewer header (not part of the configuration): 36 lines, 1005 B, YAML
# Docker Compose definition for a single `vllm` service.
#
# The image is built from the Dockerfile context in this directory, the
# host directory holding the model is mounted read-only at /model, and the
# server is published on host port 8000.
services:
  vllm:
    # Prebuilt image alternative to the local build below (disabled).
    # image: atl.vultrcr.com/vllm/vllm-dsv4-nvfp4:latest
    build:
      context: .
    # NOTE(review): only `build:` is active (no `image:`), so confirm that
    # `pull_policy: always` behaves as intended for a locally built image.
    pull_policy: always
    ports:
      # Quoted so the mapping is always read as a string; must match --port.
      - "8000:8000"
    environment:
      # Pass-through: the value is taken from the environment that runs
      # `docker compose` (shell or .env file) and is left unset in the
      # container when absent. Never commit a literal token in this file;
      # any token that was ever committed must be treated as leaked and
      # revoked.
      - HF_TOKEN
      - OMP_NUM_THREADS=128
      - VLLM_USE_FLASHINFER_MOE_FP4=1
    command:
      # First argument is the container path of the mounted model (see
      # `volumes` below).
      - /model
      - --trust-remote-code
      - --kv-cache-dtype=fp8
      # - --block-size=256
      - --enable-expert-parallel
      # NOTE(review): presumably the host exposes 8 GPUs, since all GPUs are
      # reserved under `deploy` — confirm against the target machine.
      - --tensor-parallel-size=8
      # - --compilation-config={"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}
      # - --attention_config.use_fp4_indexer_cache=True
      - --tokenizer-mode=deepseek_v4
      # - --speculative_config={"method":"mtp","num_speculative_tokens":2}
      # Listen on all interfaces inside the container so the published port
      # above is reachable from the host.
      - --host=0.0.0.0
      - --port=8000
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every NVIDIA GPU on the host for this service.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    volumes:
      # host path : container path : read-only
      - /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4:/model:ro