---
# Docker Compose definition for a single vLLM inference server container.
services:
  vllm:
    image: atl.vultrcr.com/vllm/vllm-with-lmcache:dream-build
    # Re-pull on every `up` so a re-pushed tag is picked up.
    pull_policy: always

    # Wrapper entrypoint: overwrite the model implementation shipped in the
    # image with the patched copy from /patches, then hand control to
    # `vllm serve`. Compose appends the `command` list after the trailing
    # `--`, which becomes $0 of the script, so the command items arrive as "$@".
    entrypoint:
      - bash
      - -c
      - |
        # Abort if the patch cannot be installed rather than silently
        # serving with the unpatched model file.
        set -e
        cp /patches/deepseek_v4.py /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/deepseek_v4.py
        # `$$` is Compose's escape for a literal `$`; bash sees "$@".
        exec vllm serve "$$@"
      - '--'

    environment:
      # Key-only entry: the value is taken from the environment (or .env file)
      # of the machine running Compose. Never commit the token to this file.
      # NOTE(review): the token previously hard-coded here must be considered
      # leaked and should be revoked.
      - HF_TOKEN
      - OMP_NUM_THREADS=64

    # Arguments forwarded to `vllm serve` by the entrypoint wrapper.
    command:
      # Model path inside the container (see the read-only mount below).
      - /model
      - --trust-remote-code
      - --kv-cache-dtype=fp8
      - --block-size=256
      - --enable-expert-parallel
      - --tensor-parallel-size=8
      # JSON-valued options are single-quoted so YAML always treats them as
      # opaque strings, never as flow mappings.
      - '--compilation-config={"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'
      - --attention_config.use_fp4_indexer_cache=True
      - --tokenizer-mode=deepseek_v4
      - '--speculative_config={"method":"mtp","num_speculative_tokens":2}'
      - --host=0.0.0.0
      - --port=8000

    # Reserve every NVIDIA GPU on the host for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]

    # Share the host IPC namespace with the container.
    ipc: host
    security_opt:
      - seccomp:unconfined
    tty: true
    stdin_open: true

    volumes:
      # Model weights, read-only.
      - /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4:/model:ro
      # Patch directory, read-only. It contains deepseek_v4.py, so no separate
      # single-file mount of that file is needed.
      - /root/nvidia-meeting/deepseek-v4-quant/patches:/patches:ro
      # Host-backed cache directory so its contents survive container
      # re-creation (presumably DeepGEMM artifacts, judging by the path).
      - /srv/vllmcache:/root/.cache/vllm/deep_gemm/cache

    # Host networking: the server's --host/--port bind directly on the host,
    # so no `ports:` mapping is declared.
    network_mode: host