sync B200 deployment files: Dockerfile, docker-compose, patches
This commit is contained in:
@@ -1,30 +1,22 @@
|
||||
services:
|
||||
vllm:
|
||||
image: atl.vultrcr.com/vllm/vllm-with-lmcache:dream-build
|
||||
pull_policy: always
|
||||
entrypoint:
|
||||
- bash
|
||||
- -c
|
||||
- |
|
||||
cp /patches/deepseek_v4.py /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/deepseek_v4.py
|
||||
exec vllm serve "$$@"
|
||||
- --
|
||||
build:
|
||||
context: .
|
||||
ports:
|
||||
- "8000:8000"
|
||||
environment:
|
||||
- HF_TOKEN=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
|
||||
- OMP_NUM_THREADS=128
|
||||
- MEGA_MOE_DEBUG=1
|
||||
- MEGA_MOE_STATIC=0
|
||||
- MEGA_MOE_USE_CUTLASS=1
|
||||
- DG_JIT_DEBUG=1
|
||||
command:
|
||||
- /model
|
||||
- --trust-remote-code
|
||||
- --kv-cache-dtype=fp8
|
||||
- --block-size=256
|
||||
- --enable-expert-parallel
|
||||
- --tensor-parallel-size=8
|
||||
- --compilation-config={"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}
|
||||
- --attention_config.use_fp4_indexer_cache=True
|
||||
- --enforce-eager
|
||||
- --tokenizer-mode=deepseek_v4
|
||||
- --tool-call-parser=deepseek_v4
|
||||
- --enable-auto-tool-choice
|
||||
- --reasoning-parser=deepseek_v4
|
||||
- --speculative_config={"method":"mtp","num_speculative_tokens":2}
|
||||
- --host=0.0.0.0
|
||||
- --port=8000
|
||||
deploy:
|
||||
@@ -34,13 +26,5 @@ services:
|
||||
- driver: nvidia
|
||||
count: all
|
||||
capabilities: [gpu]
|
||||
ipc: host
|
||||
security_opt:
|
||||
- seccomp:unconfined
|
||||
tty: true
|
||||
stdin_open: true
|
||||
volumes:
|
||||
- /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4:/model:ro
|
||||
- /root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py:/patches/deepseek_v4.py:ro
|
||||
- /root/nvidia-meeting/deepseek-v4-quant/patches:/patches:ro
|
||||
network_mode: host
|
||||
|
||||
Reference in New Issue
Block a user