clanker nonsense

This commit is contained in:
2026-05-12 21:28:50 +00:00
parent d88ea9842b
commit 25a2d4e6ad
2 changed files with 10 additions and 23 deletions

View File

@@ -10,7 +10,7 @@ CR_URL="atl.vultrcr.com/vllm"
# Container-registry login values and build settings (fragment; the rest of
# the script is outside this view).
# NOTE(review): CR_USER and CR_PASS are literal credentials stored in plain
# text. Consider reading them from the environment or a secret store, and
# rotate these values since they have been committed.
CR_USER="9aa177f7-b83b-4f5b-9171-588871a1534d"
CR_PASS="4rG45eDqAfAsce66nSmtk8UVVjXQVEfgJ292"
# Full image reference: registry URL + image name, pinned to the `latest` tag.
IMAGE_TAG="${CR_URL}/vllm-dsv4-nvfp4:latest"
# NOTE(review): CACHE_BUSTER is assigned twice in a row. The first form takes
# the script's first positional argument and defaults to 1; the second is a
# hard-coded literal that would override it if both lines ran. Presumably
# these are the before/after sides of a diff whose +/- markers were lost —
# confirm against the repository.
CACHE_BUSTER=${1:-1}
CACHE_BUSTER=69
# Directory path stored for later use; its consumer is not visible here.
COMPOSE_DIR="/root/nvidia-meeting"
echo "============================================"

View File

@@ -1,29 +1,24 @@
# Compose definition of a single `vllm` service.
# NOTE(review): this span looks like a rendered diff hunk with indentation and
# +/- markers stripped, so old and new lines appear back to back — confirm
# against the actual compose file before relying on it.
services:
vllm:
# NOTE(review): two consecutive `image` keys. Read as literal YAML this is a
# duplicate key (most parsers silently keep the last one); presumably they are
# the before/after sides of the diff.
image: atl.vultrcr.com/vllm/vllm-with-lmcache:dream-build
image: atl.vultrcr.com/vllm/vllm-dsv4-nvfp4:latest
# Pull the image on every start instead of reusing a locally cached copy.
pull_policy: always
# Entrypoint wrapper: runs a bash script that copies /patches/deepseek_v4.py
# over the deepseek_v4.py inside the installed vllm package, then execs
# `vllm serve` with the forwarded arguments. `$$@` is Compose's escape for a
# literal `$@`; the trailing `--` item fills bash's $0 slot so that the
# `command` entries arrive as "$@".
entrypoint:
- bash
- -c
- |
cp /patches/deepseek_v4.py /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/deepseek_v4.py
exec vllm serve "$$@"
- --
# Host port 8000 mapped to container port 8000 (quoted to keep it a string).
ports:
- "8000:8000"
# NOTE(review): HF_TOKEN is a literal access token stored in plain text.
# Consider supplying it through an env file or secret, and rotate this value
# since it has been committed.
environment:
- HF_TOKEN=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
- OMP_NUM_THREADS=128
- VLLM_USE_FLASHINFER_MOE_FP4=1
# Arguments appended to the entrypoint. Entries starting with `#-` are flags
# that have been commented out; where an active flag is immediately followed
# by its commented-out twin, the pair is presumably the before/after of the
# diff — confirm which form is current.
command:
- /model
- --trust-remote-code
- --kv-cache-dtype=fp8
- --block-size=256
#- --block-size=256
- --enable-expert-parallel
- --tensor-parallel-size=8
- --compilation-config={"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}
- --attention_config.use_fp4_indexer_cache=True
#- --compilation-config={"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}
#- --attention_config.use_fp4_indexer_cache=True
- --tokenizer-mode=deepseek_v4
- --speculative_config={"method":"mtp","num_speculative_tokens":2}
#- --speculative_config={"method":"mtp","num_speculative_tokens":2}
# Listen on all interfaces, port 8000.
- --host=0.0.0.0
- --port=8000
# The body of `deploy` is not contiguous with this span in the view.
deploy:
@@ -33,14 +28,6 @@ services:
# Device entry: nvidia driver, all devices, `gpu` capability. The enclosing
# keys are outside this view.
- driver: nvidia
count: all
capabilities: [gpu]
# Use the host's IPC namespace.
ipc: host
# Run without the default seccomp profile.
security_opt:
- seccomp:unconfined
# Allocate a TTY and keep stdin open.
tty: true
stdin_open: true
# Bind mounts: the model directory and the patch sources are read-only (`:ro`);
# /srv/vllmcache backs the deep_gemm cache path inside the container.
# NOTE(review): two entries target /patches — one mounts a single file at
# /patches/deepseek_v4.py, the other mounts the whole directory at /patches.
# Presumably these are the before/after sides of the diff — confirm that only
# one is meant to remain.
volumes:
- /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4:/model:ro
- /root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py:/patches/deepseek_v4.py:ro
- /root/nvidia-meeting/deepseek-v4-quant/patches:/patches:ro
- /srv/vllmcache:/root/.cache/vllm/deep_gemm/cache
# NOTE(review): host networking is combined with a `ports` mapping on this
# service; Compose documents these as incompatible — confirm which one is
# intended.
network_mode: host