# Review notes (text ahead of the first "diff --git" header is skipped by
# `git apply` and `patch`):
#
# * Line breaks and indentation were rebuilt from a whitespace-collapsed copy
#   of this patch. The docker-compose.yml indentation below assumes 2-space
#   nesting with indented sequences -- TODO confirm against the working tree,
#   or regenerate with `git diff`, before applying.
# * The first docker-compose.yml hunk header declares 29 old lines, but only
#   28 could be recovered from the collapsed text; one removed line
#   (presumably blank) appears to be lost. The new-side count (24) matches.
# * The "index" lines are carried over unchanged; the post-image hashes no
#   longer correspond to the edited content.
# * build_push.sh: CACHE_BUSTER keeps the positional override ($1) and only
#   changes the default to 69, instead of hard-coding 69.
# * docker-compose.yml: `ipc: host` is kept. The service runs with
#   --tensor-parallel-size=8, and vLLM's Docker guidance calls for host IPC
#   (or a larger --shm-size) for tensor-parallel inference.
# * docker-compose.yml: the empty line the original patch appended at end of
#   file is dropped.
# * SECURITY: CR_PASS (build_push.sh) and HF_TOKEN (docker-compose.yml) are
#   hard-coded on context lines and therefore already committed. They are left
#   untouched so the patch still applies; rotate both and load them from the
#   environment or a secret store in a follow-up change.

diff --git a/build_push.sh b/build_push.sh
index 31c9bcd..cb07c29 100644
--- a/build_push.sh
+++ b/build_push.sh
@@ -10,7 +10,7 @@ CR_URL="atl.vultrcr.com/vllm"
 CR_USER="9aa177f7-b83b-4f5b-9171-588871a1534d"
 CR_PASS="4rG45eDqAfAsce66nSmtk8UVVjXQVEfgJ292"
 IMAGE_TAG="${CR_URL}/vllm-dsv4-nvfp4:latest"
-CACHE_BUSTER=${1:-1}
+CACHE_BUSTER=${1:-69}
 COMPOSE_DIR="/root/nvidia-meeting"
 
 echo "============================================"
diff --git a/docker-compose.yml b/docker-compose.yml
index 9d72f49..f1b1da2 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,29 +1,24 @@
 services:
   vllm:
-    image: atl.vultrcr.com/vllm/vllm-with-lmcache:dream-build
+    image: atl.vultrcr.com/vllm/vllm-dsv4-nvfp4:latest
     pull_policy: always
-    entrypoint:
-      - bash
-      - -c
-      - |
-        cp /patches/deepseek_v4.py /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/deepseek_v4.py
-        exec vllm serve "$$@"
-      - --
+    ports:
+      - "8000:8000"
     environment:
       - HF_TOKEN=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
       - OMP_NUM_THREADS=128
+      - VLLM_USE_FLASHINFER_MOE_FP4=1
     command:
       - /model
       - --trust-remote-code
       - --kv-cache-dtype=fp8
-      - --block-size=256
+      #- --block-size=256
       - --enable-expert-parallel
       - --tensor-parallel-size=8
-      - --compilation-config={"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}
-      - --attention_config.use_fp4_indexer_cache=True
+      #- --compilation-config={"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}
+      #- --attention_config.use_fp4_indexer_cache=True
       - --tokenizer-mode=deepseek_v4
-      - --speculative_config={"method":"mtp","num_speculative_tokens":2}
+      #- --speculative_config={"method":"mtp","num_speculative_tokens":2}
       - --host=0.0.0.0
       - --port=8000
     deploy:
@@ -33,14 +28,6 @@ services:
             - driver: nvidia
               count: all
               capabilities: [gpu]
     ipc: host
-    security_opt:
-      - seccomp:unconfined
-    tty: true
-    stdin_open: true
     volumes:
       - /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4:/model:ro
-      - /root/nvidia-meeting/deepseek-v4-quant/patches/deepseek_v4.py:/patches/deepseek_v4.py:ro
-      - /root/nvidia-meeting/deepseek-v4-quant/patches:/patches:ro
-      - /srv/vllmcache:/root/.cache/vllm/deep_gemm/cache
-    network_mode: host