feat: add Dockerfile + build/push script for NVFP4 container

- Extends dream-build with the DeepGEMM nvfp4-mega-moe kernel
- build_push.sh: builds, logs into Vultr CR, pushes, updates docker-compose
- CACHE_BUSTER parameter for forcing fresh clones
2026-05-11 05:57:49 +00:00
parent e963325b61
commit c234190a80
2 changed files with 93 additions and 0 deletions
--- a/24
+++ b/24
@@ -0,0 +1,24 @@
+# DeepSeek V4 NVFP4 vLLM + DeepGEMM Mega MoE
+# Extends the vLLM dream-build container with our custom DeepGEMM kernel
+# and DeepSeek V4 patch. The base image host matches the registry build_push.sh logs into.
+FROM atl.vultrcr.com/vllm/vllm-with-lmcache:dream-build
+
+# Build prerequisites. ca-certificates is listed explicitly (git only recommends it) so the HTTPS clone works.
+RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates cmake git screen \
+    && rm -rf /var/lib/apt/lists/*
+
+# Clone DeepGEMM with the NVFP4 mega_moe kernel.
+# Change the CACHE_BUSTER value (edit the default below or pass it as a build arg) to force a fresh clone.
+ARG CACHE_BUSTER=1
+RUN echo "DeepGEMM clone, cache buster ${CACHE_BUSTER}" && git clone -b nvfp4-mega-moe https://sweetapi.com/biondizzle/DeepGEMM.git /root/DeepGEMM
+
+# Build DeepGEMM (CUTLASS/CuTe headers come from flashinfer/vllm deps); an existing CPATH is appended without a stray ':'.
+ENV CPATH="/usr/local/lib/python3.12/dist-packages/flashinfer/data/cutlass/include:/usr/local/lib/python3.12/dist-packages/nvidia/cu13/include${CPATH:+:${CPATH}}"
+RUN cd /root/DeepGEMM && python3 setup.py build_ext --inplace
+
+# Copy our DeepSeek V4 patch into the image (will be applied at entrypoint).
+# The actual patch file is mounted at runtime, but we stage a default.
+COPY patches/deepseek_v4.py /defaults/deepseek_v4.py
+
+# Verify everything imports. NOTE(review): build_ext --inplace may not install deep_gemm; confirm this import resolves to /root/DeepGEMM.
+RUN python3 -c "import deep_gemm; print('DeepGEMM NVFP4 OK')" && python3 -c "import vllm; print('vLLM OK')"
--- a/build_push.sh
+++ b/build_push.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+# Build and push the DeepSeek V4 NVFP4 container
+# Usage: bash build_push.sh [CACHE_BUSTER_VALUE]
+# Always run in screen: screen -S build bash build_push.sh
+
+set -e
+
+# Config
+CR_URL="atl.vultrcr.com/vllm"
+CR_USER="9aa177f7-b83b-4f5b-9171-588871a1534d"
+CR_PASS="4rG45eDqAfAsce66nSmtk8UVVjXQVEfgJ292"
+IMAGE_TAG="${CR_URL}/vllm-dsv4-nvfp4:latest"
+CACHE_BUSTER=${1:-1}
+COMPOSE_DIR="/root/nvidia-meeting"
+
+echo "============================================"
+echo "DeepSeek V4 NVFP4 Container Build"
+echo "CACHE_BUSTER=$CACHE_BUSTER"
+echo "Image: $IMAGE_TAG"
+echo "============================================"
+
+# Inject CACHE_BUSTER into Dockerfile
+cd "$(dirname "$0")"
+# Update the CACHE_BUSTER line in Dockerfile
+sed -i "s/CACHE_BUSTER=.*/CACHE_BUSTER=${CACHE_BUSTER}/" Dockerfile
+
+echo "[1/4] Building container..."
+docker build -t "$IMAGE_TAG" .
+
+echo "[2/4] Logging into container registry..."
+echo "$CR_PASS" | docker login "$CR_URL" -u "$CR_USER" --password-stdin
+
+echo "[3/4] Pushing image..."
+docker push "$IMAGE_TAG"
+
+echo "[4/4] Updating docker-compose..."
+cat > "${COMPOSE_DIR}/docker-compose.yml" << 'EOF'
+services:
+  vllm:
+    image: atl.vultrcr.com/vllm/vllm-dsv4-nvfp4:latest
+    container_name: nvidia-meeting-vllm-1
+    ports:
+      - "8000:8000"
+    volumes:
+      - /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4:/model
+      - /root/nvidia-meeting/patches:/patches
+    environment:
+      - VLLM_USE_FLASHINFER_MOE_FP4=1
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 8
+              capabilities: [gpu]
+    command: >
+      --model /model
+      --tensor-parallel-size 8
+      --max-model-len 65536
+      --trust-remote-code
+      --enforce-eager
+      --kv-cache-dtype fp8
+      --port 8000
+EOF
+
+echo "============================================"
+echo "DONE! Container pushed to $IMAGE_TAG"
+echo "Start with: cd $COMPOSE_DIR && docker compose up -d"
+echo "============================================"