# nvfp4-megamoe-kernel/docker-compose.yml

# Single-service Compose stack: builds the image from the local Dockerfile and
# starts one container (`vllm`) that serves the model mounted at /model on
# port 8000, sharded across 8 GPUs with tensor parallelism.
services:
  vllm:
    build:
      context: .
      dockerfile: Dockerfile
    # Host port 8000 -> container port 8000; keep in sync with --port below.
    ports:
      - "8000:8000"
    # Share the host IPC namespace so the tensor-parallel worker processes are
    # not limited to Docker's default 64 MB /dev/shm. vLLM's Docker deployment
    # docs ask for this (or a large `shm_size`) because PyTorch exchanges data
    # between processes through shared memory.
    ipc: host
    environment:
      # OpenMP thread count for CPU-side work.
      # NOTE(review): 128 presumably matches the host's core count - confirm.
      - OMP_NUM_THREADS=128
      # 0 keeps CUDA kernel launches asynchronous; set to 1 only when debugging.
      - CUDA_LAUNCH_BLOCKING=0
      # Unbuffered stdout/stderr so logs reach `docker compose logs` promptly.
      - PYTHONUNBUFFERED=1
      # 600000 ms = 10 minutes.
      - VLLM_RPC_TIMEOUT_MS=600000
      # NOTE(review): project-specific switch; presumably enables NaN checks in
      # the custom kernel - its consumer is not visible in this file.
      - CLAWMINE_NAN_CHECK=1
    # Arguments for the image's entrypoint.
    # NOTE(review): presumably `vllm serve` (set in the Dockerfile) - confirm.
    command:
      # Model path; this is the container-side target of the volume below.
      - /model
      - --trust-remote-code
      - --enable-expert-parallel
      # One shard per GPU; needs at least 8 GPUs visible to the container.
      - --tensor-parallel-size=8
      # - --enforce-eager
      # --compilation-config takes the NEXT list item as its JSON value, so
      # exactly one of the JSON lines below may be uncommented at a time.
      - --compilation-config
      # - '{"cudagraph_mode": "NONE", "custom_ops": ["all"]}'
      # This is what is running right now:
      - '{"cudagraph_mode": "FULL_DECODE_ONLY", "custom_ops": ["all"], "cudagraph_capture_sizes": [1, 2, 4, 8], "max_cudagraph_capture_size": 8}'
      # - '{"cudagraph_mode":"FULL_AND_PIECEWISE", "custom_ops":["all"]}'
      # - --moe-backend=deep_gemm_mega_moe
      - --tokenizer-mode=deepseek_v4
      # - --attention_config.use_fp4_indexer_cache=True
      - --tool-call-parser=deepseek_v4
      - --enable-auto-tool-choice
      - --reasoning-parser=deepseek_v4
      - --gpu-memory-utilization=0.9
      # Listen on all interfaces so the published port is reachable.
      - --host=0.0.0.0
      - --port=8000
    # Reserve every NVIDIA GPU on the host for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Model weights are bind-mounted read-only from the host.
    volumes:
      - /root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4:/model:ro