# docker-compose service running vLLM's OpenAI-compatible server for
# HuggingFaceTB/SmolLM3-3B, with a custom chat template, a patched hermes
# tool-call parser, and a LoRA adapter for better tool calling.
services:
  vllm:
    image: vllm/vllm-openai:v0.19.0
    pull_policy: always
    # NOTE(review): privileged + seccomp:unconfined below grant the container
    # broad host access; confirm both are actually required for your GPU setup.
    privileged: true
    environment:
      # Read the Hugging Face token from the host environment instead of
      # committing a plaintext secret; `:?` makes Compose fail fast with a
      # message if HF_TOKEN is unset. (The previous hard-coded token should be
      # revoked — it has been exposed in version control.)
      - HF_TOKEN=${HF_TOKEN:?HF_TOKEN must be set in the host environment}
    # Arguments to the vLLM OpenAI server entrypoint; first positional arg is
    # the model to serve.
    command:
      - HuggingFaceTB/SmolLM3-3B
      - --host=0.0.0.0
      - --port=80
      - --chat-template-content-format=string
      - --chat-template=/root/chat_template.jinja
      - --enable-auto-tool-choice
      - --tool-call-parser=hermes
      - --reasoning-parser=deepseek_r1
      - --enable-lora
      - --lora-modules=smollm-toolcall=/root/loras/better-tool-call
      # - --max-model-len=131072
      # - --hf-overrides={"rope_scaling":{"type":"yarn","factor":2.0,"original_max_position_embeddings":65536}}
    # Reserve all host NVIDIA GPUs for this service.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Host IPC namespace — commonly needed for PyTorch/NCCL shared memory.
    ipc: host
    security_opt:
      - seccomp:unconfined
    tty: true
    stdin_open: true
    volumes:
      # Persist the Hugging Face model cache on the host.
      - /srv:/root/.cache/huggingface
      - ./chat_template.jinja:/root/chat_template.jinja
      # Overrides the packaged hermes tool parser with a local patched copy;
      # path is pinned to the image's Python 3.12 site layout — revisit on
      # image upgrades.
      - ./smol_tool_parser.py:/usr/local/lib/python3.12/dist-packages/vllm/tool_parsers/hermes_tool_parser.py
      - ./loras:/root/loras
    network_mode: host