# Docker Compose service running a vLLM OpenAI-compatible server for
# HuggingFaceTB/SmolLM3-3B with a custom chat template, a patched hermes
# tool-call parser (bind-mounted over the installed package file), and a
# LoRA adapter for tool calling. Uses host networking, so the API listens
# on host port 80 directly.
services:
  vllm:
    image: vllm/vllm-openai:v0.19.0
    pull_policy: always
    privileged: true
    environment:
      # SECURITY: a real HF token was previously hard-coded here and is now
      # exposed in VCS history — rotate it. Supply the replacement from the
      # host environment (`export HF_TOKEN=hf_...`); compose fails fast with
      # the message below if it is unset.
      - HF_TOKEN=${HF_TOKEN:?HF_TOKEN must be set in the host environment}
    command:
      - HuggingFaceTB/SmolLM3-3B
      - --host=0.0.0.0
      - --port=80
      - --chat-template-content-format=string
      - --chat-template=/root/chat_template.jinja
      - --enable-auto-tool-choice
      - --tool-call-parser=hermes
      - --reasoning-parser=deepseek_r1
      - --enable-lora
      - --lora-modules=smollm-toolcall=/root/loras/better-tool-call
      # Optional long-context settings, kept for reference:
      #- --max-model-len=131072
      #- --hf-overrides={"rope_scaling":{"type":"yarn","factor":2.0,"original_max_position_embeddings":65536}}
    deploy:
      resources:
        reservations:
          devices:
            # Expose all host NVIDIA GPUs to the container.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Share the host IPC namespace (vLLM uses shared memory between workers).
    ipc: host
    security_opt:
      - seccomp:unconfined
    tty: true
    stdin_open: true
    volumes:
      # Persist the HF model cache on the host.
      - /srv:/root/.cache/huggingface
      - ./chat_template.jinja:/root/chat_template.jinja
      # Overrides the packaged hermes parser inside the image; the target
      # path is tied to the image's Python 3.12 site layout — revisit on
      # image upgrades.
      - ./smol_tool_parser.py:/usr/local/lib/python3.12/dist-packages/vllm/tool_parsers/hermes_tool_parser.py
      - ./loras:/root/loras
    network_mode: host