diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index 3c1028929..5b4a81d4f 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -521,7 +521,7 @@ First, launch the OpenAI-compatible server: ```bash vllm serve microsoft/Phi-3.5-vision-instruct --runner generate \ - --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}' + --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt.image 2 ``` Then, you can use the OpenAI client as follows: diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index ac02e9bde..7ff9531c5 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -658,7 +658,7 @@ On the other hand, modalities separated by `/` are mutually exclusive. See [this page](../features/multimodal_inputs.md) on how to pass multi-modal inputs to the model. !!! tip - For hybrid-only models such as Llama-4, Step3 and Mistral-3, a text-only mode can be enabled by setting all supported multimodal modalities to 0 (e.g, `--limit-mm-per-prompt '{"image":0}`) so that their multimodal modules will not be loaded to free up more GPU memory for KV cache. + For hybrid-only models such as Llama-4, Step3, Mistral-3, and Qwen-3.5, a text-only mode can be enabled by setting all supported multimodal modalities to 0 (via the `--language-model-only` flag) so that their multimodal modules will not be loaded to free up more GPU memory for KV cache. !!! note vLLM currently supports adding LoRA adapters to the language backbone for most multimodal models. Additionally, vLLM now experimentally supports adding LoRA to the tower and connector modules for some multimodal models. See [this page](../features/lora.md). 
diff --git a/examples/offline_inference/mistral-small.py b/examples/offline_inference/mistral-small.py index 1f6e5ba14..0879b0dfa 100644 --- a/examples/offline_inference/mistral-small.py +++ b/examples/offline_inference/mistral-small.py @@ -18,11 +18,11 @@ from vllm.assets.image import ImageAsset # # Mistral format # vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \ # --tokenizer-mode mistral --config-format mistral --load-format mistral \ -# --limit-mm-per-prompt '{"image":4}' --max-model-len 16384 +# --limit-mm-per-prompt.image 4 --max-model-len 16384 # # # HF format # vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \ -# --limit-mm-per-prompt '{"image":4}' --max-model-len 16384 +# --limit-mm-per-prompt.image 4 --max-model-len 16384 # ``` # # - Client: diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py index 198863ae4..37f46b369 100644 --- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py @@ -10,7 +10,7 @@ vllm serve llava-hf/llava-1.5-7b-hf (multi-image inference with Phi-3.5-vision-instruct) vllm serve microsoft/Phi-3.5-vision-instruct --runner generate \ - --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}' + --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt.image 2 (audio inference with Ultravox) vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b \ diff --git a/examples/pooling/classify/vision_classification_online.py b/examples/pooling/classify/vision_classification_online.py index 64dc5d4ae..021d3dfe5 100644 --- a/examples/pooling/classify/vision_classification_online.py +++ b/examples/pooling/classify/vision_classification_online.py @@ -7,7 +7,7 @@ NOTE: vllm serve muziyongshixin/Qwen2.5-VL-7B-for-VideoCls \ --runner pooling \ --max-model-len 5000 \ - --limit-mm-per-prompt '{"video": 1}' \ + 
--limit-mm-per-prompt.video 1 \ --hf-overrides '{"text_config": {"architectures": ["Qwen2_5_VLForSequenceClassification"]}}' """ diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 68244ba2f..7a10783e8 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -55,12 +55,12 @@ class MultiModalConfig: """Controls the behavior of multimodal models.""" language_model_only: bool = False - """If True, disables all multimodal inputs by setting all modality limits - to 0. Equivalent to setting --limit-mm-per-prompt to 0 for every - modality.""" + """If True, disables all multimodal inputs by setting all modality limits to 0. + Equivalent to setting `--limit-mm-per-prompt` to 0 for every modality.""" limit_per_prompt: dict[str, DummyOptions] = Field(default_factory=dict) """The maximum number of input items and options allowed per - prompt for each modality. + prompt for each modality. + Defaults to 999 items per modality. Legacy format (count only):