[Core] Support image processor (#4197)
This commit is contained in:
@@ -90,6 +90,7 @@ autodoc_mock_imports = [
|
||||
"sentencepiece",
|
||||
"vllm.cuda_utils",
|
||||
"vllm._C",
|
||||
"PIL",
|
||||
"numpy",
|
||||
"tqdm",
|
||||
"tensorizer",
|
||||
@@ -116,12 +117,13 @@ class MockedClassDocumenter(autodoc.ClassDocumenter):
|
||||
autodoc.ClassDocumenter = MockedClassDocumenter
|
||||
|
||||
intersphinx_mapping = {
|
||||
'python': ('https://docs.python.org/3', None),
|
||||
'typing_extensions':
|
||||
('https://typing-extensions.readthedocs.io/en/latest', None),
|
||||
'numpy': ('https://numpy.org/doc/stable', None),
|
||||
'torch': ('https://pytorch.org/docs/stable', None),
|
||||
'psutil': ('https://psutil.readthedocs.io/en/stable', None),
|
||||
"python": ("https://docs.python.org/3", None),
|
||||
"typing_extensions":
|
||||
("https://typing-extensions.readthedocs.io/en/latest", None),
|
||||
"pillow": ("https://pillow.readthedocs.io/en/stable", None),
|
||||
"numpy": ("https://numpy.org/doc/stable", None),
|
||||
"torch": ("https://pytorch.org/docs/stable", None),
|
||||
"psutil": ("https://psutil.readthedocs.io/en/stable", None),
|
||||
}
|
||||
|
||||
autodoc_preserve_defaults = True
|
||||
|
||||
51
docs/source/dev/multimodal/multimodal_index.rst
Normal file
51
docs/source/dev/multimodal/multimodal_index.rst
Normal file
@@ -0,0 +1,51 @@
|
||||
Multi-Modality
|
||||
==============
|
||||
|
||||
.. currentmodule:: vllm.multimodal
|
||||
|
||||
vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package.
|
||||
|
||||
:class:`vllm.inputs.PromptStrictInputs` accepts an additional attribute ``multi_modal_data``
|
||||
which allows you to pass in multi-modal input alongside text and token prompts.
|
||||
|
||||
By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model,
|
||||
you must decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_dummy_data <MultiModalRegistry.register_dummy_data>`,
|
||||
as well as :meth:`MULTIMODAL_REGISTRY.register_input <MultiModalRegistry.register_input>` for each modality type that the model should support.
|
||||
|
||||
.. contents::
|
||||
:local:
|
||||
:backlinks: none
|
||||
|
||||
Module Contents
|
||||
+++++++++++++++
|
||||
|
||||
.. automodule:: vllm.multimodal
|
||||
|
||||
Registry
|
||||
--------
|
||||
|
||||
.. data:: vllm.multimodal.MULTIMODAL_REGISTRY
|
||||
|
||||
The global :class:`MultiModalRegistry` instance used by model runners.
|
||||
|
||||
.. autoclass:: vllm.multimodal.MultiModalRegistry
|
||||
:members:
|
||||
:show-inheritance:
|
||||
|
||||
Base Classes
|
||||
------------
|
||||
|
||||
.. autoclass:: vllm.multimodal.MultiModalData
|
||||
:members:
|
||||
:show-inheritance:
|
||||
|
||||
.. autoclass:: vllm.multimodal.MultiModalPlugin
|
||||
:members:
|
||||
:show-inheritance:
|
||||
|
||||
Image Classes
|
||||
-------------
|
||||
|
||||
.. automodule:: vllm.multimodal.image
|
||||
:members:
|
||||
:show-inheritance:
|
||||
@@ -88,6 +88,7 @@ Documentation
|
||||
models/adding_model
|
||||
models/engine_args
|
||||
models/lora
|
||||
models/vlm
|
||||
models/performance
|
||||
|
||||
.. toctree::
|
||||
@@ -99,17 +100,18 @@ Documentation
|
||||
quantization/fp8_e4m3_kvcache
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:maxdepth: 1
|
||||
:caption: Developer Documentation
|
||||
|
||||
dev/sampling_params
|
||||
dev/offline_inference/offline_index
|
||||
dev/engine/engine_index
|
||||
dev/kernel/paged_attention
|
||||
dev/multimodal/multimodal_index
|
||||
dev/dockerfile/dockerfile
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:maxdepth: 1
|
||||
:caption: Community
|
||||
|
||||
community/meetups
|
||||
|
||||
@@ -87,6 +87,10 @@ Alongside each architecture, we include some popular models that use it.
|
||||
- LLaMA, Llama 2, Meta Llama 3, Vicuna, Alpaca, Yi
|
||||
- :code:`meta-llama/Meta-Llama-3-8B-Instruct`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc.
|
||||
- ✅︎
|
||||
* - :code:`LlavaForConditionalGeneration`
|
||||
- LLaVA-1.5
|
||||
- :code:`llava-hf/llava-1.5-7b-hf`\*, :code:`llava-hf/llava-1.5-13b-hf`\*, etc.
|
||||
-
|
||||
* - :code:`MiniCPMForCausalLM`
|
||||
- MiniCPM
|
||||
- :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, etc.
|
||||
|
||||
56
docs/source/models/vlm.rst
Normal file
56
docs/source/models/vlm.rst
Normal file
@@ -0,0 +1,56 @@
|
||||
.. _vlm:
|
||||
|
||||
Using VLMs
|
||||
==========
|
||||
|
||||
This document shows you how to run and serve Vision Language Models (VLMs) using vLLM.
|
||||
|
||||
Engine Arguments
|
||||
----------------
|
||||
|
||||
The following :ref:`engine arguments <engine_args>` are specific to VLMs:
|
||||
|
||||
.. argparse::
|
||||
:module: vllm.engine.arg_utils
|
||||
:func: _vlm_engine_args_parser
|
||||
:prog: -m vllm.entrypoints.openai.api_server
|
||||
:nodefaultconst:
|
||||
|
||||
Offline Batched Inference
|
||||
-------------------------
|
||||
|
||||
To initialize a VLM, pass the aforementioned arguments to the ``LLM`` class when instantiating the engine.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
llm = LLM(
|
||||
model="llava-hf/llava-1.5-7b-hf",
|
||||
image_input_type="pixel_values",
|
||||
image_token_id=32000,
|
||||
image_input_shape="1,3,336,336",
|
||||
image_feature_size=576,
|
||||
)
|
||||
|
||||
For now, we only support a single image per text prompt. To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:
|
||||
|
||||
* ``prompt``: The prompt should have a number of ``<image>`` tokens equal to ``image_feature_size``.
|
||||
* ``multi_modal_data``: This should be an instance of :class:`~vllm.multimodal.image.ImagePixelData` or :class:`~vllm.multimodal.image.ImageFeatureData`.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
prompt = "<image>" * 576 + (
|
||||
"\nUSER: What is the content of this image?\nASSISTANT:")
|
||||
|
||||
# Load the image using PIL.Image
|
||||
image = ...
|
||||
|
||||
outputs = llm.generate({
|
||||
"prompt": prompt,
|
||||
"multi_modal_data": ImagePixelData(image),
|
||||
})
|
||||
|
||||
for o in outputs:
|
||||
generated_text = o.outputs[0].text
|
||||
print(generated_text)
|
||||
|
||||
A code example can be found in `examples/llava_example.py <https://github.com/vllm-project/vllm/blob/main/examples/llava_example.py>`_.
|
||||
Reference in New Issue
Block a user