From f9e2a383869c56a1fbee048afc9501ced9194c7e Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Thu, 19 Mar 2026 19:25:47 +0800 Subject: [PATCH] [Docs] Reorganize pooling docs. (#35592) Signed-off-by: wang.yuqi Signed-off-by: wang.yuqi Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Cyrus Leung Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .github/CODEOWNERS | 1 + docs/.nav.yml | 2 +- docs/contributing/model/tests.md | 2 +- docs/features/README.md | 6 +- docs/mkdocs/hooks/url_schemes.py | 134 ++- docs/models/pooling_models.md | 716 ---------------- docs/models/pooling_models/README.md | 253 ++++++ docs/models/pooling_models/classify.md | 276 ++++++ docs/models/pooling_models/embed.md | 546 ++++++++++++ docs/models/pooling_models/reward.md | 136 +++ docs/models/pooling_models/scoring.md | 448 ++++++++++ docs/models/pooling_models/specific_models.md | 395 +++++++++ docs/models/pooling_models/token_classify.md | 89 ++ docs/models/pooling_models/token_embed.md | 125 +++ docs/models/supported_models.md | 212 +---- docs/serving/offline_inference.md | 2 +- docs/serving/openai_compatible_server.md | 786 +----------------- 17 files changed, 2393 insertions(+), 1736 deletions(-) delete mode 100644 docs/models/pooling_models.md create mode 100644 docs/models/pooling_models/README.md create mode 100644 docs/models/pooling_models/classify.md create mode 100644 docs/models/pooling_models/embed.md create mode 100644 docs/models/pooling_models/reward.md create mode 100644 docs/models/pooling_models/scoring.md create mode 100644 docs/models/pooling_models/specific_models.md create mode 100644 docs/models/pooling_models/token_classify.md create mode 100644 docs/models/pooling_models/token_embed.md diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 653d6c42e..b0e494327 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -171,6 +171,7 @@ mkdocs.yaml @hmellor # Pooling models /examples/pooling @noooop 
+/docs/models/pooling_models @noooop /tests/models/*/pooling* @noooop /tests/entrypoints/pooling @noooop /vllm/config/pooler.py @noooop diff --git a/docs/.nav.yml b/docs/.nav.yml index 835cc773e..89584442e 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -25,7 +25,7 @@ nav: - Models: - models/supported_models.md - models/generative_models.md - - models/pooling_models.md + - Pooling Models: models/pooling_models - models/extensions - Hardware Supported Models: - models/hardware_supported_models/* diff --git a/docs/contributing/model/tests.md b/docs/contributing/model/tests.md index 3ccd90cc6..92ce0170c 100644 --- a/docs/contributing/model/tests.md +++ b/docs/contributing/model/tests.md @@ -37,7 +37,7 @@ For [generative models](../../models/generative_models.md), there are two levels #### Pooling models -For [pooling models](../../models/pooling_models.md), we simply check the cosine similarity, as defined in [tests/models/utils.py](../../../tests/models/utils.py). +For [pooling models](../../models/pooling_models/README.md), we simply check the cosine similarity, as defined in [tests/models/utils.py](../../../tests/models/utils.py). 
### Multi-modal processing diff --git a/docs/features/README.md b/docs/features/README.md index 6c10cf100..e62d9cdde 100644 --- a/docs/features/README.md +++ b/docs/features/README.md @@ -36,14 +36,14 @@ th:not(:first-child) { } -| Feature | [CP](../configuration/optimization.md#chunked-prefill) | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](speculative_decoding/README.md) | CUDA graph | [pooling](../models/pooling_models.md) | enc-dec | logP | prmpt logP | async output | multi-step | mm | best-of | beam-search | [prompt-embeds](prompt_embeds.md) | +| Feature | [CP](../configuration/optimization.md#chunked-prefill) | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](speculative_decoding/README.md) | CUDA graph | [pooling](../models/pooling_models/README.md) | enc-dec | logP | prmpt logP | async output | multi-step | mm | best-of | beam-search | [prompt-embeds](prompt_embeds.md) | | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | | [CP](../configuration/optimization.md#chunked-prefill) | ✅ | | | | | | | | | | | | | | | | [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | | | [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | | | [SD](speculative_decoding/README.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | | | CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | -| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | | +| [pooling](../models/pooling_models/README.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | | | enc-dec | ❌ | [❌](https://github.com/vllm-project/vllm/issues/7366) | ❌ | [❌](https://github.com/vllm-project/vllm/issues/7366) | ✅ | ✅ | ✅ | | | | | | | | | | logP | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | | | prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | | @@ -66,7 +66,7 @@ th:not(:first-child) { | [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [SD](speculative_decoding/README.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | CUDA graph | ✅ | ✅ | ✅ | ✅ | 
✅ | ❌ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/26970) | -| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [pooling](../models/pooling_models/README.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | enc-dec | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | | [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | diff --git a/docs/mkdocs/hooks/url_schemes.py b/docs/mkdocs/hooks/url_schemes.py index 66fa25d2a..4d5034990 100644 --- a/docs/mkdocs/hooks/url_schemes.py +++ b/docs/mkdocs/hooks/url_schemes.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ -MkDocs hook to enable the following links to render correctly: +MkDocs hook + markdown extension to enable the following links to render correctly, +including inside content included via pymdownx.snippets: - Relative file links outside of the `docs/` directory, e.g.: - [Text](../some_file.py) @@ -12,13 +13,17 @@ MkDocs hook to enable the following links to render correctly: e.g. <...pull/123> -> [Pull Request #123](.../pull/123) - Works for external repos too by including the `owner/repo` in the link title -The goal is to simplify cross-referencing common GitHub resources -in project docs. +The link replacement runs as a markdown preprocessor (priority 25) so that it executes +after pymdownx.snippets (priority 32) has expanded all included content. +The on_page_markdown hook passes the current page context to the preprocessor before +each page is converted. 
""" from pathlib import Path import regex as re +from markdown import Extension +from markdown.preprocessors import Preprocessor from mkdocs.config.defaults import MkDocsConfig from mkdocs.structure.files import Files from mkdocs.structure.pages import Page @@ -26,7 +31,6 @@ from mkdocs.structure.pages import Page ROOT_DIR = Path(__file__).parent.parent.parent.parent.resolve() DOC_DIR = ROOT_DIR / "docs" - gh_icon = ":octicons-mark-github-16:" # Regex pieces @@ -48,46 +52,90 @@ github_link = re.compile(rf"(\[{TITLE}\]\(|<){URL}(\)|>)") relative_link = re.compile(rf"\[{TITLE}\]\({RELATIVE}\)") +class UrlSchemesPreprocessor(Preprocessor): + """Preprocessor that runs after pymdownx.snippets to process all links.""" + + def __init__(self, md, ext): + super().__init__(md) + self.ext = ext + + def run(self, lines): + page = self.ext.page + if page is None or getattr(page.file, "abs_src_path", None) is None: + return lines + + def replace_relative_link(match: re.Match) -> str: + """ + Replace relative file links with URLs if they point outside the docs dir. + """ + title = match.group("title") + path = match.group("path") + path = (Path(page.file.abs_src_path).parent / path).resolve() + fragment = match.group("fragment") or "" + + # Check if the path exists and is outside the docs dir + if not path.exists() or path.is_relative_to(DOC_DIR): + return match.group(0) + + # Files and directories have different URL schemes on GitHub + slug = "tree/main" if path.is_dir() else "blob/main" + + path = path.relative_to(ROOT_DIR) + url = f"https://github.com/vllm-project/vllm/{slug}/{path}{fragment}" + return f"[{gh_icon} {title}]({url})" + + def replace_github_link(match: re.Match) -> str: + """ + Replace GitHub issue, PR, and project links with enhanced Markdown links. 
+ """ + repo = match.group("repo") + type = match.group("type") + number = match.group("number") + # Title and fragment could be None + title = match.group("title") or "" + fragment = match.group("fragment") or "" + + # Use default titles for raw links + if not title: + title = TITLES[type] + if "vllm-project" not in repo: + title += repo + title += f"#{number}" + + url = f"https://github.com/{repo}/{type}/{number}{fragment}" + return f"[{gh_icon} {title}]({url})" + + markdown = "\n".join(lines) + markdown = relative_link.sub(replace_relative_link, markdown) + markdown = github_link.sub(replace_github_link, markdown) + return markdown.split("\n") + + +class UrlSchemesExtension(Extension): + """Markdown extension that registers the URL schemes preprocessor.""" + + def __init__(self, **kwargs): + self.page = None + super().__init__(**kwargs) + + def extendMarkdown(self, md): + # Priority 25 runs after pymdownx.snippets (priority 32) + md.preprocessors.register(UrlSchemesPreprocessor(md, self), "url_schemes", 25) + + +# Singleton extension instance shared between the hook and the preprocessor. 
+_ext = UrlSchemesExtension() + + +def on_config(config: MkDocsConfig) -> MkDocsConfig: + """Register the URL schemes markdown extension.""" + config["markdown_extensions"].append(_ext) + return config + + def on_page_markdown( markdown: str, *, page: Page, config: MkDocsConfig, files: Files ) -> str: - def replace_relative_link(match: re.Match) -> str: - """Replace relative file links with URLs if they point outside the docs dir.""" - title = match.group("title") - path = match.group("path") - path = (Path(page.file.abs_src_path).parent / path).resolve() - fragment = match.group("fragment") or "" - - # Check if the path exists and is outside the docs dir - if not path.exists() or path.is_relative_to(DOC_DIR): - return match.group(0) - - # Files and directories have different URL schemes on GitHub - slug = "tree/main" if path.is_dir() else "blob/main" - - path = path.relative_to(ROOT_DIR) - url = f"https://github.com/vllm-project/vllm/{slug}/{path}{fragment}" - return f"[{gh_icon} {title}]({url})" - - def replace_github_link(match: re.Match) -> str: - """Replace GitHub issue, PR, and project links with enhanced Markdown links.""" - repo = match.group("repo") - type = match.group("type") - number = match.group("number") - # Title and fragment could be None - title = match.group("title") or "" - fragment = match.group("fragment") or "" - - # Use default titles for raw links - if not title: - title = TITLES[type] - if "vllm-project" not in repo: - title += repo - title += f"#{number}" - - url = f"https://github.com/{repo}/{type}/{number}{fragment}" - return f"[{gh_icon} {title}]({url})" - - markdown = relative_link.sub(replace_relative_link, markdown) - markdown = github_link.sub(replace_github_link, markdown) + """Pass the current page context to the preprocessor.""" + _ext.page = page return markdown diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md deleted file mode 100644 index 9081b5e82..000000000 --- a/docs/models/pooling_models.md +++ 
/dev/null @@ -1,716 +0,0 @@ -# Pooling Models - -vLLM also supports pooling models, such as embedding, classification, and reward models. - -In vLLM, pooling models implement the [VllmModelForPooling][vllm.model_executor.models.VllmModelForPooling] interface. -These models use a [Pooler][vllm.model_executor.layers.pooler.Pooler] to extract the final hidden states of the input -before returning them. - -!!! note - We currently support pooling models primarily for convenience. This is not guaranteed to provide any performance improvements over using Hugging Face Transformers or Sentence Transformers directly. - - We plan to optimize pooling models in vLLM. Please comment on if you have any suggestions! - -## Configuration - -### Model Runner - -Run a model in pooling mode via the option `--runner pooling`. - -!!! tip - There is no need to set this option in the vast majority of cases as vLLM can automatically - detect the appropriate model runner via `--runner auto`. - -### Model Conversion - -vLLM can adapt models for various pooling tasks via the option `--convert `. - -If `--runner pooling` has been set (manually or automatically) but the model does not implement the -[VllmModelForPooling][vllm.model_executor.models.VllmModelForPooling] interface, -vLLM will attempt to automatically convert the model according to the architecture names -shown in the table below. - -| Architecture | `--convert` | Supported pooling tasks | -| ----------------------------------------------- | ----------- | ------------------------------------- | -| `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed` | `token_embed`, `embed` | -| `*ForRewardModeling`, `*RewardModel` | `embed` | `token_embed`, `embed` | -| `*For*Classification`, `*ClassificationModel` | `classify` | `token_classify`, `classify`, `score` | - -!!! tip - You can explicitly set `--convert ` to specify how to convert the model. 
- -### Pooling Tasks - -Each pooling model in vLLM supports one or more of these tasks according to -[Pooler.get_supported_tasks][vllm.model_executor.layers.pooler.Pooler.get_supported_tasks], -enabling the corresponding APIs: - -| Task | APIs | -| ---------------- | ----------------------------------------------------------------------------- | -| `embed` | `LLM.embed(...)`, `LLM.score(...)`\*, `LLM.encode(..., pooling_task="embed")` | -| `classify` | `LLM.classify(...)`, `LLM.encode(..., pooling_task="classify")` | -| `score` | `LLM.score(...)` | -| `token_classify` | `LLM.reward(...)`, `LLM.encode(..., pooling_task="token_classify")` | -| `token_embed` | `LLM.encode(..., pooling_task="token_embed")` | -| `plugin` | `LLM.encode(..., pooling_task="plugin")` | - -\* The `LLM.score(...)` API falls back to `embed` task if the model does not support `score` task. - -### Pooler Configuration - -#### Predefined models - -If the [Pooler][vllm.model_executor.layers.pooler.Pooler] defined by the model accepts `pooler_config`, -you can override some of its attributes via the `--pooler-config` option. - -#### Converted models - -If the model has been converted via `--convert` (see above), -the pooler assigned to each task has the following attributes by default: - -| Task | Pooling Type | Normalization | Softmax | -| ---------- | ------------ | ------------- | ------- | -| `embed` | `LAST` | ✅︎ | ❌ | -| `classify` | `LAST` | ❌ | ✅︎ | - -When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models, -its Sentence Transformers configuration file (`modules.json`) takes priority over the model's defaults. - -You can further customize this via the `--pooler-config` option, -which takes priority over both the model's and Sentence Transformers' defaults. - -## Offline Inference - -The [LLM][vllm.LLM] class provides various methods for offline inference. 
-See [configuration](../api/README.md#configuration) for a list of options when initializing the model. - -### `LLM.embed` - -The [embed][vllm.LLM.embed] method outputs an embedding vector for each prompt. -It is primarily designed for embedding models. - -```python -from vllm import LLM - -llm = LLM(model="intfloat/e5-small", runner="pooling") -(output,) = llm.embed("Hello, my name is") - -embeds = output.outputs.embedding -print(f"Embeddings: {embeds!r} (size={len(embeds)})") -``` - -A code example can be found here: [examples/basic/offline_inference/embed.py](../../examples/basic/offline_inference/embed.py) - -### `LLM.classify` - -The [classify][vllm.LLM.classify] method outputs a probability vector for each prompt. -It is primarily designed for classification models. - -```python -from vllm import LLM - -llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", runner="pooling") -(output,) = llm.classify("Hello, my name is") - -probs = output.outputs.probs -print(f"Class Probabilities: {probs!r} (size={len(probs)})") -``` - -A code example can be found here: [examples/basic/offline_inference/classify.py](../../examples/basic/offline_inference/classify.py) - -### `LLM.score` - -The [score][vllm.LLM.score] method outputs similarity scores between sentence pairs. -It is designed for embedding models and cross-encoder models. Embedding models use cosine similarity, and [cross-encoder models](https://www.sbert.net/examples/applications/cross-encoder/README.html) serve as rerankers between candidate query-document pairs in RAG systems. - -!!! note - vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG. - To handle RAG at a higher level, you should use integration frameworks such as [LangChain](https://github.com/langchain-ai/langchain). 
- -```python -from vllm import LLM - -llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling") -(output,) = llm.score( - "What is the capital of France?", - "The capital of Brazil is Brasilia.", -) - -score = output.outputs.score -print(f"Score: {score}") -``` - -A code example can be found here: [examples/basic/offline_inference/score.py](../../examples/basic/offline_inference/score.py) - -### `LLM.reward` - -The [reward][vllm.LLM.reward] method is available to all reward models in vLLM. - -```python -from vllm import LLM - -llm = LLM(model="internlm/internlm2-1_8b-reward", runner="pooling", trust_remote_code=True) -(output,) = llm.reward("Hello, my name is") - -data = output.outputs.data -print(f"Data: {data!r}") -``` - -A code example can be found here: [examples/basic/offline_inference/reward.py](../../examples/basic/offline_inference/reward.py) - -### `LLM.encode` - -The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM. - -!!! note - Please use one of the more specific methods or set the task directly when using `LLM.encode`: - - - For embeddings, use `LLM.embed(...)` or `pooling_task="embed"`. - - For classification logits, use `LLM.classify(...)` or `pooling_task="classify"`. - - For similarity scores, use `LLM.score(...)`. - - For rewards, use `LLM.reward(...)` or `pooling_task="token_classify"`. - - For token classification, use `pooling_task="token_classify"`. - - For multi-vector retrieval, use `pooling_task="token_embed"`. - - For IO Processor Plugins, use `pooling_task="plugin"`. 
- -```python -from vllm import LLM - -llm = LLM(model="intfloat/e5-small", runner="pooling") -(output,) = llm.encode("Hello, my name is", pooling_task="embed") - -data = output.outputs.data -print(f"Data: {data!r}") -``` - -## Online Serving - -Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs: - -- [Embeddings API](../serving/openai_compatible_server.md#embeddings-api) is similar to `LLM.embed`, accepting both text and [multi-modal inputs](../features/multimodal_inputs.md) for embedding models. -- [Classification API](../serving/openai_compatible_server.md#classification-api) is similar to `LLM.classify` and is applicable to sequence classification models. -- [Score API](../serving/openai_compatible_server.md#score-api) is similar to `LLM.score` for cross-encoder models. -- [Pooling API](../serving/openai_compatible_server.md#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models. - -!!! note - Please use one of the more specific endpoints or set the task directly when using the [Pooling API](../serving/openai_compatible_server.md#pooling-api): - - - For embeddings, use [Embeddings API](../serving/openai_compatible_server.md#embeddings-api) or `"task":"embed"`. - - For classification logits, use [Classification API](../serving/openai_compatible_server.md#classification-api) or `"task":"classify"`. - - For similarity scores, use [Score API](../serving/openai_compatible_server.md#score-api). - - For rewards, use `"task":"token_classify"`. - - For token classification, use `"task":"token_classify"`. - - For multi-vector retrieval, use `"task":"token_embed"`. - - For IO Processor Plugins, use `"task":"plugin"`. - -```python -# start a supported embeddings model server with `vllm serve`, e.g. 
-# vllm serve intfloat/e5-small -import requests - -host = "localhost" -port = "8000" -model_name = "intfloat/e5-small" - -api_url = f"http://{host}:{port}/pooling" - -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] -prompt = {"model": model_name, "input": prompts, "task": "embed"} - -response = requests.post(api_url, json=prompt) - -for output in response.json()["data"]: - data = output["data"] - print(f"Data: {data!r} (size={len(data)})") -``` - -## Matryoshka Embeddings - -[Matryoshka Embeddings](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings) or [Matryoshka Representation Learning (MRL)](https://arxiv.org/abs/2205.13147) is a technique used in training embedding models. It allows users to trade off between performance and cost. - -!!! warning - Not all embedding models are trained using Matryoshka Representation Learning. To avoid misuse of the `dimensions` parameter, vLLM returns an error for requests that attempt to change the output dimension of models that do not support Matryoshka Embeddings. - - For example, setting `dimensions` parameter while using the `BAAI/bge-m3` model will result in the following error. - - ```json - {"object":"error","message":"Model \"BAAI/bge-m3\" does not support matryoshka representation, changing output dimensions will lead to poor results.","type":"BadRequestError","param":null,"code":400} - ``` - -### Manually enable Matryoshka Embeddings - -There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, if `is_matryoshka` is `True` in `config.json`, you can change the output dimension to arbitrary values. Use `matryoshka_dimensions` to control the allowed output dimensions. 
- -For models that support Matryoshka Embeddings but are not recognized by vLLM, manually override the config using `hf_overrides={"is_matryoshka": True}` or `hf_overrides={"matryoshka_dimensions": []}` (offline), or `--hf-overrides '{"is_matryoshka": true}'` or `--hf-overrides '{"matryoshka_dimensions": []}'` (online). - -Here is an example to serve a model with Matryoshka Embeddings enabled. - -```bash -vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf-overrides '{"matryoshka_dimensions":[256]}' -``` - -### Offline Inference - -You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter in [PoolingParams][vllm.PoolingParams]. - -```python -from vllm import LLM, PoolingParams - -llm = LLM( - model="jinaai/jina-embeddings-v3", - runner="pooling", - trust_remote_code=True, -) -outputs = llm.embed( - ["Follow the white rabbit."], - pooling_params=PoolingParams(dimensions=32), -) -print(outputs[0].outputs) -``` - -A code example can be found here: [examples/pooling/embed/embed_matryoshka_fy_offline.py](../../examples/pooling/embed/embed_matryoshka_fy_offline.py) - -### Online Inference - -Use the following command to start the vLLM server. - -```bash -vllm serve jinaai/jina-embeddings-v3 --trust-remote-code -``` - -You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter. 
- -```bash -curl http://127.0.0.1:8000/v1/embeddings \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -d '{ - "input": "Follow the white rabbit.", - "model": "jinaai/jina-embeddings-v3", - "encoding_format": "float", - "dimensions": 32 - }' -``` - -Expected output: - -```json -{"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}} -``` - -An OpenAI client example can be found here: [examples/pooling/embed/openai_embedding_matryoshka_fy_client.py](../../examples/pooling/embed/openai_embedding_matryoshka_fy_client.py) - -## Specific models - -### ColBERT Late Interaction Models - -[ColBERT](https://arxiv.org/abs/2004.12832) (Contextualized Late Interaction over BERT) is a retrieval model that uses per-token embeddings and MaxSim scoring for document ranking. Unlike single-vector embedding models, ColBERT retains token-level representations and computes relevance scores through late interaction, providing better accuracy while being more efficient than cross-encoders. 
- -vLLM supports ColBERT models with multiple encoder backbones: - -| Architecture | Backbone | Example HF Models | -| - | - | - | -| `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` | -| `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` | -| `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` | - -**BERT-based ColBERT** models work out of the box: - -```shell -vllm serve answerdotai/answerai-colbert-small-v1 -``` - -For **non-BERT backbones**, use `--hf-overrides` to set the correct architecture: - -```shell -# ModernBERT backbone -vllm serve lightonai/GTE-ModernColBERT-v1 \ - --hf-overrides '{"architectures": ["ColBERTModernBertModel"]}' - -# Jina XLM-RoBERTa backbone -vllm serve jinaai/jina-colbert-v2 \ - --hf-overrides '{"architectures": ["ColBERTJinaRobertaModel"]}' \ - --trust-remote-code -``` - -Then you can use the rerank endpoint: - -```shell -curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{ - "model": "answerdotai/answerai-colbert-small-v1", - "query": "What is machine learning?", - "documents": [ - "Machine learning is a subset of artificial intelligence.", - "Python is a programming language.", - "Deep learning uses neural networks." 
- ] -}' -``` - -Or the score endpoint: - -```shell -curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{ - "model": "answerdotai/answerai-colbert-small-v1", - "text_1": "What is machine learning?", - "text_2": ["Machine learning is a subset of AI.", "The weather is sunny."] -}' -``` - -You can also get the raw token embeddings using the pooling endpoint with `token_embed` task: - -```shell -curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{ - "model": "answerdotai/answerai-colbert-small-v1", - "input": "What is machine learning?", - "task": "token_embed" -}' -``` - -An example can be found here: [examples/pooling/score/colbert_rerank_online.py](../../examples/pooling/score/colbert_rerank_online.py) - -### ColQwen3 Multi-Modal Late Interaction Models - -ColQwen3 is based on [ColPali](https://arxiv.org/abs/2407.01449), which extends ColBERT's late interaction approach to **multi-modal** inputs. While ColBERT operates on text-only token embeddings, ColPali/ColQwen3 can embed both **text and images** (e.g. PDF pages, screenshots, diagrams) into per-token L2-normalized vectors and compute relevance via MaxSim scoring. ColQwen3 specifically uses Qwen3-VL as its vision-language backbone. 
- -| Architecture | Backbone | Example HF Models | -| - | - | - | -| `ColQwen3` | Qwen3-VL | `TomoroAI/tomoro-colqwen3-embed-4b`, `TomoroAI/tomoro-colqwen3-embed-8b` | -| `OpsColQwen3Model` | Qwen3-VL | `OpenSearch-AI/Ops-Colqwen3-4B`, `OpenSearch-AI/Ops-Colqwen3-8B` | -| `Qwen3VLNemotronEmbedModel` | Qwen3-VL | `nvidia/nemotron-colembed-vl-4b-v2`, `nvidia/nemotron-colembed-vl-8b-v2` | - -Start the server: - -```shell -vllm serve TomoroAI/tomoro-colqwen3-embed-4b --max-model-len 4096 -``` - -#### Text-only scoring and reranking - -Use the `/rerank` endpoint: - -```shell -curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{ - "model": "TomoroAI/tomoro-colqwen3-embed-4b", - "query": "What is machine learning?", - "documents": [ - "Machine learning is a subset of artificial intelligence.", - "Python is a programming language.", - "Deep learning uses neural networks." - ] -}' -``` - -Or the `/score` endpoint: - -```shell -curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{ - "model": "TomoroAI/tomoro-colqwen3-embed-4b", - "text_1": "What is the capital of France?", - "text_2": ["The capital of France is Paris.", "Python is a programming language."] -}' -``` - -#### Multi-modal scoring and reranking (text query × image documents) - -The `/score` and `/rerank` endpoints also accept multi-modal inputs directly. 
-Pass image documents using the `data_1`/`data_2` (for `/score`) or `documents` (for `/rerank`) fields -with a `content` list containing `image_url` and `text` parts — the same format used by the -OpenAI chat completion API: - -Score a text query against image documents: - -```shell -curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{ - "model": "TomoroAI/tomoro-colqwen3-embed-4b", - "data_1": "Retrieve the city of Beijing", - "data_2": [ - { - "content": [ - {"type": "image_url", "image_url": {"url": "data:image/png;base64,"}}, - {"type": "text", "text": "Describe the image."} - ] - } - ] -}' -``` - -Rerank image documents by a text query: - -```shell -curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{ - "model": "TomoroAI/tomoro-colqwen3-embed-4b", - "query": "Retrieve the city of Beijing", - "documents": [ - { - "content": [ - {"type": "image_url", "image_url": {"url": "data:image/png;base64,"}}, - {"type": "text", "text": "Describe the image."} - ] - }, - { - "content": [ - {"type": "image_url", "image_url": {"url": "data:image/png;base64,"}}, - {"type": "text", "text": "Describe the image."} - ] - } - ], - "top_n": 2 -}' -``` - -#### Raw token embeddings - -You can also get the raw token embeddings using the `/pooling` endpoint with `token_embed` task: - -```shell -curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{ - "model": "TomoroAI/tomoro-colqwen3-embed-4b", - "input": "What is machine learning?", - "task": "token_embed" -}' -``` - -For **image inputs** via the pooling endpoint, use the chat-style `messages` field: - -```shell -curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{ - "model": "TomoroAI/tomoro-colqwen3-embed-4b", - "messages": [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": "data:image/png;base64,"}}, - {"type": "text", "text": "Describe the image."} - ] - } - ] -}' -``` - -#### Examples - -- 
Multi-vector retrieval: [examples/pooling/token_embed/colqwen3_token_embed_online.py](../../examples/pooling/token_embed/colqwen3_token_embed_online.py) -- Reranking (text + multi-modal): [examples/pooling/score/colqwen3_rerank_online.py](../../examples/pooling/score/colqwen3_rerank_online.py) - -### Llama Nemotron Multimodal - -#### Embedding Model - -Llama Nemotron VL Embedding models combine the bidirectional Llama embedding backbone -(from `nvidia/llama-nemotron-embed-1b-v2`) with SigLIP as the vision encoder to produce -single-vector embeddings from text and/or images. - -| Architecture | Backbone | Example HF Models | -| - | - | - | -| `LlamaNemotronVLModel` | Bidirectional Llama + SigLIP | `nvidia/llama-nemotron-embed-vl-1b-v2` | - -Start the server: - -```shell -vllm serve nvidia/llama-nemotron-embed-vl-1b-v2 \ - --trust-remote-code \ - --chat-template examples/pooling/embed/template/nemotron_embed_vl.jinja -``` - -!!! note - The chat template bundled with this model's tokenizer is not suitable for - the embeddings API. Use the provided override template above when serving - with the `messages`-based (chat-style) embeddings endpoint. - - The override template uses the message `role` to automatically prepend the - appropriate prefix: set `role` to `"query"` for queries (prepends `query: `) - or `"document"` for passages (prepends `passage: `). Any other role omits - the prefix. 
- -Embed text queries: - -```shell -curl -s http://localhost:8000/v1/embeddings -H "Content-Type: application/json" -d '{ - "model": "nvidia/llama-nemotron-embed-vl-1b-v2", - "messages": [ - { - "role": "query", - "content": [ - {"type": "text", "text": "What is machine learning?"} - ] - } - ] -}' -``` - -Embed images via the chat-style `messages` field: - -```shell -curl -s http://localhost:8000/v1/embeddings -H "Content-Type: application/json" -d '{ - "model": "nvidia/llama-nemotron-embed-vl-1b-v2", - "messages": [ - { - "role": "document", - "content": [ - {"type": "image_url", "image_url": {"url": "data:image/png;base64,"}}, - {"type": "text", "text": "Describe the image."} - ] - } - ] -}' -``` - -#### Reranker Model - -Llama Nemotron VL reranker models combine the same bidirectional Llama + SigLIP -backbone with a sequence-classification head for cross-encoder scoring and reranking. - -| Architecture | Backbone | Example HF Models | -| - | - | - | -| `LlamaNemotronVLForSequenceClassification` | Bidirectional Llama + SigLIP | `nvidia/llama-nemotron-rerank-vl-1b-v2` | - -Start the server: - -```shell -vllm serve nvidia/llama-nemotron-rerank-vl-1b-v2 \ - --runner pooling \ - --trust-remote-code \ - --chat-template examples/pooling/score/template/nemotron-vl-rerank.jinja -``` - -!!! note - The chat template bundled with this checkpoint's tokenizer is not suitable - for the Score/Rerank APIs. Use the provided override template when serving: - `examples/pooling/score/template/nemotron-vl-rerank.jinja`. 
- -Score a text query against an image document: - -```shell -curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{ - "model": "nvidia/llama-nemotron-rerank-vl-1b-v2", - "data_1": "Find diagrams about autonomous robots", - "data_2": [ - { - "content": [ - {"type": "image_url", "image_url": {"url": "data:image/png;base64,"}}, - {"type": "text", "text": "Robotics workflow diagram."} - ] - } - ] -}' -``` - -Rerank image documents by a text query: - -```shell -curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{ - "model": "nvidia/llama-nemotron-rerank-vl-1b-v2", - "query": "Find diagrams about autonomous robots", - "documents": [ - { - "content": [ - {"type": "image_url", "image_url": {"url": "data:image/png;base64,"}}, - {"type": "text", "text": "Robotics workflow diagram."} - ] - }, - { - "content": [ - {"type": "image_url", "image_url": {"url": "data:image/png;base64,"}}, - {"type": "text", "text": "General skyline photo."} - ] - } - ], - "top_n": 2 -}' -``` - -### ColQwen3.5 Multi-Modal Late Interaction Models - -ColQwen3.5 is based on [ColPali](https://arxiv.org/abs/2407.01449), extending ColBERT's late interaction approach to **multi-modal** inputs. It uses the Qwen3.5 hybrid backbone (linear + full attention) and produces per-token L2-normalized vectors for MaxSim scoring. - -| Architecture | Backbone | Example HF Models | -| - | - | - | -| `ColQwen3_5` | Qwen3.5 | `athrael-soju/colqwen3.5-4.5B` | - -Start the server: - -```shell -vllm serve athrael-soju/colqwen3.5-4.5B --max-model-len 4096 -``` - -Then you can use the rerank endpoint: - -```shell -curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{ - "model": "athrael-soju/colqwen3.5-4.5B", - "query": "What is machine learning?", - "documents": [ - "Machine learning is a subset of artificial intelligence.", - "Python is a programming language.", - "Deep learning uses neural networks." 
- ] -}' -``` - -Or the score endpoint: - -```shell -curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{ - "model": "athrael-soju/colqwen3.5-4.5B", - "text_1": "What is the capital of France?", - "text_2": ["The capital of France is Paris.", "Python is a programming language."] -}' -``` - -An example can be found here: [examples/pooling/score/colqwen3_5_rerank_online.py](../../examples/pooling/score/colqwen3_5_rerank_online.py) - -### BAAI/bge-m3 - -The `BAAI/bge-m3` model comes with extra weights for sparse and colbert embeddings but unfortunately in its `config.json` -the architecture is declared as `XLMRobertaModel`, which makes `vLLM` load it as a vanilla ROBERTA model without the -extra weights. To load the full model weights, override its architecture like this: - -```shell -vllm serve BAAI/bge-m3 --hf-overrides '{"architectures": ["BgeM3EmbeddingModel"]}' -``` - -Then you obtain the sparse embeddings like this: - -```shell -curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{ - "model": "BAAI/bge-m3", - "task": "token_classify", - "input": ["What is BGE M3?", "Definition of BM25"] -}' -``` - -Due to limitations in the output schema, the output consists of a list of -token scores for each token for each input. This means that you'll have to call -`/tokenize` as well to be able to pair tokens with scores. -Refer to the tests in `tests/models/language/pooling/test_bge_m3.py` to see how -to do that. - -You can obtain the colbert embeddings like this: - -```shell -curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{ - "model": "BAAI/bge-m3", - "task": "token_embed", - "input": ["What is BGE M3?", "Definition of BM25"] -}' -``` - -## Deprecated Features - -### Encode task - -We have split the `encode` task into two more specific token-wise tasks: `token_embed` and `token_classify`: - -- `token_embed` is the same as `embed`, using normalization as the activation. 
-- `token_classify` is the same as `classify`, by default using softmax as the activation.
-
-Pooling models now default support all pooling, you can use it without any settings.
-
-- Extracting hidden states prefers using `token_embed` task.
-- Reward models prefers using `token_classify` task.
diff --git a/docs/models/pooling_models/README.md b/docs/models/pooling_models/README.md
new file mode 100644
index 000000000..b34cc1efe
--- /dev/null
+++ b/docs/models/pooling_models/README.md
@@ -0,0 +1,253 @@
+# Pooling Models
+
+!!! note
+    We currently support pooling models primarily for convenience. This is not guaranteed to provide any performance improvements over using Hugging Face Transformers or Sentence Transformers directly.
+
+    We plan to optimize pooling models in vLLM. Please let us know if you have any suggestions!
+
+## What are pooling models?
+
+Natural Language Processing (NLP) can be primarily divided into the following two types of tasks:
+
+- Natural Language Understanding (NLU)
+- Natural Language Generation (NLG)
+
+The generative models supported by vLLM cover a variety of task types, such as the large language models (LLMs) we are familiar with, multimodal models (VLM) that handle multimodal inputs like images, videos, and audio, speech-to-text transcription models, and real-time models that support streaming input. Their common feature is the ability to generate text. Taking it a step further, vLLM-Omni supports the generation of multimodal content, including images, videos, and audio.
+
+As the capabilities of generative models continue to improve, the boundaries of these models are also constantly expanding. However, certain application scenarios still require specialized small language models to efficiently complete specific tasks. These models typically have the following characteristics:
+
+- They do not require content generation.
+- They only need to perform very limited functions, without requiring strong generalization, creativity, or high intelligence. +- They demand extremely low latency and may operate on cost-constrained hardware. +- Text-only models typically have fewer than 1 billion parameters, while multimodal models generally have fewer than 10 billion parameters. + +Although these models are relatively small in scale, they are still based on the Transformer architecture, similar or even identical to the most advanced large language models today. Many recently released pooling models are also fine-tuned from large language models, allowing them to benefit from the continuous improvements in large models. This architecture similarity enables them to reuse much of vLLM’s infrastructure. If compatible, we would be happy to help them leverage the latest features of vLLM as well. + +### Sequence-wise Task and Token-wise Task + +The key distinction between sequence-wise task and token-wise task lies in their output granularity: sequence-wise task produces a single result for an entire input sequence, whereas token-wise task yields a result for each individual token within the sequence. + +Of course, we also have "plugin" tasks that allow users to customize input and output processors. For more information, please refer to [IO Processor Plugins](../../design/io_processor_plugins.md). + +### Pooling Tasks + +| Pooling Tasks | Granularity | Outputs | +|--------------------|---------------|-------------------------------------------------| +| `classify` | Sequence-wise | probability vector of classes for each sequence | +| `score` (see note) | Sequence-wise | reranker score for each sequence | +| `embed` | Sequence-wise | vector representations for each sequence | +| `token_classify` | Token-wise | probability vector of classes for each token | +| `token_embed` | Token-wise | vector representations for each token | + +!!! 
note + Within classification tasks, there is a specialized subcategory: Cross-encoder (aka reranker) models. These models are a subset of classification models that accept two prompts as input and output num_labels equal to 1. + +### Score Types + +| Pooling Tasks | Granularity | Outputs | Score Types | scoring function | +|--------------------|---------------|-------------------------------------------------|--------------------|--------------------------| +| `classify` | Sequence-wise | probability vector of classes for each sequence | nan | nan | +| `score` (see note) | Sequence-wise | reranker score for each sequence | `cross-encoder` | linear classifier | +| `embed` | Sequence-wise | vector representations for each sequence | `bi-encoder` | cosine similarity | +| `token_classify` | Token-wise | probability vector of classes for each token | nan | nan | +| `token_embed` | Token-wise | vector representations for each token | `late-interaction` | late interaction(MaxSim) | + +The score models is designed to compute similarity scores between two input prompts. It supports three model types (aka `score_type`): `cross-encoder`, `late-interaction`, and `bi-encoder`. + +### Pooling Usages + +| Pooling Usages | Description | +|-----------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------| +| Classification Usages | Predicting which predefined category, class, or label best corresponds to a given input. | +| Embedding Usages | Converts unstructured data (text, images, audio, etc.) into structured numerical vectors (embeddings). | +| Token Classification Usages | Token-wise classification | +| Token Embedding Usages | Token-wise embedding | +| Scoring Usages | Computes similarity scores between two inputs. It supports three model types (aka `score_type`): `cross-encoder`, `late-interaction`, and `bi-encoder`. 
| +| Reward Usages | Evaluates the quality of outputs generated by a language model, acting as a proxy for human preferences. | + +We also have some special models that support multiple pooling tasks, or have specific usage scenarios, or support special inputs and outputs. + +For more detailed information, please refer to the link below. + +- [Classification Usages](classify.md) +- [Embedding Usages](embed.md) +- [Reward Usages](reward.md) +- [Token Classification Usages](token_classify.md) +- [Token Embedding Usages](token_embed.md) +- [Scoring Usages](scoring.md) +- [Specific Model Examples](specific_models.md) + +## Offline Inference + +Each pooling model in vLLM supports one or more of these tasks according to +[Pooler.get_supported_tasks][vllm.model_executor.layers.pooler.Pooler.get_supported_tasks], +enabling the corresponding APIs. + +### Offline APIs corresponding to pooling tasks + +| Task | APIs | +|------------------|----------------------------------------------------------------------------| +| `embed` | `LLM.embed(...)`,`LLM.encode(..., pooling_task="embed")`, `LLM.score(...)` | +| `classify` | `LLM.classify(...)`, `LLM.encode(..., pooling_task="classify")` | +| `score` | `LLM.score(...)` | +| `token_classify` | `LLM.reward(...)`, `LLM.encode(..., pooling_task="token_classify")` | +| `token_embed` | `LLM.encode(..., pooling_task="token_embed")`, `LLM.score(...)` | +| `plugin` | `LLM.encode(..., pooling_task="plugin")` | + +### `LLM.classify` + +The [classify][vllm.LLM.classify] method outputs a probability vector for each prompt. +It is primarily designed for [classification models](classify.md). +For more information about `LLM.embed`, see [this page](classify.md#offline-inference). + +### `LLM.embed` + +The [embed][vllm.LLM.embed] method outputs an embedding vector for each prompt. +It is primarily designed for [embedding models](embed.md). +For more information about `LLM.embed`, see [this page](embed.md#offline-inference). 
+
+### `LLM.score`
+
+The [score][vllm.LLM.score] method outputs similarity scores between sentence pairs.
+It is primarily designed for [score models](scoring.md).
+
+### `LLM.encode`
+
+The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM.
+
+Please use one of the more specific methods or set the task directly when using `LLM.encode`, refer to the [table above](#offline-apis-corresponding-to-pooling-tasks).
+
+### Examples
+
+```python
+from vllm import LLM
+
+llm = LLM(model="intfloat/e5-small", runner="pooling")
+(output,) = llm.encode("Hello, my name is", pooling_task="embed")
+
+data = output.outputs.data
+print(f"Data: {data!r}")
+```
+
+## Online Serving
+
+Our online server provides endpoints that correspond to the offline APIs:
+
+- Corresponding to `LLM.embed`:
+    - [Cohere Embed API](embed.md#cohere-embed-api) (`/v2/embed`)
+    - [OpenAI-compatible Embeddings API](embed.md#openai-compatible-embeddings-api) (`/v1/embeddings`)
+- Corresponding to `LLM.classify`:
+    - [Classification API](classify.md#online-serving) (`/classify`)
+- Corresponding to `LLM.score`:
+    - [Score API](scoring.md#score-api) (`/score`)
+    - [Rerank API](scoring.md#rerank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`)
+- Pooling API (`/pooling`) is similar to `LLM.encode`, being applicable to all types of pooling models.
+
+The following introduces the Pooling API. For other APIs, please refer to the links above.
+
+### Pooling API
+
+Our Pooling API (`/pooling`) is similar to `LLM.encode`, being applicable to all types of pooling models.
+
+The input format is the same as [Embeddings API](embed.md#openai-compatible-embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats.
+
+Please use one of the more specific APIs or set the task directly when using the Pooling API, refer to the [table above](#offline-apis-corresponding-to-pooling-tasks).
+
+Code example: [examples/pooling/pooling/pooling_online.py](../../../examples/pooling/pooling/pooling_online.py)
+
+### Examples
+
+```python
+# start a supported embeddings model server with `vllm serve`, e.g.
+# vllm serve intfloat/e5-small
+import requests
+
+host = "localhost"
+port = "8000"
+model_name = "intfloat/e5-small"
+
+api_url = f"http://{host}:{port}/pooling"
+
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+prompt = {"model": model_name, "input": prompts, "task": "embed"}
+
+response = requests.post(api_url, json=prompt)
+
+for output in response.json()["data"]:
+    data = output["data"]
+    print(f"Data: {data!r} (size={len(data)})")
+```
+
+## Configuration
+
+In vLLM, pooling models implement the [VllmModelForPooling][vllm.model_executor.models.VllmModelForPooling] interface.
+These models use a [Pooler][vllm.model_executor.layers.pooler.Pooler] to extract the final hidden states of the input
+before returning them.
+
+### Model Runner
+
+Run a model in pooling mode via the option `--runner pooling`.
+
+!!! tip
+    There is no need to set this option in the vast majority of cases as vLLM can automatically
+    detect the appropriate model runner via `--runner auto`.
+
+### Model Conversion
+
+vLLM can adapt models for various pooling tasks via the option `--convert <type>`.
+
+If `--runner pooling` has been set (manually or automatically) but the model does not implement the
+[VllmModelForPooling][vllm.model_executor.models.VllmModelForPooling] interface,
+vLLM will attempt to automatically convert the model according to the architecture names
+shown in the table below.
+ +| Architecture | `--convert` | Supported pooling tasks | +| ----------------------------------------------- | ----------- | ------------------------------------- | +| `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed` | `token_embed`, `embed` | +| `*ForRewardModeling`, `*RewardModel` | `embed` | `token_embed`, `embed` | +| `*For*Classification`, `*ClassificationModel` | `classify` | `token_classify`, `classify`, `score` | + +!!! tip + You can explicitly set `--convert ` to specify how to convert the model. + +### Pooler Configuration + +#### Predefined models + +If the [Pooler][vllm.model_executor.layers.pooler.Pooler] defined by the model accepts `pooler_config`, +you can override some of its attributes via the `--pooler-config` option. + +#### Converted models + +If the model has been converted via `--convert` (see above), +the pooler assigned to each task has the following attributes by default: + +| Task | Pooling Type | Normalization | Softmax | +| ---------- | ------------ | ------------- | ------- | +| `embed` | `LAST` | ✅︎ | ❌ | +| `classify` | `LAST` | ❌ | ✅︎ | + +When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models, +its Sentence Transformers configuration file (`modules.json`) takes priority over the model's defaults. + +You can further customize this via the `--pooler-config` option, +which takes priority over both the model's and Sentence Transformers' defaults. + +## Removed Features + +### Encode task + +We have split the `encode` task into two more specific token-wise tasks: `token_embed` and `token_classify`: + +- `token_embed` is the same as `embed`, using normalization as the activation. +- `token_classify` is the same as `classify`, by default using softmax as the activation. + +Pooling models now default support all pooling, you can use it without any settings. + +- Extracting hidden states prefers using `token_embed` task. 
+- Named Entity Recognition (NER) and reward models prefers using `token_classify` task. diff --git a/docs/models/pooling_models/classify.md b/docs/models/pooling_models/classify.md new file mode 100644 index 000000000..10d7892b5 --- /dev/null +++ b/docs/models/pooling_models/classify.md @@ -0,0 +1,276 @@ +# Classification Usages + +Classification involves predicting which predefined category, class, or label best corresponds to a given input. + +## Summary + +- Model Usage: (sequence) classification +- Pooling Task: `classify` +- Offline APIs: + - `LLM.classify(...)` + - `LLM.encode(..., pooling_task="classify")` +- Online APIs: + - [Classification API](classify.md#online-serving) (`/classify`) + - Pooling API (`/pooling`) + +The key distinction between (sequence) classification and token classification lies in their output granularity: (sequence) classification produces a single result for an entire input sequence, whereas token classification yields a result for each individual token within the sequence. + +Many classification models support both (sequence) classification and token classification. For further details on token classification, please refer to [this page](token_classify.md). + +## Typical Use Cases + +### Classification + +The most fundamental application of classification models is to categorize input data into predefined classes. + +## Supported Models + +### Text-only Models + +| Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) | +| ------------ | ------ | ----------------- | ------------------------------ | ------------------------------------------ | +| `ErnieForSequenceClassification` | BERT-like Chinese ERNIE | `Forrest20231206/ernie-3.0-base-zh-cls` | | | +| `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | | +| `Qwen2ForSequenceClassification`C | Qwen2-based | `jason9693/Qwen2.5-1.5B-apeach` | | | +| `*Model`C, `*ForCausalLM`C, etc. 
| Generative models | N/A | \* | \* | + +### Multimodal Models + +!!! note + For more information about multimodal models inputs, see [this page](../supported_models.md#list-of-multimodal-language-models). + +| Architecture | Models | Inputs | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) | +| ------------ | ------ | ------ | ----------------- | ------------------------------ | ------------------------------------------ | +| `Qwen2_5_VLForSequenceClassification`C | Qwen2_5_VL-based | T + IE+ + VE+ | `muziyongshixin/Qwen2.5-VL-7B-for-VideoCls` | | | +| `*ForConditionalGeneration`C, `*ForCausalLM`C, etc. | Generative models | \* | N/A | \* | \* | + +C Automatically converted into a classification model via `--convert classify`. ([details](./README.md#model-conversion)) +\* Feature support is the same as that of the original model. + +If your model is not in the above list, we will try to automatically convert the model using +[as_seq_cls_model][vllm.model_executor.models.adapters.as_seq_cls_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. + +### Cross-encoder Models + +Cross-encoder (aka reranker) models are a subset of classification models that accept two prompts as input and output num_labels equal to 1. Most classification models can also be used as [cross-encoder models](scoring.md#cross-encoder-models). For more information on cross-encoder models, please refer to [this page](scoring.md). + +--8<-- "docs/models/pooling_models/scoring.md:supported-score-models" + +### Reward Models + +Using (sequence) classification models as reward models. For more information, see [Reward Models](reward.md). + +--8<-- "docs/models/pooling_models/reward.md:supported-sequence-reward-models" + +## Offline Inference + +### Pooling Parameters + +The following [pooling parameters][vllm.PoolingParams] are supported. 
+ +```python +--8<-- "vllm/pooling_params.py:common-pooling-params" +--8<-- "vllm/pooling_params.py:classify-pooling-params" +``` + +### `LLM.classify` + +The [classify][vllm.LLM.classify] method outputs a probability vector for each prompt. + +```python +from vllm import LLM + +llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", runner="pooling") +(output,) = llm.classify("Hello, my name is") + +probs = output.outputs.probs +print(f"Class Probabilities: {probs!r} (size={len(probs)})") +``` + +A code example can be found here: [examples/offline_inference/basic/classify.py](../../../examples/basic/offline_inference/classify.py) + +### `LLM.encode` + +The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM. + +Set `pooling_task="classify"` when using `LLM.encode` for classification Models: + +```python +from vllm import LLM + +llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", runner="pooling") +(output,) = llm.encode("Hello, my name is", pooling_task="classify") + +data = output.outputs.data +print(f"Data: {data!r}") +``` + +## Online Serving + +### Classification API + +Online `/classify` API is similar to `LLM.classify`. + +#### Completion Parameters + +The following Classification API parameters are supported: + +??? code + + ```python + --8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params" + --8<-- "vllm/entrypoints/pooling/base/protocol.py:completion-params" + --8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-params" + ``` + +The following extra parameters are supported: + +??? code + + ```python + --8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params" + --8<-- "vllm/entrypoints/pooling/base/protocol.py:completion-extra-params" + --8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params" + ``` + +#### Chat Parameters + +For chat-like input (i.e. if `messages` is passed), the following parameters are supported: + +??? 
code + + ```python + --8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params" + --8<-- "vllm/entrypoints/pooling/base/protocol.py:chat-params" + --8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-params" + ``` + +these extra parameters are supported instead: + +??? code + + ```python + --8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params" + --8<-- "vllm/entrypoints/pooling/base/protocol.py:chat-extra-params" + --8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params" + ``` + +#### Example Requests + +Code example: [examples/pooling/classify/classification_online.py](../../../examples/pooling/classify/classification_online.py) + +You can classify multiple texts by passing an array of strings: + +```bash +curl -v "http://127.0.0.1:8000/classify" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "jason9693/Qwen2.5-1.5B-apeach", + "input": [ + "Loved the new café—coffee was great.", + "This update broke everything. Frustrating." + ] + }' +``` + +??? console "Response" + + ```json + { + "id": "classify-7c87cac407b749a6935d8c7ce2a8fba2", + "object": "list", + "created": 1745383065, + "model": "jason9693/Qwen2.5-1.5B-apeach", + "data": [ + { + "index": 0, + "label": "Default", + "probs": [ + 0.565970778465271, + 0.4340292513370514 + ], + "num_classes": 2 + }, + { + "index": 1, + "label": "Spoiled", + "probs": [ + 0.26448777318000793, + 0.7355121970176697 + ], + "num_classes": 2 + } + ], + "usage": { + "prompt_tokens": 20, + "total_tokens": 20, + "completion_tokens": 0, + "prompt_tokens_details": null + } + } + ``` + +You can also pass a string directly to the `input` field: + +```bash +curl -v "http://127.0.0.1:8000/classify" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "jason9693/Qwen2.5-1.5B-apeach", + "input": "Loved the new café—coffee was great." + }' +``` + +??? 
console "Response" + + ```json + { + "id": "classify-9bf17f2847b046c7b2d5495f4b4f9682", + "object": "list", + "created": 1745383213, + "model": "jason9693/Qwen2.5-1.5B-apeach", + "data": [ + { + "index": 0, + "label": "Default", + "probs": [ + 0.565970778465271, + 0.4340292513370514 + ], + "num_classes": 2 + } + ], + "usage": { + "prompt_tokens": 10, + "total_tokens": 10, + "completion_tokens": 0, + "prompt_tokens_details": null + } + } + ``` + +## More examples + +More examples can be found here: [examples/pooling/classify](../../../examples/pooling/classify) + +## Supported Features + +### Enable/disable activation + +You can enable or disable activation via `use_activation`. + +### Problem type (e.g. `multi_label_classification`) + +You can modify the `problem_type` via problem_type in the Hugging Face config. The supported problem types are: `single_label_classification`, `multi_label_classification`, and `regression`. + +Implement alignment with transformers [ForSequenceClassificationLoss](https://github.com/huggingface/transformers/blob/57bb6db6ee4cfaccc45b8d474dfad5a17811ca60/src/transformers/loss/loss_utils.py#L92). + +### Logit bias + +You can modify the `logit_bias` (aka `sigmoid_normalize`) through the logit_bias parameter in `vllm.config.PoolerConfig`. + +## Removed Features + +### Remove softmax from PoolingParams + +We have already removed `softmax` and `activation` from PoolingParams. Instead, use `use_activation`, since we allow `classify` and `token_classify` to use any activation function. diff --git a/docs/models/pooling_models/embed.md b/docs/models/pooling_models/embed.md new file mode 100644 index 000000000..d1f70dba7 --- /dev/null +++ b/docs/models/pooling_models/embed.md @@ -0,0 +1,546 @@ +# Embedding Usages + +Embedding models are a class of machine learning models designed to transform unstructured data—such as text, images, or audio—into a structured numerical representation known as an embedding. 
+ +## Summary + +- Model Usage: (sequence) embedding +- Pooling Task: `embed` +- Offline APIs: + - `LLM.embed(...)` + - `LLM.encode(..., pooling_task="embed")` + - `LLM.score(...)` +- Online APIs: + - [Cohere Embed API](embed.md#cohere-embed-api) (`/v2/embed`) + - [Openai-compatible Embeddings API](embed.md#openai-compatible-embeddings-api) (`/v1/embeddings`) + - Pooling API (`/pooling`) + +The primary distinction between (sequence) embedding and token embedding lies in their output granularity: (sequence) embedding produces a single embedding vector for an entire input sequence, whereas token embedding generates an embedding for each individual token within the sequence. + +Many embedding models support both (sequence) embedding and token embedding. For further details on token embedding, please refer to [this page](token_embed.md). + +## Typical Use Cases + +### Embedding + +The most basic use case of embedding models is to embed the inputs, e.g. for RAG. + +### Pairwise Similarity + +You can compute pairwise similarity scores to build a similarity matrix using the [Score API](scoring.md). + +## Supported Models + +--8<-- [start:supported-embed-models] + +### Text-only Models + +| Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) | +| ------------ | ------ | ----------------- | ------------------------------ | ------------------------------------------ | +| `BertModel` | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | +| `BertSpladeSparseEmbeddingModel` | SPLADE | `naver/splade-v3` | | | +| `ErnieModel` | BERT-like Chinese ERNIE | `shibing624/text2vec-base-chinese-sentence` | | | +| `Gemma2Model`C | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ | +| `Gemma3TextModel`C | Gemma 3-based | `google/embeddinggemma-300m`, etc. | ✅︎ | ✅︎ | +| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. 
| ✅︎ | ✅︎ | +| `GteModel` | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | | +| `GteNewModel` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | | +| `LlamaBidirectionalModel`C | Llama-based with bidirectional attention | `nvidia/llama-nemotron-embed-1b-v2`, etc. | ✅︎ | ✅︎ | +| `LlamaModel`C, `LlamaForCausalLM`C, `MistralModel`C, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ | +| `ModernBertModel` | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | | +| `NomicBertModel` | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | | +| `Qwen2Model`C, `Qwen2ForCausalLM`C | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ | +| `Qwen3Model`C, `Qwen3ForCausalLM`C | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ | +| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | | +| `VoyageQwen3BidirectionalEmbedModel`C | Voyage Qwen3-based with bidirectional attention | `voyageai/voyage-4-nano`, etc. | ✅︎ | ✅︎ | +| `XLMRobertaModel` | XLMRobertaModel-based | `BAAI/bge-m3` (see note), `intfloat/multilingual-e5-base`, `jinaai/jina-embeddings-v3` (see note), etc. | | | +| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | + +!!! note + The second-generation GTE model (mGTE-TRM) is named `NewModel`. The name `NewModel` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewModel"]}'` to specify the use of the `GteNewModel` architecture. + +!!! note + `ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. + You need to manually set mean pooling by passing `--pooler-config '{"pooling_type": "MEAN"}'`. + +!!! 
note + For `Alibaba-NLP/gte-Qwen2-*`, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded. + See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882). + +!!! note + The `BAAI/bge-m3` model comes with extra weights for sparse and colbert embeddings, See [this page](specific_models.md#baaibge-m3) for more information. + +!!! note + `jinaai/jina-embeddings-v3` supports multiple tasks through LoRA, while vllm temporarily only supports text-matching tasks by merging LoRA weights. + +### Multimodal Models + +!!! note + For more information about multimodal models inputs, see [this page](../supported_models.md#list-of-multimodal-language-models). + +| Architecture | Models | Inputs | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) | +| ------------ | ------ | ------ | ----------------- | ------------------------------ | ------------------------------------------ | +| `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | | +| `LlamaNemotronVLModel` | Llama Nemotron Embedding + SigLIP | T + I | `nvidia/llama-nemotron-embed-vl-1b-v2` | | | +| `LlavaNextForConditionalGeneration`C | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ | +| `Phi3VForCausalLM`C | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ | +| `Qwen3VLForConditionalGeneration`C | Qwen3-VL | T + I + V | `Qwen/Qwen3-VL-Embedding-2B`, etc. | ✅︎ | ✅︎ | +| `SiglipModel` | SigLIP, SigLIP2 | T / I | `google/siglip-base-patch16-224`, `google/siglip2-base-patch16-224` | | | +| `*ForConditionalGeneration`C, `*ForCausalLM`C, etc. | Generative models | \* | N/A | \* | \* | + +C Automatically converted into an embedding model via `--convert embed`. ([details](./README.md#model-conversion)) +\* Feature support is the same as that of the original model. 
+ +If your model is not in the above list, we will try to automatically convert the model using +[as_embedding_model][vllm.model_executor.models.adapters.as_embedding_model]. By default, the embeddings +of the whole prompt are extracted from the normalized hidden state corresponding to the last token. + +!!! note + Although vLLM supports automatically converting models of any architecture into embedding models via --convert embed, to get the best results, you should use pooling models that are specifically trained as such. + +--8<-- [end:supported-embed-models] + +## Offline Inference + +### Pooling Parameters + +The following [pooling parameters][vllm.PoolingParams] are supported. + +```python +--8<-- "vllm/pooling_params.py:common-pooling-params" +--8<-- "vllm/pooling_params.py:embed-pooling-params" +``` + +### `LLM.embed` + +The [embed][vllm.LLM.embed] method outputs an embedding vector for each prompt. + +```python +from vllm import LLM + +llm = LLM(model="intfloat/e5-small", runner="pooling") +(output,) = llm.embed("Hello, my name is") + +embeds = output.outputs.embedding +print(f"Embeddings: {embeds!r} (size={len(embeds)})") +``` + +A code example can be found here: [examples/offline_inference/basic/embed.py](../../../examples/basic/offline_inference/embed.py) + +### `LLM.encode` + +The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM. + +Set `pooling_task="embed"` when using `LLM.encode` for embedding Models: + +```python +from vllm import LLM + +llm = LLM(model="intfloat/e5-small", runner="pooling") +(output,) = llm.encode("Hello, my name is", pooling_task="embed") + +data = output.outputs.data +print(f"Data: {data!r}") +``` + +### `LLM.score` + +The [score][vllm.LLM.score] method outputs similarity scores between sentence pairs. + +All models that support embedding task also support using the score API to compute similarity scores by calculating the cosine similarity of two input prompt's embeddings. 
+
+```python
+from vllm import LLM
+
+llm = LLM(model="intfloat/e5-small", runner="pooling")
+(output,) = llm.score(
+    "What is the capital of France?",
+    "The capital of Brazil is Brasilia.",
+)
+
+score = output.outputs.score
+print(f"Score: {score}")
+```
+
+## Online Serving
+
+### OpenAI-Compatible Embeddings API
+
+Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings);
+you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
+
+Code example: [examples/pooling/embed/openai_embedding_client.py](../../../examples/pooling/embed/openai_embedding_client.py)
+
+#### Completion Parameters
+
+The following Embeddings API parameters are supported:
+
+??? code
+
+    ```python
+    --8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
+    --8<-- "vllm/entrypoints/pooling/base/protocol.py:completion-params"
+    --8<-- "vllm/entrypoints/pooling/base/protocol.py:encoding-params"
+    --8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-params"
+    ```
+
+The following extra parameters are supported:
+
+??? code
+
+    ```python
+    --8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
+    --8<-- "vllm/entrypoints/pooling/base/protocol.py:completion-extra-params"
+    --8<-- "vllm/entrypoints/pooling/base/protocol.py:encoding-extra-params"
+    --8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-extra-params"
+    ```
+
+#### Chat Parameters
+
+For chat-like input (i.e. if `messages` is passed), the following parameters are supported:
+
+??? code
+
+    ```python
+    --8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
+    --8<-- "vllm/entrypoints/pooling/base/protocol.py:chat-params"
+    --8<-- "vllm/entrypoints/pooling/base/protocol.py:encoding-params"
+    --8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-params"
+    ```
+
+These extra parameters are supported instead:
+
+??? 
code + + ```python + --8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params" + --8<-- "vllm/entrypoints/pooling/base/protocol.py:chat-extra-params" + --8<-- "vllm/entrypoints/pooling/base/protocol.py:encoding-extra-params" + --8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-extra-params" + ``` + +#### Examples + +If the model has a [chat template](../../serving/openai_compatible_server.md#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](../../serving/openai_compatible_server.md#chat-api)) +which will be treated as a single prompt to the model. Here is a convenience function for calling the API while retaining OpenAI's type annotations: + +??? code + + ```python + from openai import OpenAI + from openai._types import NOT_GIVEN, NotGiven + from openai.types.chat import ChatCompletionMessageParam + from openai.types.create_embedding_response import CreateEmbeddingResponse + + def create_chat_embeddings( + client: OpenAI, + *, + messages: list[ChatCompletionMessageParam], + model: str, + encoding_format: Union[Literal["base64", "float"], NotGiven] = NOT_GIVEN, + ) -> CreateEmbeddingResponse: + return client.post( + "/embeddings", + cast_to=CreateEmbeddingResponse, + body={"messages": messages, "model": model, "encoding_format": encoding_format}, + ) + ``` + +##### Multi-modal inputs + +You can pass multi-modal inputs to embedding models by defining a custom chat template for the server +and passing a list of `messages` in the request. Refer to the examples below for illustration. + +=== "VLM2Vec" + + To serve the model: + + ```bash + vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling \ + --trust-remote-code \ + --max-model-len 4096 \ + --chat-template examples/pooling/embed/template/vlm2vec_phi3v.jinja + ``` + + !!! 
important + Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--runner pooling` + to run this model in embedding mode instead of text generation mode. + + The custom chat template is completely different from the original one for this model, + and can be found here: [examples/pooling/embed/template/vlm2vec_phi3v.jinja](../../../examples/pooling/embed/template/vlm2vec_phi3v.jinja) + + Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library: + + ??? code + + ```python + from openai import OpenAI + client = OpenAI( + base_url="http://localhost:8000/v1", + api_key="EMPTY", + ) + image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + + response = create_chat_embeddings( + client, + model="TIGER-Lab/VLM2Vec-Full", + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Represent the given image."}, + ], + } + ], + encoding_format="float", + ) + + print("Image embedding output:", response.data[0].embedding) + ``` + +=== "DSE-Qwen2-MRL" + + To serve the model: + + ```bash + vllm serve MrLight/dse-qwen2-2b-mrl-v1 --runner pooling \ + --trust-remote-code \ + --max-model-len 8192 \ + --chat-template examples/pooling/embed/template/dse_qwen2_vl.jinja + ``` + + !!! important + Like with VLM2Vec, we have to explicitly pass `--runner pooling`. + + Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled + by a custom chat template: [examples/pooling/embed/template/dse_qwen2_vl.jinja](../../../examples/pooling/embed/template/dse_qwen2_vl.jinja) + + !!! important + `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code + example below for details. 
+ +Full example: [examples/pooling/embed/vision_embedding_online.py](../../../examples/pooling/embed/vision_embedding_online.py) + +### Cohere Embed API + +Our API is also compatible with [Cohere's Embed v2 API](https://docs.cohere.com/reference/embed) which adds support for some modern embedding feature such as truncation, output dimensions, embedding types, and input types. This endpoint works with any embedding model (including multimodal models). + +#### Cohere Embed API request parameters + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `model` | string | Yes | Model name | +| `input_type` | string | No | Prompt prefix key (model-dependent, see below) | +| `texts` | list[string] | No | Text inputs (use one of `texts`, `images`, or `inputs`) | +| `images` | list[string] | No | Base64 data URI images | +| `inputs` | list[object] | No | Mixed text and image content objects | +| `embedding_types` | list[string] | No | Output types (default: `["float"]`) | +| `output_dimension` | int | No | Truncate embeddings to this dimension (Matryoshka) | +| `truncate` | string | No | `END`, `START`, or `NONE` (default: `END`) | + +#### Text embedding + +```bash +curl -X POST "http://localhost:8000/v2/embed" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Snowflake/snowflake-arctic-embed-m-v1.5", + "input_type": "query", + "texts": ["Hello world", "How are you?"], + "embedding_types": ["float"] + }' +``` + +??? console "Response" + + ```json + { + "id": "embd-...", + "embeddings": { + "float": [ + [0.012, -0.034, ...], + [0.056, 0.078, ...] + ] + }, + "texts": ["Hello world", "How are you?"], + "meta": { + "api_version": {"version": "2"}, + "billed_units": {"input_tokens": 12} + } + } + ``` + +#### Mixed text and image inputs + +For multimodal models, you can embed images by passing base64 data URIs. 
The `inputs` field accepts a list of objects with mixed text and image content: + +```bash +curl -X POST "http://localhost:8000/v2/embed" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "google/siglip-so400m-patch14-384", + "inputs": [ + { + "content": [ + {"type": "text", "text": "A photo of a cat"}, + {"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBOR..."}} + ] + } + ], + "embedding_types": ["float"] + }' +``` + +#### Embedding types + +The `embedding_types` parameter controls the output format. Multiple types can be requested in a single call: + +| Type | Description | +| ---- | ----------- | +| `float` | Raw float32 embeddings (default) | +| `binary` | Bit-packed signed binary | +| `ubinary` | Bit-packed unsigned binary | +| `base64` | Little-endian float32 encoded as base64 | + +```bash +curl -X POST "http://localhost:8000/v2/embed" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Snowflake/snowflake-arctic-embed-m-v1.5", + "input_type": "query", + "texts": ["What is machine learning?"], + "embedding_types": ["float", "binary"] + }' +``` + +??? console "Response" + + ```json + { + "id": "embd-...", + "embeddings": { + "float": [[0.012, -0.034, ...]], + "binary": [[42, -117, ...]] + }, + "texts": ["What is machine learning?"], + "meta": { + "api_version": {"version": "2"}, + "billed_units": {"input_tokens": 8} + } + } + ``` + +#### Truncation + +The `truncate` parameter controls how inputs exceeding the model's maximum sequence length are handled: + +| Value | Behavior | +| ----- | --------- | +| `END` (default) | Keep the first tokens, drop the end | +| `START` | Keep the last tokens, drop the beginning | +| `NONE` | Return an error if the input is too long | + +#### Input type and prompt prefixes + +The `input_type` field selects a prompt prefix to prepend to each text input. 
The available values +depend on the model: + +- **Models with `task_instructions` in `config.json`**: The keys from the `task_instructions` dict are + the valid `input_type` values and the corresponding value is prepended to each text. +- **Models with `config_sentence_transformers.json` prompts**: The keys from the `prompts` dict are + the valid `input_type` values. For example, `Snowflake/snowflake-arctic-embed-xs` defines `"query"`, + so setting `input_type: "query"` prepends `"Represent this sentence for searching relevant passages: "`. +- **Other models**: `input_type` is not accepted and will raise a validation error if passed. + +## More examples + +More examples can be found here: [examples/pooling/embed](../../../examples/pooling/embed) + +## Supported Features + +### Enable/disable normalize + +You can enable or disable normalize via `use_activation`. + +### Matryoshka Embeddings + +[Matryoshka Embeddings](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings) or [Matryoshka Representation Learning (MRL)](https://arxiv.org/abs/2205.13147) is a technique used in training embedding models. It allows users to trade off between performance and cost. + +!!! warning + Not all embedding models are trained using Matryoshka Representation Learning. To avoid misuse of the `dimensions` parameter, vLLM returns an error for requests that attempt to change the output dimension of models that do not support Matryoshka Embeddings. + + For example, setting `dimensions` parameter while using the `BAAI/bge-m3` model will result in the following error. + + ```json + {"object":"error","message":"Model \"BAAI/bge-m3\" does not support matryoshka representation, changing output dimensions will lead to poor results.","type":"BadRequestError","param":null,"code":400} + ``` + +#### Manually enable Matryoshka Embeddings + +There is currently no official interface for specifying support for Matryoshka Embeddings. 
In vLLM, if `is_matryoshka` is `True` in `config.json`, you can change the output dimension to arbitrary values. Use `matryoshka_dimensions` to control the allowed output dimensions. + +For models that support Matryoshka Embeddings but are not recognized by vLLM, manually override the config using `hf_overrides={"is_matryoshka": True}` or `hf_overrides={"matryoshka_dimensions": []}` (offline), or `--hf-overrides '{"is_matryoshka": true}'` or `--hf-overrides '{"matryoshka_dimensions": []}'` (online). + +Here is an example to serve a model with Matryoshka Embeddings enabled. + +```bash +vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf-overrides '{"matryoshka_dimensions":[256]}' +``` + +#### Offline Inference + +You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter in [PoolingParams][vllm.PoolingParams]. + +```python +from vllm import LLM, PoolingParams + +llm = LLM( + model="jinaai/jina-embeddings-v3", + runner="pooling", + trust_remote_code=True, +) +outputs = llm.embed( + ["Follow the white rabbit."], + pooling_params=PoolingParams(dimensions=32), +) +print(outputs[0].outputs) +``` + +A code example can be found here: [examples/pooling/embed/embed_matryoshka_fy_offline.py](../../../examples/pooling/embed/embed_matryoshka_fy_offline.py) + +#### Online Inference + +Use the following command to start the vLLM server. + +```bash +vllm serve jinaai/jina-embeddings-v3 --trust-remote-code +``` + +You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter. 
+ +```bash +curl http://127.0.0.1:8000/v1/embeddings \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "input": "Follow the white rabbit.", + "model": "jinaai/jina-embeddings-v3", + "encoding_format": "float", + "dimensions": 32 + }' +``` + +Expected output: + +```json +{"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}} +``` + +An OpenAI client example can be found here: [examples/pooling/embed/openai_embedding_matryoshka_fy_client.py](../../../examples/pooling/embed/openai_embedding_matryoshka_fy_client.py) + +## Removed Features + +### Remove `normalize` from PoolingParams + +We have already removed `normalize` from PoolingParams, use `use_activation` instead. diff --git a/docs/models/pooling_models/reward.md b/docs/models/pooling_models/reward.md new file mode 100644 index 000000000..8555060e6 --- /dev/null +++ b/docs/models/pooling_models/reward.md @@ -0,0 +1,136 @@ +# Reward Usages + +A reward model (RM) is designed to evaluate and score the quality of outputs generated by a language model, acting as a proxy for human preferences. 
+ +## Summary + +- Model Usage: reward +- Pooling Task: + +| Model Types | Pooling Tasks | +|------------------------------------|----------------| +| (sequence) (outcome) reward models | classify | +| token (outcome) reward models | token_classify | +| process reward models | token_classify | + +- Offline APIs: + - `LLM.encode(..., pooling_task="...")` +- Online APIs: + - Pooling API (`/pooling`) + +## Supported Models + +### Reward Models + +Using sequence classification models as (sequence) (outcome) reward models, the usage and supported features are the same as for normal [classification models](classify.md). + +--8<-- [start:supported-sequence-reward-models] + +| Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) | +| ------------ | ------ | ----------------- | -------------------- | ------------------------- | +| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | +| `Qwen3ForSequenceClassification`C | Qwen3-based | `Skywork/Skywork-Reward-V2-Qwen3-0.6B`, etc. | ✅︎ | ✅︎ | +| `LlamaForSequenceClassification`C | Llama-based | `Skywork/Skywork-Reward-V2-Llama-3.2-1B`, etc. | ✅︎ | ✅︎ | +| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | + +C Automatically converted into a classification model via `--convert classify`. ([details](./README.md#model-conversion)) + +If your model is not in the above list, we will try to automatically convert the model using +[as_seq_cls_model][vllm.model_executor.models.adapters.as_seq_cls_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. 
+ +--8<-- [end:supported-sequence-reward-models] + +### Token Reward Models + +The key distinction between (sequence) classification and token classification lies in their output granularity: (sequence) classification produces a single result for an entire input sequence, whereas token classification yields a result for each individual token within the sequence. + +Using token classification models as token (outcome) reward models, the usage and supported features are the same as for normal [token classification models](token_classify.md). + +--8<-- [start:supported-token-reward-models] + +| Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) | +| ------------ | ------ | ----------------- | -------------------- | ------------------------- | +| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ | +| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ | +| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | + +C Automatically converted into a classification model via `--convert classify`. ([details](./README.md#model-conversion)) + +If your model is not in the above list, we will try to automatically convert the model using +[as_seq_cls_model][vllm.model_executor.models.adapters.as_seq_cls_model]. + +--8<-- [end:supported-token-reward-models] + +### Process Reward Models + +The process reward models used for evaluating intermediate steps are crucial to achieving the desired outcome. + +| Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) | +| ------------ | ------ | ----------------- | -------------------- | ------------------------- | +| `LlamaForCausalLM` | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ | +| `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B`, etc. 
| ✅︎ | ✅︎ |
+
+!!! important
+    For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
+    e.g.: `--pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
+
+## Offline Inference
+
+### Pooling Parameters
+
+The following [pooling parameters][vllm.PoolingParams] are supported.
+
+```python
+--8<-- "vllm/pooling_params.py:common-pooling-params"
+--8<-- "vllm/pooling_params.py:classify-pooling-params"
+```
+
+### `LLM.encode`
+
+The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM.
+
+- Reward Models
+
+Set `pooling_task="classify"` when using `LLM.encode` for (sequence) (outcome) reward models:
+
+```python
+from vllm import LLM
+
+llm = LLM(model="Skywork/Skywork-Reward-V2-Qwen3-0.6B", runner="pooling")
+(output,) = llm.encode("Hello, my name is", pooling_task="classify")
+
+data = output.outputs.data
+print(f"Data: {data!r}")
+```
+
+- Token Reward Models
+
+Set `pooling_task="token_classify"` when using `LLM.encode` for token (outcome) reward models:
+
+```python
+from vllm import LLM
+
+llm = LLM(model="internlm/internlm2-1_8b-reward", runner="pooling", trust_remote_code=True)
+(output,) = llm.encode("Hello, my name is", pooling_task="token_classify")
+
+data = output.outputs.data
+print(f"Data: {data!r}")
+```
+
+- Process Reward Models
+
+Set `pooling_task="token_classify"` when using `LLM.encode` for process reward models:
+
+```python
+from vllm import LLM
+
+llm = LLM(model="Qwen/Qwen2.5-Math-PRM-7B", runner="pooling")
+(output,) = llm.encode("Hello, my name is", pooling_task="token_classify")
+
+data = output.outputs.data
+print(f"Data: {data!r}")
+```
+
+## Online Serving
+
+Please refer to the [pooling API](README.md#pooling-api). For the pooling task corresponding to each reward model type, refer to the [table above](#summary). 
diff --git a/docs/models/pooling_models/scoring.md b/docs/models/pooling_models/scoring.md
new file mode 100644
index 000000000..6227b689a
--- /dev/null
+++ b/docs/models/pooling_models/scoring.md
@@ -0,0 +1,448 @@
+# Scoring Usages
+
+Score models are designed to compute similarity scores between two input prompts. Three model types (aka `score_type`) are supported: `cross-encoder`, `late-interaction`, and `bi-encoder`.
+
+!!! note
+    vLLM handles only the model inference component of RAG pipelines (such as embedding generation and reranking). For higher-level RAG orchestration, you should leverage integration frameworks like [LangChain](https://github.com/langchain-ai/langchain).
+
+## Summary
+
+- Model Usage: Scoring
+- Pooling Task:
+
+| Score Types | Pooling Tasks | Scoring Function |
+|--------------------|---------------|---------------------------|
+| `cross-encoder` | `score` | linear classifier |
+| `late-interaction` | `token_embed` | late interaction (MaxSim) |
+| `bi-encoder` | `embed` | cosine similarity |
+
+- Offline APIs:
+  - `LLM.score`
+- Online APIs:
+  - [Score API](scoring.md#score-api) (`/score`)
+  - [Rerank API](scoring.md#rerank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`)
+
+## Supported Models
+
+### Cross-encoder models
+
+[Cross-encoder](https://www.sbert.net/examples/applications/cross-encoder/README.html) (aka reranker) models are a subset of classification models that accept two prompts as input and output `num_labels` equal to 1.
+
+--8<-- [start:supported-score-models]
+
+#### Text-only Models
+
+| Architecture | Models | Example HF Models | Score template (see note) | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
+| ------------ | ------ | ----------------- | ------------------------- | --------------------------- | --------------------------------------- |
+| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. 
| N/A | | | +| `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma`(see note), etc. | [bge-reranker-v2-gemma.jinja](../../../examples/pooling/score/template/bge-reranker-v2-gemma.jinja) | ✅︎ | ✅︎ | +| `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | N/A | | | +| `LlamaBidirectionalForSequenceClassification`C | Llama-based with bidirectional attention | `nvidia/llama-nemotron-rerank-1b-v2`, etc. | [nemotron-rerank.jinja](../../../examples/pooling/score/template/nemotron-rerank.jinja) | ✅︎ | ✅︎ | +| `Qwen2ForSequenceClassification`C | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2`(see note), etc. | [mxbai_rerank_v2.jinja](../../../examples/pooling/score/template/mxbai_rerank_v2.jinja) | ✅︎ | ✅︎ | +| `Qwen3ForSequenceClassification`C | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B`(see note), etc. | [qwen3_reranker.jinja](../../../examples/pooling/score/template/qwen3_reranker.jinja) | ✅︎ | ✅︎ | +| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | N/A | | | +| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | N/A | | | +| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | N/A | \* | \* | + +C Automatically converted into a classification model via `--convert classify`. ([details](./README.md#model-conversion)) +\* Feature support is the same as that of the original model. + +!!! note + Some models require a specific prompt format to work correctly. + + You can find Example HF Models's corresponding score template in [examples/pooling/score/template/](../../../examples/pooling/score/template) + + Examples : [examples/pooling/score/using_template_offline.py](../../../examples/pooling/score/using_template_offline.py) [examples/pooling/score/using_template_online.py](../../../examples/pooling/score/using_template_online.py) + +!!! 
note + Load the official original `BAAI/bge-reranker-v2-gemma` by using the following command. + + ```bash + vllm serve BAAI/bge-reranker-v2-gemma --hf_overrides '{"architectures": ["GemmaForSequenceClassification"],"classifier_from_token": ["Yes"],"method": "no_post_processing"}' + ``` + +!!! note + The second-generation GTE model (mGTE-TRM) is named `NewForSequenceClassification`. The name `NewForSequenceClassification` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewForSequenceClassification"]}'` to specify the use of the `GteNewForSequenceClassification` architecture. + +!!! note + Load the official original `mxbai-rerank-v2` by using the following command. + + ```bash + vllm serve mixedbread-ai/mxbai-rerank-base-v2 --hf_overrides '{"architectures": ["Qwen2ForSequenceClassification"],"classifier_from_token": ["0", "1"], "method": "from_2_way_softmax"}' + ``` + +!!! note + Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/pooling/score/qwen3_reranker_offline.py](../../../examples/pooling/score/qwen3_reranker_offline.py) [examples/pooling/score/qwen3_reranker_online.py](../../../examples/pooling/score/qwen3_reranker_online.py). + + ```bash + vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' + ``` + +#### Multimodal Models + +!!! note + For more information about multimodal models inputs, see [this page](../supported_models.md#list-of-multimodal-language-models). + +| Architecture | Models | Inputs | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) | +| ------------ | ------ | ------ | ----------------- | ------------------------------ | ------------------------------------------ | +| `JinaVLForSequenceClassification` | JinaVL-based | T + IE+ | `jinaai/jina-reranker-m0`, etc. 
| ✅︎ | ✅︎ | +| `LlamaNemotronVLForSequenceClassification` | Llama Nemotron Reranker + SigLIP | T + IE+ | `nvidia/llama-nemotron-rerank-vl-1b-v2` | | | +| `Qwen3VLForSequenceClassification` | Qwen3-VL-Reranker | T + IE+ + VE+ | `Qwen/Qwen3-VL-Reranker-2B`(see note), etc. | ✅︎ | ✅︎ | + +C Automatically converted into a classification model via `--convert classify`. ([details](README.md#model-conversion)) +\* Feature support is the same as that of the original model. + +!!! note + Similar to Qwen3-Reranker, you need to use the following `--hf_overrides` to load the official original `Qwen3-VL-Reranker`. + + ```bash + vllm serve Qwen/Qwen3-VL-Reranker-2B --hf_overrides '{"architectures": ["Qwen3VLForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' + ``` + +--8<-- [end:supported-score-models] + +### Late-interaction models + +All models that support token embedding task also support using the score API to compute similarity scores by calculating the late interaction of two input prompts. See [this page](token_embed.md) for more information about token embedding models. + +--8<-- "docs/models/pooling_models/token_embed.md:supported-token-embed-models" + +### Bi-encoder + +All models that support embedding task also support using the score API to compute similarity scores by calculating the cosine similarity of two input prompt's embeddings. See [this page](embed.md) for more information about embedding models. + +--8<-- "docs/models/pooling_models/embed.md:supported-embed-models" + +## Offline Inference + +### Pooling Parameters + +The following [pooling parameters][vllm.PoolingParams] are only supported by cross-encoder models and do not work for late-interaction and bi-encoder models. + +```python +--8<-- "vllm/pooling_params.py:common-pooling-params" +--8<-- "vllm/pooling_params.py:classify-pooling-params" +``` + +### `LLM.score` + +The [score][vllm.LLM.score] method outputs similarity scores between sentence pairs. 
+ +```python +from vllm import LLM + +llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling") +(output,) = llm.score( + "What is the capital of France?", + "The capital of Brazil is Brasilia.", +) + +score = output.outputs.score +print(f"Score: {score}") +``` + +A code example can be found here: [examples/basic/offline_inference/score.py](../../../examples/basic/offline_inference/score.py) + +## Online Serving + +### Score API + +Our Score API (`/score`) is similar to `LLM.score`, compute similarity scores between two input prompts. + +#### Parameters + +The following Score API parameters are supported: + +```python +--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params" +--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params" +--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params" +``` + +#### Examples + +##### Single inference + +You can pass a string to both `queries` and `documents`, forming a single sentence pair. + +```bash +curl -X 'POST' \ + 'http://127.0.0.1:8000/score' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "BAAI/bge-reranker-v2-m3", + "encoding_format": "float", + "queries": "What is the capital of France?", + "documents": "The capital of France is Paris." +}' +``` + +??? console "Response" + + ```json + { + "id": "score-request-id", + "object": "list", + "created": 693447, + "model": "BAAI/bge-reranker-v2-m3", + "data": [ + { + "index": 0, + "object": "score", + "score": 1 + } + ], + "usage": {} + } + ``` + +##### Batch inference + +You can pass a string to `queries` and a list to `documents`, forming multiple sentence pairs +where each pair is built from `queries` and a string in `documents`. +The total number of pairs is `len(documents)`. + +??? 
console "Request" + + ```bash + curl -X 'POST' \ + 'http://127.0.0.1:8000/score' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "BAAI/bge-reranker-v2-m3", + "queries": "What is the capital of France?", + "documents": [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris." + ] + }' + ``` + +??? console "Response" + + ```json + { + "id": "score-request-id", + "object": "list", + "created": 693570, + "model": "BAAI/bge-reranker-v2-m3", + "data": [ + { + "index": 0, + "object": "score", + "score": 0.001094818115234375 + }, + { + "index": 1, + "object": "score", + "score": 1 + } + ], + "usage": {} + } + ``` + +You can pass a list to both `queries` and `documents`, forming multiple sentence pairs +where each pair is built from a string in `queries` and the corresponding string in `documents` (similar to `zip()`). +The total number of pairs is `len(documents)`. + +??? console "Request" + + ```bash + curl -X 'POST' \ + 'http://127.0.0.1:8000/score' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "BAAI/bge-reranker-v2-m3", + "encoding_format": "float", + "queries": [ + "What is the capital of Brazil?", + "What is the capital of France?" + ], + "documents": [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris." + ] + }' + ``` + +??? console "Response" + + ```json + { + "id": "score-request-id", + "object": "list", + "created": 693447, + "model": "BAAI/bge-reranker-v2-m3", + "data": [ + { + "index": 0, + "object": "score", + "score": 1 + }, + { + "index": 1, + "object": "score", + "score": 1 + } + ], + "usage": {} + } + ``` + +##### Multi-modal inputs + +You can pass multi-modal inputs to scoring models by passing `content` including a list of multi-modal input (image, etc.) in the request. Refer to the examples below for illustration. 
+ +=== "JinaVL-Reranker" + + To serve the model: + + ```bash + vllm serve jinaai/jina-reranker-m0 + ``` + + Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library: + + ??? Code + + ```python + import requests + + response = requests.post( + "http://localhost:8000/v1/score", + json={ + "model": "jinaai/jina-reranker-m0", + "queries": "slm markdown", + "documents": [ + { + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png" + }, + } + ], + }, + { + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png" + }, + } + ] + }, + ], + }, + ) + response.raise_for_status() + response_json = response.json() + print("Scoring output:", response_json["data"][0]["score"]) + print("Scoring output:", response_json["data"][1]["score"]) + ``` +Full example: + +- [examples/pooling/score/vision_score_api_online.py](../../../examples/pooling/score/vision_score_api_online.py) +- [examples/pooling/score/vision_rerank_api_online.py](../../../examples/pooling/score/vision_rerank_api_online.py) + +### Rerank API + +`/rerank`, `/v1/rerank`, and `/v2/rerank` APIs are compatible with both [Jina AI's rerank API interface](https://jina.ai/reranker/) and +[Cohere's rerank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with +popular open-source tools. 
+
+Code example: [examples/pooling/score/rerank_api_online.py](../../../examples/pooling/score/rerank_api_online.py)
+
+#### Parameters
+
+The following rerank API parameters are supported:
+
+```python
+--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
+--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
+--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
+```
+
+#### Examples
+
+Note that the `top_n` request parameter is optional and will default to the length of the `documents` field.
+Result documents will be sorted by relevance, and the `index` property can be used to determine original order.
+
+??? console "Request"
+
+    ```bash
+    curl -X 'POST' \
+      'http://127.0.0.1:8000/v1/rerank' \
+      -H 'accept: application/json' \
+      -H 'Content-Type: application/json' \
+      -d '{
+        "model": "BAAI/bge-reranker-base",
+        "query": "What is the capital of France?",
+        "documents": [
+          "The capital of Brazil is Brasilia.",
+          "The capital of France is Paris.",
+          "Horses and cows are both animals"
+        ]
+      }'
+    ```
+
+??? console "Response"
+
+    ```json
+    {
+      "id": "rerank-fae51b2b664d4ed38f5969b612edff77",
+      "model": "BAAI/bge-reranker-base",
+      "usage": {
+        "total_tokens": 56
+      },
+      "results": [
+        {
+          "index": 1,
+          "document": {
+            "text": "The capital of France is Paris."
+          },
+          "relevance_score": 0.99853515625
+        },
+        {
+          "index": 0,
+          "document": {
+            "text": "The capital of Brazil is Brasilia."
+          },
+          "relevance_score": 0.0005860328674316406
+        }
+      ]
+    }
+    ```
+
+## More examples
+
+More examples can be found here: [examples/pooling/score](../../../examples/pooling/score)
+
+## Supported Features
+
+As cross-encoder models are a subset of classification models that accept two prompts as input and output `num_labels` equal to 1, cross-encoder features should be consistent with (sequence) classification. For more information, see [this page](classify.md#supported-features).
+
+### Score Template
+
+Score templates are supported for **cross-encoder** models only. If you are using an **embedding** model for scoring, vLLM does not apply a score template.
+
+Some scoring models require a specific prompt format to work correctly. You can specify a custom score template using the `--chat-template` parameter (see [Chat Template](../../serving/openai_compatible_server.md#chat-template)).
+
+Like chat templates, the score template receives a `messages` list. For scoring, each message has a `role` attribute—either `"query"` or `"document"`. For the usual kind of point-wise cross-encoder, you can expect exactly two messages: one query and one document. To access the query and document content, use Jinja's `selectattr` filter:
+
+- **Query**: `{{ (messages | selectattr("role", "eq", "query") | first).content }}`
+- **Document**: `{{ (messages | selectattr("role", "eq", "document") | first).content }}`
+
+This approach is more robust than index-based access (`messages[0]`, `messages[1]`) because it selects messages by their semantic role. It also avoids assumptions about message ordering if additional message types are added to `messages` in the future.
+
+Example template file: [examples/pooling/score/template/nemotron-rerank.jinja](../../../examples/pooling/score/template/nemotron-rerank.jinja)
+
+### Enable/disable activation
+
+You can enable or disable activation via `use_activation`. This setting only works for cross-encoder models.
diff --git a/docs/models/pooling_models/specific_models.md b/docs/models/pooling_models/specific_models.md
new file mode 100644
index 000000000..4b0027a3d
--- /dev/null
+++ b/docs/models/pooling_models/specific_models.md
@@ -0,0 +1,395 @@
+# Specific Model Examples
+
+## ColBERT Late Interaction Models
+
+[ColBERT](https://arxiv.org/abs/2004.12832) (Contextualized Late Interaction over BERT) is a retrieval model that uses per-token embeddings and MaxSim scoring for document ranking. 
Unlike single-vector embedding models, ColBERT retains token-level representations and computes relevance scores through late interaction, providing better accuracy while being more efficient than cross-encoders. + +vLLM supports ColBERT models with multiple encoder backbones: + +| Architecture | Backbone | Example HF Models | +| - | - | - | +| `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` | +| `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` | +| `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` | + +**BERT-based ColBERT** models work out of the box: + +```shell +vllm serve answerdotai/answerai-colbert-small-v1 +``` + +For **non-BERT backbones**, use `--hf-overrides` to set the correct architecture: + +```shell +# ModernBERT backbone +vllm serve lightonai/GTE-ModernColBERT-v1 \ + --hf-overrides '{"architectures": ["ColBERTModernBertModel"]}' + +# Jina XLM-RoBERTa backbone +vllm serve jinaai/jina-colbert-v2 \ + --hf-overrides '{"architectures": ["ColBERTJinaRobertaModel"]}' \ + --trust-remote-code +``` + +Then you can use the rerank API: + +```shell +curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{ + "model": "answerdotai/answerai-colbert-small-v1", + "query": "What is machine learning?", + "documents": [ + "Machine learning is a subset of artificial intelligence.", + "Python is a programming language.", + "Deep learning uses neural networks." 
+ ] +}' +``` + +Or the score API: + +```shell +curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{ + "model": "answerdotai/answerai-colbert-small-v1", + "text_1": "What is machine learning?", + "text_2": ["Machine learning is a subset of AI.", "The weather is sunny."] +}' +``` + +You can also get the raw token embeddings using the pooling API with `token_embed` task: + +```shell +curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{ + "model": "answerdotai/answerai-colbert-small-v1", + "input": "What is machine learning?", + "task": "token_embed" +}' +``` + +An example can be found here: [examples/pooling/score/colbert_rerank_online.py](../../../examples/pooling/score/colbert_rerank_online.py) + +## ColQwen3 Multi-Modal Late Interaction Models + +ColQwen3 is based on [ColPali](https://arxiv.org/abs/2407.01449), which extends ColBERT's late interaction approach to **multi-modal** inputs. While ColBERT operates on text-only token embeddings, ColPali/ColQwen3 can embed both **text and images** (e.g. PDF pages, screenshots, diagrams) into per-token L2-normalized vectors and compute relevance via MaxSim scoring. ColQwen3 specifically uses Qwen3-VL as its vision-language backbone. 
+ +| Architecture | Backbone | Example HF Models | +| - | - | - | +| `ColQwen3` | Qwen3-VL | `TomoroAI/tomoro-colqwen3-embed-4b`, `TomoroAI/tomoro-colqwen3-embed-8b` | +| `OpsColQwen3Model` | Qwen3-VL | `OpenSearch-AI/Ops-Colqwen3-4B`, `OpenSearch-AI/Ops-Colqwen3-8B` | +| `Qwen3VLNemotronEmbedModel` | Qwen3-VL | `nvidia/nemotron-colembed-vl-4b-v2`, `nvidia/nemotron-colembed-vl-8b-v2` | + +Start the server: + +```shell +vllm serve TomoroAI/tomoro-colqwen3-embed-4b --max-model-len 4096 +``` + +### Text-only scoring and reranking + +Use the `/rerank` API: + +```shell +curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{ + "model": "TomoroAI/tomoro-colqwen3-embed-4b", + "query": "What is machine learning?", + "documents": [ + "Machine learning is a subset of artificial intelligence.", + "Python is a programming language.", + "Deep learning uses neural networks." + ] +}' +``` + +Or the `/score` API: + +```shell +curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{ + "model": "TomoroAI/tomoro-colqwen3-embed-4b", + "text_1": "What is the capital of France?", + "text_2": ["The capital of France is Paris.", "Python is a programming language."] +}' +``` + +### Multi-modal scoring and reranking (text query × image documents) + +The `/score` and `/rerank` APIs also accept multi-modal inputs directly. 
+Pass image documents using the `data_1`/`data_2` (for `/score`) or `documents` (for `/rerank`) fields +with a `content` list containing `image_url` and `text` parts — the same format used by the +OpenAI chat completion API: + +Score a text query against image documents: + +```shell +curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{ + "model": "TomoroAI/tomoro-colqwen3-embed-4b", + "data_1": "Retrieve the city of Beijing", + "data_2": [ + { + "content": [ + {"type": "image_url", "image_url": {"url": "data:image/png;base64,"}}, + {"type": "text", "text": "Describe the image."} + ] + } + ] +}' +``` + +Rerank image documents by a text query: + +```shell +curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{ + "model": "TomoroAI/tomoro-colqwen3-embed-4b", + "query": "Retrieve the city of Beijing", + "documents": [ + { + "content": [ + {"type": "image_url", "image_url": {"url": "data:image/png;base64,"}}, + {"type": "text", "text": "Describe the image."} + ] + }, + { + "content": [ + {"type": "image_url", "image_url": {"url": "data:image/png;base64,"}}, + {"type": "text", "text": "Describe the image."} + ] + } + ], + "top_n": 2 +}' +``` + +### Raw token embeddings + +You can also get the raw token embeddings using the `/pooling` API with `token_embed` task: + +```shell +curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{ + "model": "TomoroAI/tomoro-colqwen3-embed-4b", + "input": "What is machine learning?", + "task": "token_embed" +}' +``` + +For **image inputs** via the pooling API, use the chat-style `messages` field: + +```shell +curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{ + "model": "TomoroAI/tomoro-colqwen3-embed-4b", + "messages": [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": "data:image/png;base64,"}}, + {"type": "text", "text": "Describe the image."} + ] + } + ] +}' +``` + +### Examples + +- Multi-vector 
retrieval: [examples/pooling/token_embed/colqwen3_token_embed_online.py](../../../examples/pooling/token_embed/colqwen3_token_embed_online.py) +- Reranking (text + multi-modal): [examples/pooling/score/colqwen3_rerank_online.py](../../../examples/pooling/score/colqwen3_rerank_online.py) + +## ColQwen3.5 Multi-Modal Late Interaction Models + +ColQwen3.5 is based on [ColPali](https://arxiv.org/abs/2407.01449), extending ColBERT's late interaction approach to **multi-modal** inputs. It uses the Qwen3.5 hybrid backbone (linear + full attention) and produces per-token L2-normalized vectors for MaxSim scoring. + +| Architecture | Backbone | Example HF Models | +| - | - | - | +| `ColQwen3_5` | Qwen3.5 | `athrael-soju/colqwen3.5-4.5B` | + +Start the server: + +```shell +vllm serve athrael-soju/colqwen3.5-4.5B --max-model-len 4096 +``` + +Then you can use the rerank endpoint: + +```shell +curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{ + "model": "athrael-soju/colqwen3.5-4.5B", + "query": "What is machine learning?", + "documents": [ + "Machine learning is a subset of artificial intelligence.", + "Python is a programming language.", + "Deep learning uses neural networks." + ] +}' +``` + +Or the score endpoint: + +```shell +curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{ + "model": "athrael-soju/colqwen3.5-4.5B", + "text_1": "What is the capital of France?", + "text_2": ["The capital of France is Paris.", "Python is a programming language."] +}' +``` + +An example can be found here: [examples/pooling/score/colqwen3_5_rerank_online.py](../../../examples/pooling/score/colqwen3_5_rerank_online.py) + +## Llama Nemotron Multimodal + +### Embedding Model + +Llama Nemotron VL Embedding models combine the bidirectional Llama embedding backbone +(from `nvidia/llama-nemotron-embed-1b-v2`) with SigLIP as the vision encoder to produce +single-vector embeddings from text and/or images. 
+ +| Architecture | Backbone | Example HF Models | +| - | - | - | +| `LlamaNemotronVLModel` | Bidirectional Llama + SigLIP | `nvidia/llama-nemotron-embed-vl-1b-v2` | + +Start the server: + +```shell +vllm serve nvidia/llama-nemotron-embed-vl-1b-v2 \ + --trust-remote-code \ + --chat-template examples/pooling/embed/template/nemotron_embed_vl.jinja +``` + +!!! note + The chat template bundled with this model's tokenizer is not suitable for + the embeddings API. Use the provided override template above when serving + with the `messages`-based (chat-style) embeddings API. + + The override template uses the message `role` to automatically prepend the + appropriate prefix: set `role` to `"query"` for queries (prepends `query: `) + or `"document"` for passages (prepends `passage: `). Any other role omits + the prefix. + +Embed text queries: + +```shell +curl -s http://localhost:8000/v1/embeddings -H "Content-Type: application/json" -d '{ + "model": "nvidia/llama-nemotron-embed-vl-1b-v2", + "messages": [ + { + "role": "query", + "content": [ + {"type": "text", "text": "What is machine learning?"} + ] + } + ] +}' +``` + +Embed images via the chat-style `messages` field: + +```shell +curl -s http://localhost:8000/v1/embeddings -H "Content-Type: application/json" -d '{ + "model": "nvidia/llama-nemotron-embed-vl-1b-v2", + "messages": [ + { + "role": "document", + "content": [ + {"type": "image_url", "image_url": {"url": "data:image/png;base64,"}}, + {"type": "text", "text": "Describe the image."} + ] + } + ] +}' +``` + +### Reranker Model + +Llama Nemotron VL reranker models combine the same bidirectional Llama + SigLIP +backbone with a sequence-classification head for cross-encoder scoring and reranking. 
+ +| Architecture | Backbone | Example HF Models | +| - | - | - | +| `LlamaNemotronVLForSequenceClassification` | Bidirectional Llama + SigLIP | `nvidia/llama-nemotron-rerank-vl-1b-v2` | + +Start the server: + +```shell +vllm serve nvidia/llama-nemotron-rerank-vl-1b-v2 \ + --runner pooling \ + --trust-remote-code \ + --chat-template examples/pooling/score/template/nemotron-vl-rerank.jinja +``` + +!!! note + The chat template bundled with this checkpoint's tokenizer is not suitable + for the Score/Rerank APIs. Use the provided override template when serving: + `examples/pooling/score/template/nemotron-vl-rerank.jinja`. + +Score a text query against an image document: + +```shell +curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{ + "model": "nvidia/llama-nemotron-rerank-vl-1b-v2", + "data_1": "Find diagrams about autonomous robots", + "data_2": [ + { + "content": [ + {"type": "image_url", "image_url": {"url": "data:image/png;base64,"}}, + {"type": "text", "text": "Robotics workflow diagram."} + ] + } + ] +}' +``` + +Rerank image documents by a text query: + +```shell +curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{ + "model": "nvidia/llama-nemotron-rerank-vl-1b-v2", + "query": "Find diagrams about autonomous robots", + "documents": [ + { + "content": [ + {"type": "image_url", "image_url": {"url": "data:image/png;base64,"}}, + {"type": "text", "text": "Robotics workflow diagram."} + ] + }, + { + "content": [ + {"type": "image_url", "image_url": {"url": "data:image/png;base64,"}}, + {"type": "text", "text": "General skyline photo."} + ] + } + ], + "top_n": 2 +}' +``` + +## BAAI/bge-m3 + +The `BAAI/bge-m3` model comes with extra weights for sparse and colbert embeddings but unfortunately in its `config.json` +the architecture is declared as `XLMRobertaModel`, which makes `vLLM` load it as a vanilla ROBERTA model without the +extra weights. 
To load the full model weights, override its architecture like this: + +```shell +vllm serve BAAI/bge-m3 --hf-overrides '{"architectures": ["BgeM3EmbeddingModel"]}' +``` + +Then you obtain the sparse embeddings like this: + +```shell +curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{ + "model": "BAAI/bge-m3", + "task": "token_classify", + "input": ["What is BGE M3?", "Definition of BM25"] +}' +``` + +Due to limitations in the output schema, the output consists of a list of +token scores for each token for each input. This means that you'll have to call +`/tokenize` as well to be able to pair tokens with scores. +Refer to the tests in `tests/models/language/pooling/test_bge_m3.py` to see how +to do that. + +You can obtain the colbert embeddings like this: + +```shell +curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{ + "model": "BAAI/bge-m3", + "task": "token_embed", + "input": ["What is BGE M3?", "Definition of BM25"] +}' +``` diff --git a/docs/models/pooling_models/token_classify.md b/docs/models/pooling_models/token_classify.md new file mode 100644 index 000000000..c46a2bdf6 --- /dev/null +++ b/docs/models/pooling_models/token_classify.md @@ -0,0 +1,89 @@ +# Token Classification Usages + +## Summary + +- Model Usage: token classification +- Pooling Tasks: `token_classify` +- Offline APIs: + - `LLM.encode(..., pooling_task="token_classify")` +- Online APIs: + - Pooling API (`/pooling`) + +The key distinction between (sequence) classification and token classification lies in their output granularity: (sequence) classification produces a single result for an entire input sequence, whereas token classification yields a result for each individual token within the sequence. + +Many classification models support both (sequence) classification and token classification. For further details on (sequence) classification, please refer to [this page](classify.md). 
+
+## Typical Use Cases
+
+### Named Entity Recognition (NER)
+
+For implementation examples, see:
+
+Offline: [examples/pooling/token_classify/ner_offline.py](../../../examples/pooling/token_classify/ner_offline.py)
+
+Online: [examples/pooling/token_classify/ner_online.py](../../../examples/pooling/token_classify/ner_online.py)
+
+### Sparse retrieval (lexical matching)
+
+The BAAI/bge-m3 model leverages token classification for sparse retrieval. For more information, see [this page](specific_models.md#baaibge-m3).
+
+## Supported Models
+
+| Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
+| ------------ | ------ | ----------------- | --------------------------- | --------------------------------------- |
+| `BertForTokenClassification` | bert-based | `boltuix/NeuroBERT-NER` (see note), etc. | | |
+| `ErnieForTokenClassification` | BERT-like Chinese ERNIE | `gyr66/Ernie-3.0-base-chinese-finetuned-ner` | | |
+| `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | |
+| `Qwen3ForTokenClassification`C | Qwen3-based | `bd2lcco/Qwen3-0.6B-finetuned` | | |
+| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* |
+
+C Automatically converted into a classification model via `--convert classify`. ([details](./README.md#model-conversion))
+\* Feature support is the same as that of the original model.
+
+If your model is not in the above list, we will try to automatically convert the model using
+[as_seq_cls_model][vllm.model_executor.models.adapters.as_seq_cls_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
+
+### As Reward Models
+
+Token classification models can also be used as reward models. For details on reward models, see [Reward Models](reward.md).
+ +--8<-- "docs/models/pooling_models/reward.md:supported-token-reward-models" + +## Offline Inference + +### Pooling Parameters + +The following [pooling parameters][vllm.PoolingParams] are supported. + +```python +--8<-- "vllm/pooling_params.py:common-pooling-params" +--8<-- "vllm/pooling_params.py:classify-pooling-params" +``` + +### `LLM.encode` + +The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM. + +Set `pooling_task="token_classify"` when using `LLM.encode` for token classification Models: + +```python +from vllm import LLM + +llm = LLM(model="boltuix/NeuroBERT-NER", runner="pooling") +(output,) = llm.encode("Hello, my name is", pooling_task="token_classify") + +data = output.outputs.data +print(f"Data: {data!r}") +``` + +## Online Serving + +Please refer to the [pooling API](README.md#pooling-api) and use `"task":"token_classify"`. + +## More examples + +More examples can be found here: [examples/pooling/token_classify](../../../examples/pooling/token_classify) + +## Supported Features + +Token classification features should be consistent with (sequence) classification. For more information, see [this page](classify.md#supported-features). diff --git a/docs/models/pooling_models/token_embed.md b/docs/models/pooling_models/token_embed.md new file mode 100644 index 000000000..c950d2e99 --- /dev/null +++ b/docs/models/pooling_models/token_embed.md @@ -0,0 +1,125 @@ +# Token Embedding Usages + +## Summary + +- Model Usage: Token classification models +- Pooling Tasks: `token_embed` +- Offline APIs: + - `LLM.encode(..., pooling_task="token_embed")` +- Online APIs: + - Pooling API (`/pooling`) + +The difference between the (sequence) embedding task and the token embedding task is that (sequence) embedding outputs one embedding for each sequence, while token embedding outputs a embedding for each token. + +Many embedding models support both (sequence) embedding and token embedding. 
For further details on (sequence) embedding, please refer to [this page](embed.md). + +## Typical Use Cases + +### Multi-Vector Retrieval + +For implementation examples, see: + +Offline: [examples/pooling/token_embed/multi_vector_retrieval_offline.py](../../../examples/pooling/token_embed/multi_vector_retrieval_offline.py) + +Online: [examples/pooling/token_embed/multi_vector_retrieval_online.py](../../../examples/pooling/token_embed/multi_vector_retrieval_online.py) + +### Late interaction + +Similarity scores can be computed using late interaction between two input prompts via the score API. For more information, see [Score API](scoring.md). + +### Extract last hidden states + +Models of any architecture can be converted into embedding models using `--convert embed`. Token embedding can then be used to extract the last hidden states from these models. + +## Supported Models + +--8<-- [start:supported-token-embed-models] + +### Text-only Models + +| Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) | +| ------------ | ------ | ----------------- | -------------------- | ------------------------- | +| `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` | | | +| `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` | | | +| `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` | | | +| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | + +### Multimodal Models + +!!! note + For more information about multimodal models inputs, see [this page](../supported_models.md#list-of-multimodal-language-models). 
+ +| Architecture | Models | Inputs | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) | +| ------------ | ------ | ----- | ----------------- | ------------------------------ | ------------------------------------------ | +| `ColModernVBertForRetrieval` | ColModernVBERT | T / I | `ModernVBERT/colmodernvbert-merged` | | | +| `ColPaliForRetrieval` | ColPali | T / I | `vidore/colpali-v1.3-hf` | | | +| `ColQwen3` | Qwen3-VL | T / I | `TomoroAI/tomoro-colqwen3-embed-4b`, `TomoroAI/tomoro-colqwen3-embed-8b` | | | +| `ColQwen3_5` | ColQwen3.5 | T + I + V | `athrael-soju/colqwen3.5-4.5B-v3` | | | +| `OpsColQwen3Model` | Qwen3-VL | T / I | `OpenSearch-AI/Ops-Colqwen3-4B`, `OpenSearch-AI/Ops-Colqwen3-8B` | | | +| `Qwen3VLNemotronEmbedModel` | Qwen3-VL | T / I | `nvidia/nemotron-colembed-vl-4b-v2`, `nvidia/nemotron-colembed-vl-8b-v2` | ✅︎ | ✅︎ | +| `*ForConditionalGeneration`C, `*ForCausalLM`C, etc. | Generative models | \* | N/A | \* | \* | + +C Automatically converted into an embedding model via `--convert embed`. ([details](./README.md#model-conversion)) +\* Feature support is the same as that of the original model. + +If your model is not in the above list, we will try to automatically convert the model using [as_embedding_model][vllm.model_executor.models.adapters.as_embedding_model]. + +--8<-- [end:supported-token-embed-models] + +## Offline Inference + +### Pooling Parameters + +The following [pooling parameters][vllm.PoolingParams] are supported. + +```python +--8<-- "vllm/pooling_params.py:common-pooling-params" +--8<-- "vllm/pooling_params.py:embed-pooling-params" +``` + +### `LLM.encode` + +The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM. 
+ +Set `pooling_task="token_embed"` when using `LLM.encode` for token embedding Models: + +```python +from vllm import LLM + +llm = LLM(model="answerdotai/answerai-colbert-small-v1", runner="pooling") +(output,) = llm.encode("Hello, my name is", pooling_task="token_embed") + +data = output.outputs.data +print(f"Data: {data!r}") +``` + +### `LLM.score` + +The [score][vllm.LLM.score] method outputs similarity scores between sentence pairs. + +All models that support token embedding task also support using the score API to compute similarity scores by calculating the late interaction of two input prompts. + +```python +from vllm import LLM + +llm = LLM(model="answerdotai/answerai-colbert-small-v1", runner="pooling") +(output,) = llm.score( + "What is the capital of France?", + "The capital of Brazil is Brasilia.", +) + +score = output.outputs.score +print(f"Score: {score}") +``` + +## Online Serving + +Please refer to the [pooling API](README.md#pooling-api) and use `"task":"token_embed"`. + +## More examples + +More examples can be found here: [examples/pooling/token_embed](../../../examples/pooling/token_embed) + +## Supported Features + +Token embedding features should be consistent with (sequence) embedding. For more information, see [this page](embed.md#supported-features). diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index f36f74308..07e7da344 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -1,6 +1,6 @@ # Supported Models -vLLM supports [generative](./generative_models.md) and [pooling](./pooling_models.md) models across various tasks. +vLLM supports [generative](./generative_models.md) and [pooling](./pooling_models/README.md) models across various tasks. For each task, we list the model architectures that have been implemented in vLLM. Alongside each architecture, we include some popular models that use it. 
@@ -499,156 +499,6 @@ Some models are supported only via the [Transformers modeling backend](#transfor !!! note Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. -### Pooling Models - -See [this page](./pooling_models.md) for more information on how to use pooling models. - -!!! important - Since some model architectures support both generative and pooling tasks, - you should explicitly specify `--runner pooling` to ensure that the model is used in pooling mode instead of generative mode. - -#### Embedding - -These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) API. - -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | -| ------------ | ------ | ----------------- | -------------------- | ------------------------- | -| `BertModel`C | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | -| `BertSpladeSparseEmbeddingModel` | SPLADE | `naver/splade-v3` | | | -| `ErnieModel` | BERT-like Chinese ERNIE | `shibing624/text2vec-base-chinese-sentence` | | | -| `Gemma2Model`C | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ | -| `Gemma3TextModel`C | Gemma 3-based | `google/embeddinggemma-300m`, etc. | ✅︎ | ✅︎ | -| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | -| `GteModel`C | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | | -| `GteNewModel`C | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | | -| `ModernBertModel`C | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | | -| `NomicBertModel`C | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | | -| `LlamaBidirectionalModel`C | Llama-based with bidirectional attention | `nvidia/llama-nemotron-embed-1b-v2`, etc. | ✅︎ | ✅︎ | -| `LlamaModel`C, `LlamaForCausalLM`C, `MistralModel`C, etc. 
| Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ | -| `Qwen2Model`C, `Qwen2ForCausalLM`C | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ | -| `Qwen3Model`C, `Qwen3ForCausalLM`C | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ | -| `VoyageQwen3BidirectionalEmbedModel`C | Voyage Qwen3-based with bidirectional attention | `voyageai/voyage-4-nano`, etc. | ✅︎ | ✅︎ | -| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | | -| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | - -C Automatically converted into an embedding model via `--convert embed`. ([details](./pooling_models.md#model-conversion)) -\* Feature support is the same as that of the original model. - -!!! note - `ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. - You need to manually set mean pooling by passing `--pooler-config '{"pooling_type": "MEAN"}'`. - -!!! note - For `Alibaba-NLP/gte-Qwen2-*`, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded. - See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882). - -!!! note - `jinaai/jina-embeddings-v3` supports multiple tasks through LoRA, while vllm temporarily only supports text-matching tasks by merging LoRA weights. - -!!! note - The second-generation GTE model (mGTE-TRM) is named `NewModel`. The name `NewModel` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewModel"]}'` to specify the use of the `GteNewModel` architecture. - -If your model is not in the above list, we will try to automatically convert the model using -[as_embedding_model][vllm.model_executor.models.adapters.as_embedding_model]. By default, the embeddings -of the whole prompt are extracted from the normalized hidden state corresponding to the last token. 
- -#### Classification - -These models primarily support the [`LLM.classify`](./pooling_models.md#llmclassify) API. - -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | -| ------------ | ------ | ----------------- | -------------------- | ------------------------- | -| `ErnieForSequenceClassification` | BERT-like Chinese ERNIE | `Forrest20231206/ernie-3.0-base-zh-cls` | | | -| `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | | -| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | -| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | - -C Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion)) -\* Feature support is the same as that of the original model. - -If your model is not in the above list, we will try to automatically convert the model using -[as_seq_cls_model][vllm.model_executor.models.adapters.as_seq_cls_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. - -#### Cross-encoder / Reranker - -Cross-encoder and reranker models are a subset of classification models that accept two prompts as input. -These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API. - -| Architecture | Models | Example HF Models | Score template (see note) | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | -| ------------ | ------ | ----------------- | ------------------------- | --------------------------- | --------------------------------------- | -| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. 
| N/A | | | -| `ErnieForSequenceClassification` | BERT-like Chinese ERNIE | `Forrest20231206/ernie-3.0-base-zh-cls` | N/A | | | -| `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma`(see note), etc. | [bge-reranker-v2-gemma.jinja](../../examples/pooling/score/template/bge-reranker-v2-gemma.jinja) | ✅︎ | ✅︎ | -| `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | N/A | | | -| `LlamaBidirectionalForSequenceClassification`C | Llama-based with bidirectional attention | `nvidia/llama-nemotron-rerank-1b-v2`, etc. | [nemotron-rerank.jinja](../../examples/pooling/score/template/nemotron-rerank.jinja) | ✅︎ | ✅︎ | -| `Qwen2ForSequenceClassification`C | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2`(see note), etc. | [mxbai_rerank_v2.jinja](../../examples/pooling/score/template/mxbai_rerank_v2.jinja) | ✅︎ | ✅︎ | -| `Qwen3ForSequenceClassification`C | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B`(see note), etc. | [qwen3_reranker.jinja](../../examples/pooling/score/template/qwen3_reranker.jinja) | ✅︎ | ✅︎ | -| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | N/A | | | -| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | N/A | | | -| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | N/A | \* | \* | - -C Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion)) -\* Feature support is the same as that of the original model. - -!!! note - Some models require a specific prompt format to work correctly. 
- - You can find Example HF Models's corresponding score template in [examples/pooling/score/template/](../../examples/pooling/score/template) - - Examples : [examples/pooling/score/using_template_offline.py](../../examples/pooling/score/using_template_offline.py) [examples/pooling/score/using_template_online.py](../../examples/pooling/score/using_template_online.py) - -!!! note - Load the official original `BAAI/bge-reranker-v2-gemma` by using the following command. - - ```bash - vllm serve BAAI/bge-reranker-v2-gemma --hf_overrides '{"architectures": ["GemmaForSequenceClassification"],"classifier_from_token": ["Yes"],"method": "no_post_processing"}' - ``` - -!!! note - The second-generation GTE model (mGTE-TRM) is named `NewForSequenceClassification`. The name `NewForSequenceClassification` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewForSequenceClassification"]}'` to specify the use of the `GteNewForSequenceClassification` architecture. - -!!! note - Load the official original `mxbai-rerank-v2` by using the following command. - - ```bash - vllm serve mixedbread-ai/mxbai-rerank-base-v2 --hf_overrides '{"architectures": ["Qwen2ForSequenceClassification"],"classifier_from_token": ["0", "1"], "method": "from_2_way_softmax"}' - ``` - -!!! note - Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/pooling/score/qwen3_reranker_offline.py](../../examples/pooling/score/qwen3_reranker_offline.py) [examples/pooling/score/qwen3_reranker_online.py](../../examples/pooling/score/qwen3_reranker_online.py). - - ```bash - vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' - ``` - -#### Reward Modeling - -These models primarily support the [`LLM.reward`](./pooling_models.md#llmreward) API. 
- -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | -| ------------ | ------ | ----------------- | -------------------- | ------------------------- | -| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ | -| `LlamaForCausalLM` | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ | -| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ | -| `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B`, etc. | ✅︎ | ✅︎ | - -!!! important - For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, - e.g.: `--pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. - -#### Token Classification - -These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode) API. - -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | -| ------------ | ------ | ----------------- | --------------------------- | --------------------------------------- | -| `BertForTokenClassification` | bert-based | `boltuix/NeuroBERT-NER` (see note), etc. | | | -| `ErnieForTokenClassification` | BERT-like Chinese ERNIE | `gyr66/Ernie-3.0-base-chinese-finetuned-ner` | | | -| `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | | - -!!! note - Named Entity Recognition (NER) usage, please refer to [examples/pooling/token_classify/ner_offline.py](../../examples/pooling/token_classify/ner_offline.py), [examples/pooling/token_classify/ner_online.py](../../examples/pooling/token_classify/ner_online.py). 
- ## List of Multimodal Language Models The following modalities are supported depending on the model: @@ -816,57 +666,23 @@ Speech2Text models trained specifically for Automatic Speech Recognition. !!! note `VoxtralForConditionalGeneration` requires `mistral-common[audio]` to be installed. -### Pooling Models +## Pooling Models -See [this page](./pooling_models.md) for more information on how to use pooling models. +See [this page](pooling_models/README.md) for more information on how to use pooling models. -#### Embedding +!!! important + Since some model architectures support both generative and pooling tasks, + you should explicitly specify `--runner pooling` to ensure that the model is used in pooling mode instead of generative mode. -These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) API. +See the link below for more information on the models supported for specific pooling tasks. -!!! note - To get the best results, you should use pooling models that are specifically trained as such. - -The following table lists those that are tested in vLLM. - -| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | -| ------------ | ------ | ------ | ----------------- | -------------------- | ------------------------- | -| `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. 
| | | -| `ColModernVBertForRetrieval` | ColModernVBERT | T / I | `ModernVBERT/colmodernvbert-merged` | | | -| `ColPaliForRetrieval` | ColPali | T / I | `vidore/colpali-v1.3-hf` | | | -| `ColQwen3_5` | ColQwen3.5 | T + I + V | `athrael-soju/colqwen3.5-4.5B-v3` | | | -| `LlamaNemotronVLModel` | Llama Nemotron Embedding + SigLIP | T + I | `nvidia/llama-nemotron-embed-vl-1b-v2` | | | -| `LlavaNextForConditionalGeneration`C | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ | -| `Phi3VForCausalLM`C | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ | -| `Qwen3VLForConditionalGeneration`C | Qwen3-VL | T + I + V | `Qwen/Qwen3-VL-Embedding-2B`, etc. | ✅︎ | ✅︎ | -| `SiglipModel` | SigLIP, SigLIP2 | T / I | `google/siglip-base-patch16-224`, `google/siglip2-base-patch16-224` | | | -| `*ForConditionalGeneration`C, `*ForCausalLM`C, etc. | Generative models | \* | N/A | \* | \* | - -C Automatically converted into an embedding model via `--convert embed`. ([details](./pooling_models.md#model-conversion)) -\* Feature support is the same as that of the original model. - ---- - -#### Cross-encoder / Reranker - -Cross-encoder and reranker models are a subset of classification models that accept two prompts as input. -These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API. - -| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | -| ------------ | ------ | ------ | ----------------- | -------------------- | ------------------------- | -| `JinaVLForSequenceClassification` | JinaVL-based | T + IE+ | `jinaai/jina-reranker-m0`, etc. | ✅︎ | ✅︎ | -| `LlamaNemotronVLForSequenceClassification` | Llama Nemotron Reranker + SigLIP | T + IE+ | `nvidia/llama-nemotron-rerank-vl-1b-v2` | | | -| `Qwen3VLForSequenceClassification` | Qwen3-VL-Reranker | T + IE+ + VE+ | `Qwen/Qwen3-VL-Reranker-2B`(see note), etc. 
| ✅︎ | ✅︎ | - -C Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion)) -\* Feature support is the same as that of the original model. - -!!! note - Similar to Qwen3-Reranker, you need to use the following `--hf_overrides` to load the official original `Qwen3-VL-Reranker`. - - ```bash - vllm serve Qwen/Qwen3-VL-Reranker-2B --hf_overrides '{"architectures": ["Qwen3VLForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' - ``` +- [Classification Usages](pooling_models/classify.md) +- [Embedding Usages](pooling_models/embed.md) +- [Reward Usages](pooling_models/reward.md) +- [Token Classification Usages](pooling_models/token_classify.md) +- [Token Embedding Usages](pooling_models/token_embed.md) +- [Scoring Usages](pooling_models/scoring.md) +- [Specific Model Examples](pooling_models/specific_models.md) ## Model Support Policy diff --git a/docs/serving/offline_inference.md b/docs/serving/offline_inference.md index b3d211871..535bc2a62 100644 --- a/docs/serving/offline_inference.md +++ b/docs/serving/offline_inference.md @@ -16,7 +16,7 @@ After initializing the `LLM` instance, use the available APIs to perform model i The available APIs depend on the model type: - [Generative models](../models/generative_models.md) output logprobs which are sampled from to obtain the final output text. -- [Pooling models](../models/pooling_models.md) output their hidden states directly. +- [Pooling models](../models/pooling_models/README.md) output their hidden states directly. !!! 
info [API Reference](../api/README.md#offline-inference) diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index cf44a1bfe..157904aa8 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -53,8 +53,8 @@ We currently support the following OpenAI APIs: - Only applicable to [text generation models](../models/generative_models.md) with a [chat template](../serving/openai_compatible_server.md#chat-template). - *Note: `user` parameter is ignored.* - *Note:* Setting the `parallel_tool_calls` parameter to `false` ensures vLLM only returns zero or one tool call per request. Setting it to `true` (the default) allows returning more than one tool call per request. There is no guarantee more than one tool call will be returned if this is set to `true`, as that behavior is model dependent and not all models are designed to support parallel tool calls. -- [Embeddings API](#embeddings-api) (`/v1/embeddings`) - - Only applicable to [embedding models](../models/pooling_models.md). +- [Embeddings API](../models/pooling_models/embed.md#openai-compatible-embeddings-api) (`/v1/embeddings`) + - Only applicable to [embedding models](../models/pooling_models/embed.md). - [Transcriptions API](#transcriptions-api) (`/v1/audio/transcriptions`) - Only applicable to [Automatic Speech Recognition (ASR) models](../models/supported_models.md#transcription). - [Translation API](#translations-api) (`/v1/audio/translations`) @@ -66,20 +66,19 @@ In addition, we have the following custom APIs: - [Tokenizer API](#tokenizer-api) (`/tokenize`, `/detokenize`) - Applicable to any model with a tokenizer. -- [Pooling API](#pooling-api) (`/pooling`) - - Applicable to all [pooling models](../models/pooling_models.md). -- [Classification API](#classification-api) (`/classify`) - - Only applicable to [classification models](../models/pooling_models.md). 
-- [Score API](#score-api) (`/score`)
-  - Applicable to [embedding models and cross-encoder models](../models/pooling_models.md).
-- [Cohere Embed API](#cohere-embed-api) (`/v2/embed`)
+- [Pooling API](../models/pooling_models/README.md#pooling-api) (`/pooling`)
+  - Applicable to all [pooling models](../models/pooling_models/README.md).
+- [Classification API](../models/pooling_models/classify.md#classification-api) (`/classify`)
+  - Only applicable to [classification models](../models/pooling_models/classify.md).
+- [Cohere Embed API](../models/pooling_models/embed.md#cohere-embed-api) (`/v2/embed`)
   - Compatible with [Cohere's Embed API](https://docs.cohere.com/reference/embed)
-  - Works with any [embedding model](../models/pooling_models.md), including multimodal models.
-- [Re-rank API](#re-rank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`)
-  - Implements [Jina AI's v1 re-rank API](https://jina.ai/reranker/)
-  - Also compatible with [Cohere's v1 & v2 re-rank APIs](https://docs.cohere.com/v2/reference/rerank)
+  - Works with any [embedding model](../models/pooling_models/embed.md#supported-models), including multimodal models.
+- [Score API](../models/pooling_models/scoring.md#score-api) (`/score`)
+  - Applicable to [score models](../models/pooling_models/scoring.md).
+- [Rerank API](../models/pooling_models/scoring.md#rerank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`)
+  - Implements [Jina AI's v1 rerank API](https://jina.ai/reranker/)
+  - Also compatible with [Cohere's v1 & v2 rerank APIs](https://docs.cohere.com/v2/reference/rerank)
   - Jina and Cohere's APIs are very similar; Jina's includes extra information in the rerank endpoint's response.
-  - Only applicable to [cross-encoder models](../models/pooling_models.md).
## Chat Template @@ -269,300 +268,6 @@ The following extra parameters in the response object are supported: --8<-- "vllm/entrypoints/openai/responses/protocol.py:responses-response-extra-params" ``` -### Embeddings API - -Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings); -you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. - -Code example: [examples/pooling/embed/openai_embedding_client.py](../../examples/pooling/embed/openai_embedding_client.py) - -If the model has a [chat template](../serving/openai_compatible_server.md#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api)) -which will be treated as a single prompt to the model. Here is a convenience function for calling the API while retaining OpenAI's type annotations: - -??? code - - ```python - from openai import OpenAI - from openai._types import NOT_GIVEN, NotGiven - from openai.types.chat import ChatCompletionMessageParam - from openai.types.create_embedding_response import CreateEmbeddingResponse - - def create_chat_embeddings( - client: OpenAI, - *, - messages: list[ChatCompletionMessageParam], - model: str, - encoding_format: Union[Literal["base64", "float"], NotGiven] = NOT_GIVEN, - ) -> CreateEmbeddingResponse: - return client.post( - "/embeddings", - cast_to=CreateEmbeddingResponse, - body={"messages": messages, "model": model, "encoding_format": encoding_format}, - ) - ``` - -#### Multi-modal inputs - -You can pass multi-modal inputs to embedding models by defining a custom chat template for the server -and passing a list of `messages` in the request. Refer to the examples below for illustration. 
- -=== "VLM2Vec" - - To serve the model: - - ```bash - vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling \ - --trust-remote-code \ - --max-model-len 4096 \ - --chat-template examples/pooling/embed/template/vlm2vec_phi3v.jinja - ``` - - !!! important - Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--runner pooling` - to run this model in embedding mode instead of text generation mode. - - The custom chat template is completely different from the original one for this model, - and can be found here: [examples/pooling/embed/template/vlm2vec_phi3v.jinja](../../examples/pooling/embed/template/vlm2vec_phi3v.jinja) - - Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library: - - ??? code - - ```python - from openai import OpenAI - client = OpenAI( - base_url="http://localhost:8000/v1", - api_key="EMPTY", - ) - image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - - response = create_chat_embeddings( - client, - model="TIGER-Lab/VLM2Vec-Full", - messages=[ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": image_url}}, - {"type": "text", "text": "Represent the given image."}, - ], - } - ], - encoding_format="float", - ) - - print("Image embedding output:", response.data[0].embedding) - ``` - -=== "DSE-Qwen2-MRL" - - To serve the model: - - ```bash - vllm serve MrLight/dse-qwen2-2b-mrl-v1 --runner pooling \ - --trust-remote-code \ - --max-model-len 8192 \ - --chat-template examples/pooling/embed/template/dse_qwen2_vl.jinja - ``` - - !!! important - Like with VLM2Vec, we have to explicitly pass `--runner pooling`. 
- - Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled - by a custom chat template: [examples/pooling/embed/template/dse_qwen2_vl.jinja](../../examples/pooling/embed/template/dse_qwen2_vl.jinja) - - !!! important - `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code - example below for details. - -Full example: [examples/pooling/embed/vision_embedding_online.py](../../examples/pooling/embed/vision_embedding_online.py) - -#### Extra parameters - -The following [pooling parameters][vllm.PoolingParams] are supported. - -```python ---8<-- "vllm/pooling_params.py:common-pooling-params" ---8<-- "vllm/pooling_params.py:embed-pooling-params" -``` - -The following Embeddings API parameters are supported: - -??? code - - ```python - --8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params" - --8<-- "vllm/entrypoints/pooling/base/protocol.py:completion-params" - --8<-- "vllm/entrypoints/pooling/base/protocol.py:encoding-params" - --8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-params" - ``` - -The following extra parameters are supported: - -??? code - - ```python - --8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params" - --8<-- "vllm/entrypoints/pooling/base/protocol.py:completion-extra-params" - --8<-- "vllm/entrypoints/pooling/base/protocol.py:encoding-extra-params" - --8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-extra-params" - ``` - -For chat-like input (i.e. if `messages` is passed), the following parameters are supported: - -The following parameters are supported by default: - -??? 
code - - ```python - --8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params" - --8<-- "vllm/entrypoints/pooling/base/protocol.py:chat-params" - --8<-- "vllm/entrypoints/pooling/base/protocol.py:encoding-params" - --8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-params" - ``` - -these extra parameters are supported instead: - -??? code - - ```python - --8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params" - --8<-- "vllm/entrypoints/pooling/base/protocol.py:chat-extra-params" - --8<-- "vllm/entrypoints/pooling/base/protocol.py:encoding-extra-params" - --8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-extra-params" - ``` - -### Cohere Embed API - -Our API is also compatible with [Cohere's Embed v2 API](https://docs.cohere.com/reference/embed) which adds support for some modern embedding feature such as truncation, output dimensions, embedding types, and input types. This endpoint works with any embedding model (including multimodal models). 
- -#### Cohere Embed API request parameters - -| Parameter | Type | Required | Description | -| --------- | ---- | -------- | ----------- | -| `model` | string | Yes | Model name | -| `input_type` | string | No | Prompt prefix key (model-dependent, see below) | -| `texts` | list[string] | No | Text inputs (use one of `texts`, `images`, or `inputs`) | -| `images` | list[string] | No | Base64 data URI images | -| `inputs` | list[object] | No | Mixed text and image content objects | -| `embedding_types` | list[string] | No | Output types (default: `["float"]`) | -| `output_dimension` | int | No | Truncate embeddings to this dimension (Matryoshka) | -| `truncate` | string | No | `END`, `START`, or `NONE` (default: `END`) | - -#### Text embedding - -```bash -curl -X POST "http://localhost:8000/v2/embed" \ - -H "Content-Type: application/json" \ - -d '{ - "model": "Snowflake/snowflake-arctic-embed-m-v1.5", - "input_type": "query", - "texts": ["Hello world", "How are you?"], - "embedding_types": ["float"] - }' -``` - -??? console "Response" - - ```json - { - "id": "embd-...", - "embeddings": { - "float": [ - [0.012, -0.034, ...], - [0.056, 0.078, ...] - ] - }, - "texts": ["Hello world", "How are you?"], - "meta": { - "api_version": {"version": "2"}, - "billed_units": {"input_tokens": 12} - } - } - ``` - -#### Mixed text and image inputs - -For multimodal models, you can embed images by passing base64 data URIs. The `inputs` field accepts a list of objects with mixed text and image content: - -```bash -curl -X POST "http://localhost:8000/v2/embed" \ - -H "Content-Type: application/json" \ - -d '{ - "model": "google/siglip-so400m-patch14-384", - "inputs": [ - { - "content": [ - {"type": "text", "text": "A photo of a cat"}, - {"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBOR..."}} - ] - } - ], - "embedding_types": ["float"] - }' -``` - -#### Embedding types - -The `embedding_types` parameter controls the output format. 
Multiple types can be requested in a single call: - -| Type | Description | -| ---- | ----------- | -| `float` | Raw float32 embeddings (default) | -| `binary` | Bit-packed signed binary | -| `ubinary` | Bit-packed unsigned binary | -| `base64` | Little-endian float32 encoded as base64 | - -```bash -curl -X POST "http://localhost:8000/v2/embed" \ - -H "Content-Type: application/json" \ - -d '{ - "model": "Snowflake/snowflake-arctic-embed-m-v1.5", - "input_type": "query", - "texts": ["What is machine learning?"], - "embedding_types": ["float", "binary"] - }' -``` - -??? console "Response" - - ```json - { - "id": "embd-...", - "embeddings": { - "float": [[0.012, -0.034, ...]], - "binary": [[42, -117, ...]] - }, - "texts": ["What is machine learning?"], - "meta": { - "api_version": {"version": "2"}, - "billed_units": {"input_tokens": 8} - } - } - ``` - -#### Truncation - -The `truncate` parameter controls how inputs exceeding the model's maximum sequence length are handled: - -| Value | Behavior | -| ----- | --------- | -| `END` (default) | Keep the first tokens, drop the end | -| `START` | Keep the last tokens, drop the beginning | -| `NONE` | Return an error if the input is too long | - -#### Input type and prompt prefixes - -The `input_type` field selects a prompt prefix to prepend to each text input. The available values -depend on the model: - -- **Models with `task_instructions` in `config.json`**: The keys from the `task_instructions` dict are - the valid `input_type` values and the corresponding value is prepended to each text. -- **Models with `config_sentence_transformers.json` prompts**: The keys from the `prompts` dict are - the valid `input_type` values. For example, `Snowflake/snowflake-arctic-embed-xs` defines `"query"`, - so setting `input_type: "query"` prepends `"Represent this sentence for searching relevant passages: "`. -- **Other models**: `input_type` is not accepted and will raise a validation error if passed. 
- ### Transcriptions API Our Transcriptions API is compatible with [OpenAI's Transcriptions API](https://platform.openai.com/docs/api-reference/audio/createTranscription); @@ -759,172 +464,8 @@ It consists of two endpoints: - `/tokenize` corresponds to calling `tokenizer.encode()`. - `/detokenize` corresponds to calling `tokenizer.decode()`. -### Pooling API - -Our Pooling API encodes input prompts using a [pooling model](../models/pooling_models.md) and returns the corresponding hidden states. - -The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats. - -Code example: [examples/pooling/pooling/pooling_online.py](../../examples/pooling/pooling/pooling_online.py) - -### Classification API - -Our Classification API directly supports Hugging Face sequence-classification models such as [ai21labs/Jamba-tiny-reward-dev](https://huggingface.co/ai21labs/Jamba-tiny-reward-dev) and [jason9693/Qwen2.5-1.5B-apeach](https://huggingface.co/jason9693/Qwen2.5-1.5B-apeach). - -We automatically wrap any other transformer via `as_seq_cls_model()`, which pools on the last token, attaches a `RowParallelLinear` head, and applies a softmax to produce per-class probabilities. - -Code example: [examples/pooling/classify/classification_online.py](../../examples/pooling/classify/classification_online.py) - -#### Example Requests - -You can classify multiple texts by passing an array of strings: - -```bash -curl -v "http://127.0.0.1:8000/classify" \ - -H "Content-Type: application/json" \ - -d '{ - "model": "jason9693/Qwen2.5-1.5B-apeach", - "input": [ - "Loved the new café—coffee was great.", - "This update broke everything. Frustrating." - ] - }' -``` - -??? 
console "Response" - - ```json - { - "id": "classify-7c87cac407b749a6935d8c7ce2a8fba2", - "object": "list", - "created": 1745383065, - "model": "jason9693/Qwen2.5-1.5B-apeach", - "data": [ - { - "index": 0, - "label": "Default", - "probs": [ - 0.565970778465271, - 0.4340292513370514 - ], - "num_classes": 2 - }, - { - "index": 1, - "label": "Spoiled", - "probs": [ - 0.26448777318000793, - 0.7355121970176697 - ], - "num_classes": 2 - } - ], - "usage": { - "prompt_tokens": 20, - "total_tokens": 20, - "completion_tokens": 0, - "prompt_tokens_details": null - } - } - ``` - -You can also pass a string directly to the `input` field: - -```bash -curl -v "http://127.0.0.1:8000/classify" \ - -H "Content-Type: application/json" \ - -d '{ - "model": "jason9693/Qwen2.5-1.5B-apeach", - "input": "Loved the new café—coffee was great." - }' -``` - -??? console "Response" - - ```json - { - "id": "classify-9bf17f2847b046c7b2d5495f4b4f9682", - "object": "list", - "created": 1745383213, - "model": "jason9693/Qwen2.5-1.5B-apeach", - "data": [ - { - "index": 0, - "label": "Default", - "probs": [ - 0.565970778465271, - 0.4340292513370514 - ], - "num_classes": 2 - } - ], - "usage": { - "prompt_tokens": 10, - "total_tokens": 10, - "completion_tokens": 0, - "prompt_tokens_details": null - } - } - ``` - -#### Extra parameters - -The following [pooling parameters][vllm.PoolingParams] are supported. - -```python ---8<-- "vllm/pooling_params.py:common-pooling-params" ---8<-- "vllm/pooling_params.py:classify-pooling-params" -``` - -The following Classification API parameters are supported: - -??? code - - ```python - --8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params" - --8<-- "vllm/entrypoints/pooling/base/protocol.py:completion-params" - --8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-params" - ``` - -The following extra parameters are supported: - -??? 
code - - ```python - --8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params" - --8<-- "vllm/entrypoints/pooling/base/protocol.py:completion-extra-params" - --8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params" - ``` - -For chat-like input (i.e. if `messages` is passed), the following parameters are supported: - -??? code - - ```python - --8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params" - --8<-- "vllm/entrypoints/pooling/base/protocol.py:chat-params" - --8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-params" - ``` - -these extra parameters are supported instead: - -??? code - - ```python - --8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params" - --8<-- "vllm/entrypoints/pooling/base/protocol.py:chat-extra-params" - --8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params" - ``` - ### Score API -Our Score API can apply a cross-encoder model or an embedding model to predict scores for sentence or multimodal pairs. When using an embedding model the score corresponds to the cosine similarity between each embedding pair. -Usually, the score for a sentence pair refers to the similarity between two sentences, on a scale of 0 to 1. - -You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). - -Code example: [examples/pooling/score/score_api_online.py](../../examples/pooling/score/score_api_online.py) - #### Score Template Some scoring models require a specific prompt format to work correctly. You can specify a custom score template using the `--chat-template` parameter (see [Chat Template](#chat-template)). 
@@ -940,307 +481,6 @@ This approach is more robust than index-based access (`messages[0]`, `messages[1 Example template file: [examples/pooling/score/template/nemotron-rerank.jinja](../../examples/pooling/score/template/nemotron-rerank.jinja) -#### Single inference - -You can pass a string to both `queries` and `documents`, forming a single sentence pair. - -```bash -curl -X 'POST' \ - 'http://127.0.0.1:8000/score' \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -d '{ - "model": "BAAI/bge-reranker-v2-m3", - "encoding_format": "float", - "queries": "What is the capital of France?", - "documents": "The capital of France is Paris." -}' -``` - -??? console "Response" - - ```json - { - "id": "score-request-id", - "object": "list", - "created": 693447, - "model": "BAAI/bge-reranker-v2-m3", - "data": [ - { - "index": 0, - "object": "score", - "score": 1 - } - ], - "usage": {} - } - ``` - -#### Batch inference - -You can pass a string to `queries` and a list to `documents`, forming multiple sentence pairs -where each pair is built from `queries` and a string in `documents`. -The total number of pairs is `len(documents)`. - -??? console "Request" - - ```bash - curl -X 'POST' \ - 'http://127.0.0.1:8000/score' \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -d '{ - "model": "BAAI/bge-reranker-v2-m3", - "queries": "What is the capital of France?", - "documents": [ - "The capital of Brazil is Brasilia.", - "The capital of France is Paris." - ] - }' - ``` - -??? 
console "Response" - - ```json - { - "id": "score-request-id", - "object": "list", - "created": 693570, - "model": "BAAI/bge-reranker-v2-m3", - "data": [ - { - "index": 0, - "object": "score", - "score": 0.001094818115234375 - }, - { - "index": 1, - "object": "score", - "score": 1 - } - ], - "usage": {} - } - ``` - -You can pass a list to both `queries` and `documents`, forming multiple sentence pairs -where each pair is built from a string in `queries` and the corresponding string in `documents` (similar to `zip()`). -The total number of pairs is `len(documents)`. - -??? console "Request" - - ```bash - curl -X 'POST' \ - 'http://127.0.0.1:8000/score' \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -d '{ - "model": "BAAI/bge-reranker-v2-m3", - "encoding_format": "float", - "queries": [ - "What is the capital of Brazil?", - "What is the capital of France?" - ], - "documents": [ - "The capital of Brazil is Brasilia.", - "The capital of France is Paris." - ] - }' - ``` - -??? console "Response" - - ```json - { - "id": "score-request-id", - "object": "list", - "created": 693447, - "model": "BAAI/bge-reranker-v2-m3", - "data": [ - { - "index": 0, - "object": "score", - "score": 1 - }, - { - "index": 1, - "object": "score", - "score": 1 - } - ], - "usage": {} - } - ``` - -#### Multi-modal inputs - -You can pass multi-modal inputs to scoring models by passing `content` including a list of multi-modal input (image, etc.) in the request. Refer to the examples below for illustration. - -=== "JinaVL-Reranker" - - To serve the model: - - ```bash - vllm serve jinaai/jina-reranker-m0 - ``` - - Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library: - - ??? 
Code - - ```python - import requests - - response = requests.post( - "http://localhost:8000/v1/score", - json={ - "model": "jinaai/jina-reranker-m0", - "queries": "slm markdown", - "documents": [ - { - "content": [ - { - "type": "image_url", - "image_url": { - "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png" - }, - } - ], - }, - { - "content": [ - { - "type": "image_url", - "image_url": { - "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png" - }, - } - ] - }, - ], - }, - ) - response.raise_for_status() - response_json = response.json() - print("Scoring output:", response_json["data"][0]["score"]) - print("Scoring output:", response_json["data"][1]["score"]) - ``` -Full example: - -- [examples/pooling/score/vision_score_api_online.py](../../examples/pooling/score/vision_score_api_online.py) -- [examples/pooling/score/vision_rerank_api_online.py](../../examples/pooling/score/vision_rerank_api_online.py) - -#### Extra parameters - -The following [pooling parameters][vllm.PoolingParams] are supported. - -```python ---8<-- "vllm/pooling_params.py:common-pooling-params" ---8<-- "vllm/pooling_params.py:classify-pooling-params" -``` - -The following Score API parameters are supported: - -```python ---8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params" -``` - -The following extra parameters are supported: - -```python ---8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params" ---8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params" -``` - -### Re-rank API - -Our Re-rank API can apply an embedding model or a cross-encoder model to predict relevant scores between a single query, and -each of a list of documents. Usually, the score for a sentence pair refers to the similarity between two sentences or multi-modal inputs (image, etc.), on a scale of 0 to 1. 
- -You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). - -The rerank endpoints support popular re-rank models such as `BAAI/bge-reranker-base` and other models supporting the -`score` task. Additionally, `/rerank`, `/v1/rerank`, and `/v2/rerank` -endpoints are compatible with both [Jina AI's re-rank API interface](https://jina.ai/reranker/) and -[Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with -popular open-source tools. - -Code example: [examples/pooling/score/rerank_api_online.py](../../examples/pooling/score/rerank_api_online.py) - -#### Example Request - -Note that the `top_n` request parameter is optional and will default to the length of the `documents` field. -Result documents will be sorted by relevance, and the `index` property can be used to determine original order. - -??? console "Request" - - ```bash - curl -X 'POST' \ - 'http://127.0.0.1:8000/v1/rerank' \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -d '{ - "model": "BAAI/bge-reranker-base", - "query": "What is the capital of France?", - "documents": [ - "The capital of Brazil is Brasilia.", - "The capital of France is Paris.", - "Horses and cows are both animals" - ] - }' - ``` - -??? console "Response" - - ```json - { - "id": "rerank-fae51b2b664d4ed38f5969b612edff77", - "model": "BAAI/bge-reranker-base", - "usage": { - "total_tokens": 56 - }, - "results": [ - { - "index": 1, - "document": { - "text": "The capital of France is Paris." - }, - "relevance_score": 0.99853515625 - }, - { - "index": 0, - "document": { - "text": "The capital of Brazil is Brasilia." - }, - "relevance_score": 0.0005860328674316406 - } - ] - } - ``` - -#### Extra parameters - -The following [pooling parameters][vllm.PoolingParams] are supported. 
- -```python ---8<-- "vllm/pooling_params.py:common-pooling-params" ---8<-- "vllm/pooling_params.py:classify-pooling-params" -``` - -The following Re-rank API parameters are supported: - -```python ---8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params" ---8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params" -``` - -The following extra parameters are supported: - -```python ---8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params" ---8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params" -``` - ## Ray Serve LLM Ray Serve LLM enables scalable, production-grade serving of the vLLM engine. It integrates tightly with vLLM and extends it with features such as auto-scaling, load balancing, and back-pressure.