Migrate docs from Sphinx to MkDocs (#18145)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
@@ -1,8 +0,0 @@
.vertical-table-header th.head:not(.stub) {
    writing-mode: sideways-lr;
    white-space: nowrap;
    max-width: 0;
    p {
        margin: 0;
    }
}
@@ -1,38 +0,0 @@
// Add RunLLM widget
document.addEventListener("DOMContentLoaded", function () {
  var script = document.createElement("script");
  script.type = "module";
  script.id = "runllm-widget-script";

  script.src = "https://widget.runllm.com";

  script.setAttribute("version", "stable");
  script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget.
  script.setAttribute("runllm-name", "vLLM");
  script.setAttribute("runllm-position", "BOTTOM_RIGHT");
  script.setAttribute("runllm-position-y", "120px");
  script.setAttribute("runllm-position-x", "20px");
  script.setAttribute("runllm-assistant-id", "207");

  script.async = true;
  document.head.appendChild(script);
});

// Update URL search params when tab is clicked
document.addEventListener("DOMContentLoaded", function () {
  const tabs = document.querySelectorAll(".sd-tab-label");

  function updateURL(tab) {
    const syncGroup = tab.getAttribute("data-sync-group");
    const syncId = tab.getAttribute("data-sync-id");
    if (syncGroup && syncId) {
      const url = new URL(window.location);
      url.searchParams.set(syncGroup, syncId);
      window.history.replaceState(null, "", url);
    }
  }

  tabs.forEach(tab => {
    tab.addEventListener("click", () => updateURL(tab));
  });
});
@@ -1,39 +0,0 @@
<style>
    .notification-bar {
        width: 100vw;
        display: flex;
        justify-content: center;
        align-items: center;
        font-size: 16px;
        padding: 0 6px 0 6px;
    }
    .notification-bar p {
        margin: 0;
    }
    .notification-bar a {
        font-weight: bold;
        text-decoration: none;
    }

    /* Light mode styles (default) */
    .notification-bar {
        background-color: #fff3cd;
        color: #856404;
    }
    .notification-bar a {
        color: #d97706;
    }

    /* Dark mode styles */
    html[data-theme=dark] .notification-bar {
        background-color: #333;
        color: #ddd;
    }
    html[data-theme=dark] .notification-bar a {
        color: #ffa500; /* Brighter color for visibility */
    }
</style>

<div class="notification-bar">
    <p>You are viewing the latest developer preview docs. <a href="https://docs.vllm.ai/en/stable/">Click here</a> to view docs for the latest stable release.</p>
</div>
@@ -1,133 +0,0 @@
# Summary

(configuration)=

## Configuration

API documentation for vLLM's configuration classes.

```{autodoc2-summary}
vllm.config.ModelConfig
vllm.config.CacheConfig
vllm.config.TokenizerPoolConfig
vllm.config.LoadConfig
vllm.config.ParallelConfig
vllm.config.SchedulerConfig
vllm.config.DeviceConfig
vllm.config.SpeculativeConfig
vllm.config.LoRAConfig
vllm.config.PromptAdapterConfig
vllm.config.MultiModalConfig
vllm.config.PoolerConfig
vllm.config.DecodingConfig
vllm.config.ObservabilityConfig
vllm.config.KVTransferConfig
vllm.config.CompilationConfig
vllm.config.VllmConfig
```

(offline-inference-api)=

## Offline Inference

LLM Class.

```{autodoc2-summary}
vllm.LLM
```

LLM Inputs.

```{autodoc2-summary}
vllm.inputs.PromptType
vllm.inputs.TextPrompt
vllm.inputs.TokensPrompt
```

## vLLM Engines

Engine classes for offline and online inference.

```{autodoc2-summary}
vllm.LLMEngine
vllm.AsyncLLMEngine
```

## Inference Parameters

Inference parameters for vLLM APIs.

(sampling-params)=
(pooling-params)=

```{autodoc2-summary}
vllm.SamplingParams
vllm.PoolingParams
```

(multi-modality)=

## Multi-Modality

vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package.

Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models)
via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`.
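
As a quick, hedged illustration of this field (the model name and prompt template below are placeholders, not part of this API reference):

```python
from PIL import Image

from vllm import LLM

# Any multi-modal model supported by vLLM works similarly.
llm = LLM(model="llava-hf/llava-1.5-7b-hf")

outputs = llm.generate({
    "prompt": "USER: <image>\nWhat is shown in this image? ASSISTANT:",
    # The image is attached to the prompt via `multi_modal_data`.
    "multi_modal_data": {"image": Image.open("example.jpg")},
})
```
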
Looking to add your own multi-modal model? Please follow the instructions listed [here](#supports-multimodal).

```{autodoc2-summary}
vllm.multimodal.MULTIMODAL_REGISTRY
```

### Inputs

User-facing inputs.

```{autodoc2-summary}
vllm.multimodal.inputs.MultiModalDataDict
```

Internal data structures.

```{autodoc2-summary}
vllm.multimodal.inputs.PlaceholderRange
vllm.multimodal.inputs.NestedTensors
vllm.multimodal.inputs.MultiModalFieldElem
vllm.multimodal.inputs.MultiModalFieldConfig
vllm.multimodal.inputs.MultiModalKwargsItem
vllm.multimodal.inputs.MultiModalKwargs
vllm.multimodal.inputs.MultiModalInputs
```

### Data Parsing

```{autodoc2-summary}
vllm.multimodal.parse
```

### Data Processing

```{autodoc2-summary}
vllm.multimodal.processing
```

### Memory Profiling

```{autodoc2-summary}
vllm.multimodal.profiling
```

### Registry

```{autodoc2-summary}
vllm.multimodal.registry
```

## Model Development

```{autodoc2-summary}
vllm.model_executor.models.interfaces_base
vllm.model_executor.models.interfaces
vllm.model_executor.models.adapters
```
(39 binary image assets deleted; the diff viewer reports only their sizes, 17 KiB–968 KiB, not their filenames.)
@@ -1,21 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
from docutils import nodes
from myst_parser.parsers.sphinx_ import MystParser
from sphinx.ext.napoleon import docstring


class NapoleonParser(MystParser):

    def parse(self, input_string: str, document: nodes.document) -> None:
        # Get the Sphinx configuration
        config = document.settings.env.config

        parsed_content = str(
            docstring.GoogleDocstring(
                str(docstring.NumpyDocstring(input_string, config)),
                config,
            ))
        return super().parse(parsed_content, document)


Parser = NapoleonParser
@@ -1,3 +0,0 @@
# vLLM Blog

vLLM blog posts are published [here](https://blog.vllm.ai/).
@@ -1,22 +0,0 @@
(meetups)=

# vLLM Meetups

We host regular meetups in the San Francisco Bay Area every two months. We share project updates from the vLLM team and invite guest speakers from the industry to share their experience and insights. Please find the materials from our previous meetups below:

- [NYC vLLM Meetup](https://lu.ma/c1rqyf1f), May 7th, 2025. [[Slides]](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing)
- [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day), April 3rd 2025. [[Slides]](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
- [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama), March 27th 2025. [[Slides]](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
- [The first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg), March 16th 2025. [[Slides]](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
- [The East Coast vLLM Meetup](https://lu.ma/7mu4k4xx), March 11th 2025. [[Slides]](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0)
- [The ninth vLLM meetup](https://lu.ma/h7g3kuj9), with Meta, February 27th 2025. [[Slides]](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing)
- [The eighth vLLM meetup](https://lu.ma/zep56hui), with Google Cloud, January 22nd 2025. [[Slides]](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing)
- [The seventh vLLM meetup](https://lu.ma/h0qvrajz), with Snowflake, November 14th 2024. [[Slides]](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing)
- [The sixth vLLM meetup](https://lu.ma/87q3nvnh), with NVIDIA, September 9th 2024. [[Slides]](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing)
- [The fifth vLLM meetup](https://lu.ma/lp0gyjqr), with AWS, July 24th 2024. [[Slides]](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing)
- [The fourth vLLM meetup](https://lu.ma/agivllm), with Cloudflare and BentoML, June 11th 2024. [[Slides]](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing)
- [The third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/), with Roblox, April 2nd 2024. [[Slides]](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing)
- [The second vLLM meetup](https://lu.ma/ygxbpzhl), with IBM Research, January 31st 2024. [[Slides]](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing) [[Video (vLLM Update)]](https://youtu.be/Y0C-DUvEnZQ) [[Video (IBM Research & torch.compile)]](https://youtu.be/m0dMtFLI-dg)
- [The first vLLM meetup](https://lu.ma/first-vllm-meetup), with a16z, October 5th 2023. [[Slides]](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing)

We are always looking for speakers and sponsors in the San Francisco Bay Area and potentially other locations. If you are interested in speaking or sponsoring, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu).
@@ -1,39 +0,0 @@
# Sponsors

vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!

<!-- Note: Please sort them in alphabetical order. -->
<!-- Note: Please keep these consistent with README.md. -->

Cash Donations:

- a16z
- Dropbox
- Sequoia Capital
- Skywork AI
- ZhenFund

Compute Resources:

- AMD
- Anyscale
- AWS
- Crusoe Cloud
- Databricks
- DeepInfra
- Google Cloud
- Intel
- Lambda Lab
- Nebius
- Novita AI
- NVIDIA
- Replicate
- Roblox
- RunPod
- Trainy
- UC Berkeley
- UC San Diego

Slack Sponsor: Anyscale

We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
@@ -1,263 +0,0 @@
# SPDX-License-Identifier: Apache-2.0

# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.

import datetime
import logging
import os
import re
import sys
from pathlib import Path

import requests

logger = logging.getLogger(__name__)
REPO_ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.append(os.path.abspath(REPO_ROOT))

# -- Project information -----------------------------------------------------

project = 'vLLM'
copyright = f'{datetime.datetime.now().year}, vLLM Team'
author = 'the vLLM Team'

# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    "sphinx.ext.napoleon",
    "sphinx.ext.linkcode",
    "sphinx.ext.intersphinx",
    "sphinx_copybutton",
    "autodoc2",
    "myst_parser",
    "sphinxarg.ext",
    "sphinx_design",
    "sphinx_togglebutton",
]
myst_enable_extensions = [
    "colon_fence",
    "fieldlist",
]
autodoc2_packages = [
    {
        "path": "../../vllm",
        "exclude_dirs": ["__pycache__", "third_party"],
    },
]
autodoc2_output_dir = "api"
autodoc2_render_plugin = "myst"
autodoc2_hidden_objects = ["dunder", "private", "inherited"]
autodoc2_sort_names = True
autodoc2_index_template = None

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns: list[str] = ["**/*.template.md", "**/*.inc.md"]

# Exclude the prompt "$" when copying code
copybutton_prompt_text = r"\$ "
copybutton_prompt_is_regexp = True

# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_title = project
html_theme = 'sphinx_book_theme'
html_logo = 'assets/logos/vllm-logo-text-light.png'
html_favicon = 'assets/logos/vllm-logo-only-light.ico'
html_theme_options = {
    'path_to_docs': 'docs/source',
    'repository_url': 'https://github.com/vllm-project/vllm',
    'use_repository_button': True,
    'use_edit_page_button': True,
    # Prevents the full API being added to the left sidebar of every page.
    # Reduces build time by 2.5x and reduces build size from ~225MB to ~95MB.
    'collapse_navbar': True,
    # Makes API visible in the right sidebar on API reference pages.
    'show_toc_level': 3,
}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]
html_js_files = ["custom.js"]
html_css_files = ["custom.css"]

myst_heading_anchors = 2
myst_url_schemes = {
    'http': None,
    'https': None,
    'mailto': None,
    'ftp': None,
    "gh-issue": {
        "url":
        "https://github.com/vllm-project/vllm/issues/{{path}}#{{fragment}}",
        "title": "Issue #{{path}}",
        "classes": ["github"],
    },
    "gh-pr": {
        "url":
        "https://github.com/vllm-project/vllm/pull/{{path}}#{{fragment}}",
        "title": "Pull Request #{{path}}",
        "classes": ["github"],
    },
    "gh-project": {
        "url": "https://github.com/orgs/vllm-project/projects/{{path}}",
        "title": "Project #{{path}}",
        "classes": ["github"],
    },
    "gh-dir": {
        "url": "https://github.com/vllm-project/vllm/tree/main/{{path}}",
        "title": "{{path}}",
        "classes": ["github"],
    },
    "gh-file": {
        "url": "https://github.com/vllm-project/vllm/blob/main/{{path}}",
        "title": "{{path}}",
        "classes": ["github"],
    },
}

# see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa
READTHEDOCS_VERSION_TYPE = os.environ.get('READTHEDOCS_VERSION_TYPE')
if READTHEDOCS_VERSION_TYPE == "tag":
    # remove the warning banner if the version is a tagged release
    header_file = os.path.join(os.path.dirname(__file__),
                               "_templates/sections/header.html")
    # The file might be removed already if the build is triggered multiple times
    # (readthedocs build both HTML and PDF versions separately)
    if os.path.exists(header_file):
        os.remove(header_file)


# Generate additional rst documentation here.
def setup(app):
    from docs.source.generate_examples import generate_examples
    generate_examples()


_cached_base: str = ""
_cached_branch: str = ""


def get_repo_base_and_branch(pr_number):
    global _cached_base, _cached_branch
    if _cached_base and _cached_branch:
        return _cached_base, _cached_branch

    url = f"https://api.github.com/repos/vllm-project/vllm/pulls/{pr_number}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        _cached_base = data['head']['repo']['full_name']
        _cached_branch = data['head']['ref']
        return _cached_base, _cached_branch
    else:
        logger.error("Failed to fetch PR details: %s", response)
        return None, None


def linkcode_resolve(domain, info):
    if domain != 'py':
        return None
    if not info['module']:
        return None

    # Get path from module name
    file = Path(f"{info['module'].replace('.', '/')}.py")
    path = REPO_ROOT / file
    if not path.exists():
        path = REPO_ROOT / file.with_suffix("") / "__init__.py"
        if not path.exists():
            return None

    # Get the line number of the object
    with open(path) as f:
        lines = f.readlines()
    name = info['fullname'].split(".")[-1]
    pattern = fr"^( {{4}})*((def|class) )?{name}\b.*"
    for lineno, line in enumerate(lines, 1):
        if not line or line.startswith("#"):
            continue
        if re.match(pattern, line):
            break

    # If the line number is not found, return None
    if lineno == len(lines):
        return None

    # If the line number is found, create the URL
    filename = path.relative_to(REPO_ROOT)
    if "checkouts" in path.parts:
        # a PR build on readthedocs
        pr_number = REPO_ROOT.name
        base, branch = get_repo_base_and_branch(pr_number)
        if base and branch:
            return f"https://github.com/{base}/blob/{branch}/{filename}#L{lineno}"
    # Otherwise, link to the source file on the main branch
    return f"https://github.com/vllm-project/vllm/blob/main/{filename}#L{lineno}"


# Mock out external dependencies here, otherwise sphinx-argparse won't work.
autodoc_mock_imports = [
    "huggingface_hub",
    "pydantic",
    "zmq",
    "cloudpickle",
    "aiohttp",
    "starlette",
    "blake3",
    "cpuinfo",
    "transformers",
    "psutil",
    "vllm._C",
    "PIL",
    "numpy",
    "tqdm",
    # The mocks below are required by
    # docs/source/serving/openai_compatible_server.md's
    # vllm.entrypoints.openai.cli_args
    "openai",
    "fastapi",
    "partial_json_parser",
]

for mock_target in autodoc_mock_imports:
    if mock_target in sys.modules:
        logger.info(
            "Potentially problematic mock target (%s) found; "
            "autodoc_mock_imports cannot mock modules that have already "
            "been loaded into sys.modules when the sphinx build starts.",
            mock_target)

intersphinx_mapping = {
    "python": ("https://docs.python.org/3", None),
    "typing_extensions":
    ("https://typing-extensions.readthedocs.io/en/latest", None),
    "aiohttp": ("https://docs.aiohttp.org/en/stable", None),
    "pillow": ("https://pillow.readthedocs.io/en/stable", None),
    "numpy": ("https://numpy.org/doc/stable", None),
    "torch": ("https://pytorch.org/docs/stable", None),
    "psutil": ("https://psutil.readthedocs.io/en/stable", None),
}

navigation_with_keys = False
@@ -1,87 +0,0 @@
# Deprecation Policy

This document outlines the official policy and process for deprecating features
in the vLLM project.

## Overview

vLLM uses a structured "deprecation pipeline" to guide the lifecycle of
deprecated features. This policy ensures that users are given clear and
sufficient notice when a feature is deprecated and that deprecations proceed in
a consistent and predictable manner.

We aim to strike a balance between continued innovation and respecting users’
reliance on existing functionality. Deprecations are tied to our **minor (Y)
releases** following semantic versioning (X.Y.Z), where:

- **X** is a major version (rare)
- **Y** is a minor version (used for significant changes, including deprecations/removals)
- **Z** is a patch version (used for fixes and safer enhancements)

Features that fall under this policy include (at a minimum) the following:

- CLI flags
- Environment variables
- Configuration files
- APIs in the OpenAI-compatible API server
- Public Python APIs for the `vllm` library

## Deprecation Pipeline

The deprecation process consists of several clearly defined stages that span
multiple Y releases:

**1. Deprecated (Still On By Default)**

- **Action**: Feature is marked as deprecated.
- **Timeline**: A removal version is explicitly stated in the deprecation
  warning (e.g., "This will be removed in v0.10.0").
- **Communication**: Deprecation is noted in the following, as applicable:
  - Help strings
  - Log output
  - API responses
  - `/metrics` output (for metrics features)
  - User-facing documentation
  - Release notes
  - GitHub Issue (RFC) for feedback
  - Documentation and use of the `@typing_extensions.deprecated` decorator for Python APIs (see the sketch below)
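
A minimal, hedged sketch of that last point; the class and method names below are purely illustrative and are not part of the vLLM API:

```python
from typing_extensions import deprecated


class LLM:

    @deprecated("legacy_generate is deprecated and will be removed in v0.10.0; "
                "use generate() instead.")
    def legacy_generate(self, prompts):
        # Keep the old entry point working while the deprecation warning is shown.
        return self.generate(prompts)

    def generate(self, prompts):
        ...
```
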
**2. Deprecated (Off By Default)**

- **Action**: Feature is disabled by default, but can still be re-enabled via a
  CLI flag or environment variable. Feature throws an error when used without
  re-enabling.
- **Purpose**: Allows users who missed earlier warnings a temporary escape hatch
  while signaling imminent removal. Ensures any remaining usage is clearly
  surfaced and blocks silent breakage before full removal.

**3. Removed**

- **Action**: Feature is completely removed from the codebase.
- **Note**: Only features that have passed through the previous deprecation
  stages will be removed.

## Example Timeline

Assume a feature is deprecated in `v0.9.0`.

| Release   | Status                                                                                           |
|-----------|--------------------------------------------------------------------------------------------------|
| `v0.9.0`  | Feature is deprecated with clear removal version listed.                                          |
| `v0.10.0` | Feature is now off by default, throws an error when used, and can be re-enabled for legacy use.   |
| `v0.11.0` | Feature is removed.                                                                                |

## Important Guidelines

- **No Removals in Patch Releases**: Removing deprecated features in patch
  (`.Z`) releases is disallowed to avoid surprising users.
- **Grace Period for Existing Deprecations**: Any feature deprecated **before
  this policy** will have its grace period start **now**, not retroactively.
- **Documentation is Critical**: Ensure every stage of the pipeline is
  documented clearly for users.

## Final Notes

This policy is a living document and may evolve as the needs of the project and
its users change. Community feedback is welcome and encouraged as we refine the
process.
@@ -1,50 +0,0 @@
# Dockerfile

We provide a <gh-file:docker/Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
More information about deploying with Docker can be found [here](#deployment-docker).

Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:

- All build stages
- The default build target (highlighted in grey)
- External images (with dashed borders)

The edges of the build graph represent:

- `FROM ...` dependencies (with a solid line and a full arrow head)

- `COPY --from=...` dependencies (with a dashed line and an empty arrow head)

- `RUN --mount=(.\*)from=...` dependencies (with a dotted line and an empty diamond arrow head)

> :::{figure} /assets/contributing/dockerfile-stages-dependency.png
> :align: center
> :alt: query
> :width: 100%
> :::
>
> Made using: <https://github.com/patrickhoefler/dockerfilegraph>
>
> Commands to regenerate the build graph (make sure to run it **from the \`root\` directory of the vLLM repository** where the Dockerfile is present):
>
> ```bash
> dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename docker/Dockerfile
> ```
>
> or in case you want to run it directly with the docker image:
>
> ```bash
> docker run \
>    --rm \
>    --user "$(id -u):$(id -g)" \
>    --workdir /workspace \
>    --volume "$(pwd)":/workspace \
>    ghcr.io/patrickhoefler/dockerfilegraph:alpine \
>    --output png \
>    --dpi 200 \
>    --max-label-length 50 \
>    --filename docker/Dockerfile \
>    --legend
> ```
>
> (To run it for a different file, you can pass in a different argument to the flag `--filename`.)
@@ -1,123 +0,0 @@
(new-model-basic)=

# Implementing a Basic Model

This guide walks you through the steps to implement a basic vLLM model.

## 1. Bring your model code

First, clone the PyTorch model code from the source repository.
For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from
HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.

:::{warning}
Make sure to review and adhere to the original code's copyright and licensing terms!
:::

## 2. Make your code compatible with vLLM

To ensure compatibility with vLLM, your model must meet the following requirements:

### Initialization Code

All vLLM modules within the model must include a `prefix` argument in their constructor. This `prefix` is typically the full name of the module in the model's state dictionary and is crucial for:

- Runtime support: vLLM's attention operators are registered in a model's state by their full names. Each attention operator must have a unique prefix as its layer name to avoid conflicts.
- Non-uniform quantization support: A quantized checkpoint can selectively quantize certain layers while keeping others in full precision. By providing the `prefix` during initialization, vLLM can match the current layer's `prefix` with the quantization configuration to determine if the layer should be initialized in quantized mode.

The initialization code should look like this:

```python
from torch import nn
from vllm.config import VllmConfig
from vllm.attention import Attention

class MyAttention(nn.Module):
    def __init__(self, vllm_config: VllmConfig, prefix: str):
        super().__init__()
        self.attn = Attention(prefix=f"{prefix}.attn")

class MyDecoderLayer(nn.Module):
    def __init__(self, vllm_config: VllmConfig, prefix: str):
        super().__init__()
        self.self_attn = MyAttention(prefix=f"{prefix}.self_attn")

class MyModel(nn.Module):
    def __init__(self, vllm_config: VllmConfig, prefix: str):
        super().__init__()
        self.layers = nn.ModuleList(
            [MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)]
        )

class MyModelForCausalLM(nn.Module):
    def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        self.model = MyModel(vllm_config, prefix=f"{prefix}.model")
```

### Computation Code

- Add a `get_input_embeddings` method inside `MyModel` module that returns the text embeddings given `input_ids`. This is equivalent to directly calling the text embedding layer, but provides a unified interface in case `MyModel` is used within a composite multimodal model.

  ```python
  class MyModel(nn.Module):
      ...

      def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
          ...
  ```

- Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension.

  ```python
  def forward(
      self,
      input_ids: torch.Tensor,
      positions: torch.Tensor,
  ) -> torch.Tensor:
      ...
  ```

:::{note}
Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
:::

For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out <gh-dir:vllm/model_executor/models> for more examples.

## 3. (Optional) Implement tensor parallelism and quantization support

If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it.
To do this, substitute your model's linear and embedding layers with their tensor-parallel versions.
For the embedding layer, you can simply replace {class}`torch.nn.Embedding` with `VocabParallelEmbedding`. For the output LM head, you can use `ParallelLMHead`.
When it comes to the linear layers, we provide the following options to parallelize them:

- `ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving.
- `RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer.
- `ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer.
- `MergedColumnParallelLinear`: Column-parallel linear that merges multiple `ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices.
- `QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When the number of key/value heads is less than the world size, this class replicates the key/value heads properly. This class handles the weight loading and replication of the weight matrices.

Note that all the linear layers above take `linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization.
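
As a rough, hedged sketch of how these layers might be combined in a gated MLP block (constructor signatures can differ between vLLM versions, so treat this as an illustration rather than the reference implementation):

```python
from torch import nn

from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
                                               RowParallelLinear)


class MyMLP(nn.Module):

    def __init__(self, hidden_size: int, intermediate_size: int, prefix: str = ""):
        super().__init__()
        # gate_proj and up_proj are merged into a single column-parallel matmul.
        self.gate_up_proj = MergedColumnParallelLinear(
            hidden_size, [intermediate_size] * 2, bias=False,
            prefix=f"{prefix}.gate_up_proj")
        self.act_fn = SiluAndMul()
        # The second FFN layer is row-parallel; its output is all-reduced.
        self.down_proj = RowParallelLinear(
            intermediate_size, hidden_size, bias=False,
            prefix=f"{prefix}.down_proj")

    def forward(self, x):
        # The parallel linear layers return an (output, bias) tuple.
        gate_up, _ = self.gate_up_proj(x)
        x = self.act_fn(gate_up)
        x, _ = self.down_proj(x)
        return x
```
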
## 4. Implement the weight loading logic

You now need to implement the `load_weights` method in your `*ForCausalLM` class.
This method should load the weights from the HuggingFace checkpoint file and assign them to the corresponding layers in your model. Specifically, for `MergedColumnParallelLinear` and `QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately.
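
As a loose sketch of the pattern used by existing models (the parameter mapping and helper below are illustrative; check a similar model such as Llama for the exact details for your architecture):

```python
from collections.abc import Iterable

import torch
from torch import nn

from vllm.model_executor.model_loader.weight_utils import default_weight_loader


class MyModelForCausalLM(nn.Module):
    # (vLLM param name, checkpoint weight name, shard id)
    stacked_params_mapping = [
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
        params_dict = dict(self.named_parameters())
        for name, loaded_weight in weights:
            for param_name, ckpt_name, shard_id in self.stacked_params_mapping:
                if ckpt_name not in name:
                    continue
                # Separate q/k/v (or gate/up) checkpoint matrices are loaded
                # into their merged vLLM parameter, one shard at a time.
                param = params_dict[name.replace(ckpt_name, param_name)]
                param.weight_loader(param, loaded_weight, shard_id)
                break
            else:
                # Unmerged weights fall back to a plain copy.
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                weight_loader(param, loaded_weight)
```
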
## 5. Register your model

See [this page](#new-model-registration) for instructions on how to register your new model to be used by vLLM.

## Frequently Asked Questions

### How to support models with interleaving sliding windows?

For models with interleaving sliding windows (e.g. `google/gemma-2-2b-it` and `mistralai/Ministral-8B-Instruct-2410`), the scheduler will treat the model as a full-attention model, i.e., kv-cache of all tokens will not be dropped. This is to make sure prefix caching works with these models. Sliding window only appears as a parameter to the attention kernel computation.

To support a model with interleaving sliding windows, we need to take care of the following details:

- Make sure the model's `config.json` contains `sliding_window_pattern`. vLLM then sets `self.hf_text_config.interleaved_sliding_window` to the value of `self.hf_text_config.sliding_window` and deletes `sliding_window` from `self.hf_text_config`. The model will then be treated as a full-attention model.
- In the modeling code, parse the correct sliding window value for every layer, and pass it to the attention layer's `per_layer_sliding_window` argument. For reference, check [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/model_executor/models/llama.py#L171). A short sketch of this second step follows below.

With these two steps, interleaving sliding windows should work with the model.
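
A hedged sketch of that per-layer parsing (the helper name and the pattern semantics here are illustrative assumptions, not vLLM API):

```python
from typing import Optional


def get_layer_sliding_window(hf_text_config, layer_idx: int) -> Optional[int]:
    """Pick the sliding window for one layer.

    Assumes a pattern where every `sliding_window_pattern`-th layer uses
    full (global) attention and the remaining layers use the sliding
    window; real models may encode the pattern differently.
    """
    sliding_window = getattr(hf_text_config, "interleaved_sliding_window", None)
    pattern = getattr(hf_text_config, "sliding_window_pattern", 1)
    if sliding_window is None:
        return None
    if (layer_idx + 1) % pattern == 0:
        # This layer uses full attention.
        return None
    return sliding_window
```

The returned value would then be passed as `per_layer_sliding_window` when constructing that layer's attention module.
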
@@ -1,27 +0,0 @@
(new-model)=

# Adding a New Model

This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM.

:::{toctree}
:caption: Contents
:maxdepth: 1

basic
registration
tests
multimodal
:::

:::{note}
The complexity of adding a new model depends heavily on the model's architecture.
The process is considerably more straightforward if the model shares a similar architecture with an existing model in vLLM.
However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
:::

:::{tip}
If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues)
or ask on our [developer slack](https://slack.vllm.ai).
We will be happy to help you out!
:::
@@ -1,834 +0,0 @@
(supports-multimodal)=

# Multi-Modal Support

This document walks you through the steps to extend a basic model so that it accepts [multi-modal inputs](#multimodal-inputs).

## 1. Update the base vLLM model

It is assumed that you have already implemented the model in vLLM according to [these steps](#new-model-basic).
Further update the model as follows:

- Reserve a keyword parameter in {meth}`~torch.nn.Module.forward` for each input tensor that corresponds to a multi-modal input, as shown in the following example:

  ```diff
    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
  +     pixel_values: torch.Tensor,
    ) -> SamplerOutput:
  ```

  More conveniently, you can simply pass `**kwargs` to the {meth}`~torch.nn.Module.forward` method and retrieve the keyword parameters for multimodal inputs from it.

- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings` that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs.

  ```python
  class YourModelForImage2Seq(nn.Module):
      ...

      def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:

          assert self.vision_encoder is not None
          image_features = self.vision_encoder(image_input)
          return self.multi_modal_projector(image_features)

      def get_multimodal_embeddings(
              self, **kwargs: object) -> Optional[MultiModalEmbeddings]:

          # Validate the multimodal input keyword arguments
          image_input = self._parse_and_validate_image_input(**kwargs)
          if image_input is None:
              return None

          # Run multimodal inputs through encoder and projector
          vision_embeddings = self._process_image_input(image_input)
          return vision_embeddings
  ```

  :::{important}
  The returned `multimodal_embeddings` must be either a **3D {class}`torch.Tensor`** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D {class}`torch.Tensor`'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g. image) of the request.
  :::

- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings` to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings.

  ```python
  from .utils import merge_multimodal_embeddings

  class YourModelForImage2Seq(nn.Module):
      ...

      def get_input_embeddings(
          self,
          input_ids: torch.Tensor,
          multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
      ) -> torch.Tensor:

          # `get_input_embeddings` should already be implemented for the language
          # model as one of the requirements of basic vLLM model implementation.
          inputs_embeds = self.language_model.get_input_embeddings(input_ids)

          if multimodal_embeddings is not None:
              inputs_embeds = merge_multimodal_embeddings(
                  input_ids=input_ids,
                  inputs_embeds=inputs_embeds,
                  multimodal_embeddings=multimodal_embeddings,
                  placeholder_token_id=self.config.image_token_index)

          return inputs_embeds
  ```

- Implement the {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model` getter to provide stable access to the underlying language model.

  ```python
  class YourModelForImage2Seq(nn.Module):
      ...

      def get_language_model(self) -> torch.nn.Module:
          # Change `language_model` according to your implementation.
          return self.language_model
  ```

- Once the above steps are done, update the model class with the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.

  ```diff
  + from vllm.model_executor.models.interfaces import SupportsMultiModal

  - class YourModelForImage2Seq(nn.Module):
  + class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
  ```

  :::{note}
  The model class does not have to be named {code}`*ForCausalLM`.
  Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples.
  :::

## 2. Specify processing information

Next, create a subclass of {class}`~vllm.multimodal.processing.BaseProcessingInfo`
to provide basic information related to HF processing.

### Maximum number of input items

You need to override the abstract method {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_supported_mm_limits`
to return the maximum number of input items for each modality supported by the model.

For example, if the model supports any number of images but only one video per prompt:

```python
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
    return {"image": None, "video": 1}
```

## 3. Specify dummy inputs

Then, inherit {class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` to construct dummy inputs for
HF processing as well as memory profiling.

### For memory profiling

Override the abstract methods {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text` and {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_mm_data` to construct dummy inputs for memory profiling. These dummy inputs should result in the worst-case memory usage of the model so that vLLM can reserve the correct amount of memory for it.

Assuming that the memory usage increases with the number of tokens, the dummy inputs can be constructed to maximize the number of output embeddings, which is the same number as placeholder feature tokens.

::::{tab-set}
:::{tab-item} Basic example: LLaVA
:sync: llava

Looking at the code of HF's `LlavaForConditionalGeneration`:

```python
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544
n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
n_image_features = image_features.shape[0] * image_features.shape[1]

if n_image_tokens != n_image_features:
    raise ValueError(
        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
    )
special_image_mask = (
    (input_ids == self.config.image_token_index)
    .unsqueeze(-1)
    .expand_as(inputs_embeds)
    .to(inputs_embeds.device)
)
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
```

The number of placeholder feature tokens per image is `image_features.shape[1]`.
`image_features` is calculated inside the `get_image_features` method:

```python
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300
image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)

selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
if vision_feature_select_strategy == "default":
    selected_image_feature = selected_image_feature[:, 1:]
elif vision_feature_select_strategy == "full":
    selected_image_feature = selected_image_feature
else:
    raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
image_features = self.multi_modal_projector(selected_image_feature)
return image_features
```

We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower
(`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model).
Moreover, we only need the sequence length (the second dimension of the tensor) to get `image_features.shape[1]`.
The sequence length is determined by the initial hidden states in `CLIPVisionTransformer` since the attention
mechanism doesn't change the sequence length of the output hidden states.

```python
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L1094-L1102
hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
hidden_states = self.pre_layrnorm(hidden_states)

encoder_outputs = self.encoder(
    inputs_embeds=hidden_states,
    output_attentions=output_attentions,
    output_hidden_states=output_hidden_states,
    return_dict=return_dict,
)
```

To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`:

```python
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257
target_dtype = self.patch_embedding.weight.dtype
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

class_embeds = self.class_embedding.expand(batch_size, 1, -1)
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
if interpolate_pos_encoding:
    embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
else:
    embeddings = embeddings + self.position_embedding(self.position_ids)
return embeddings
```

We can infer that `embeddings.shape[1] == self.num_positions`, where

```python
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L195-L196
self.num_patches = (self.image_size // self.patch_size) ** 2
self.num_positions = self.num_patches + 1
```

Overall, the number of placeholder feature tokens for an image can be calculated as:

```python
def get_num_image_tokens(
    self,
    *,
    image_width: int,
    image_height: int,
) -> int:
    hf_config = self.get_hf_config()
    hf_processor = self.get_hf_processor()

    image_size = hf_config.vision_config.image_size
    patch_size = hf_config.vision_config.patch_size

    num_image_tokens = (image_size // patch_size) ** 2 + 1
    if hf_processor.vision_feature_select_strategy == "default":
        num_image_tokens -= 1

    return num_image_tokens
```

Notice that the number of image tokens doesn't depend on the image width and height.
We can simply use a dummy `image_size` to calculate the multimodal profiling data:

```python
# NOTE: In actuality, this is usually implemented as part of the
# model's subclass of `BaseProcessingInfo`, but we show it as is
# here for simplicity.
def get_image_size_with_most_features(self) -> ImageSize:
    hf_config = self.get_hf_config()
    width = height = hf_config.image_size
    return ImageSize(width=width, height=height)

def get_dummy_mm_data(
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
) -> MultiModalDataDict:
    num_images = mm_counts.get("image", 0)

    target_width, target_height = \
        self.info.get_image_size_with_most_features()

    return {
        "image":
        self._get_dummy_images(width=target_width,
                               height=target_height,
                               num_images=num_images)
    }
```

For the text, we simply expand the multimodal image token from the model config to match the desired number of images.

```python
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
    num_images = mm_counts.get("image", 0)

    processor = self.info.get_hf_processor()
    image_token = processor.image_token

    return image_token * num_images
```

:::

:::{tab-item} No input placeholders: Fuyu
:sync: fuyu

Looking at the code of HF's `FuyuForCausalLM`:

```python
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/modeling_fuyu.py#L311-L322
if image_patches is not None and past_key_values is None:
    patch_embeddings = [
        self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype))
        .squeeze(0)
        .to(inputs_embeds.device)
        for patch in image_patches
    ]
    inputs_embeds = self.gather_continuous_embeddings(
        word_embeddings=inputs_embeds,
        continuous_embeddings=patch_embeddings,
        image_patch_input_indices=image_patches_indices,
    )
```

The number of placeholder feature tokens for the `i`th item in the batch is `patch_embeddings[i].shape[0]`,
which is the same as `image_patches[i].shape[0]`, i.e. `num_total_patches`.

Unlike LLaVA, Fuyu does not define the number of patches inside the modeling file. Where can we get more information?
Considering that the model input comes from the output of `FuyuProcessor`, let's **look at the preprocessing files**.

The image outputs are obtained by calling `FuyuImageProcessor.preprocess` and then
`FuyuImageProcessor.preprocess_with_tokenizer_info` inside `FuyuProcessor`.

In `FuyuImageProcessor.preprocess`, the images are resized and padded to the target `FuyuImageProcessor.size`,
returning the dimensions after resizing (but before padding) as metadata.

```python
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L541-L544
image_encoding = self.image_processor.preprocess(images, **output_kwargs["images_kwargs"])
batch_images = image_encoding["images"]
image_unpadded_heights = image_encoding["image_unpadded_heights"]
image_unpadded_widths = image_encoding["image_unpadded_widths"]

# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L480-L
if do_resize:
    batch_images = [
        [self.resize(image, size=size, input_data_format=input_data_format) for image in images]
        for images in batch_images
    ]

image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images]
image_unpadded_heights = [[image_size[0]] for image_size in image_sizes]
image_unpadded_widths = [[image_size[1]] for image_size in image_sizes]

if do_pad:
    batch_images = [
        [
            self.pad_image(
                image,
                size=size,
                mode=padding_mode,
                constant_values=padding_value,
                input_data_format=input_data_format,
            )
            for image in images
        ]
        for images in batch_images
    ]
```

In `FuyuImageProcessor.preprocess_with_tokenizer_info`, the images are split into patches based on this metadata:

```python
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L425
model_image_input = self.image_processor.preprocess_with_tokenizer_info(
    image_input=tensor_batch_images,
    image_present=image_present,
    image_unpadded_h=image_unpadded_heights,
    image_unpadded_w=image_unpadded_widths,
    image_placeholder_id=image_placeholder_id,
    image_newline_id=image_newline_id,
    variable_sized=True,
)

# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L638-L658
image_height, image_width = image.shape[1], image.shape[2]
if variable_sized:  # variable_sized=True
    new_h = min(
        image_height,
        math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height,
    )
    new_w = min(
        image_width,
        math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width,
    )
    image = image[:, :new_h, :new_w]
    image_height, image_width = new_h, new_w

num_patches = self.get_num_patches(image_height=image_height, image_width=image_width)
tensor_of_image_ids = torch.full(
    [num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device
)
patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0)
assert num_patches == patches.shape[0]
```

The number of patches is in turn defined by `FuyuImageProcessor.get_num_patches`:

```python
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L552-L562
patch_size = patch_size if patch_size is not None else self.patch_size
patch_height, patch_width = self.patch_size["height"], self.patch_size["width"]

if image_height % patch_height != 0:
    raise ValueError(f"{image_height=} must be divisible by {patch_height}")
if image_width % patch_width != 0:
    raise ValueError(f"{image_width=} must be divisible by {patch_width}")

num_patches_per_dim_h = image_height // patch_height
num_patches_per_dim_w = image_width // patch_width
num_patches = num_patches_per_dim_h * num_patches_per_dim_w
```

These image patches correspond to placeholder tokens (`|SPEAKER|`). So, we just need to maximize the number of image patches. Since input images are first resized
to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`.

```python
def get_image_size_with_most_features(self) -> ImageSize:
    image_processor = self.get_image_processor()
    return ImageSize(width=image_processor.size["width"],
                     height=image_processor.size["height"])
```

Fuyu does not expect image placeholders in the inputs to the HF processor, so
the dummy prompt text is empty regardless of the number of images.

```python
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
    return ""
```

For the multimodal image profiling data, the logic is very similar to LLaVA:

```python
def get_dummy_mm_data(
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
) -> MultiModalDataDict:
    target_width, target_height = \
        self.info.get_image_size_with_most_features()
    num_images = mm_counts.get("image", 0)

    return {
        "image":
        self._get_dummy_images(width=target_width,
                               height=target_height,
                               num_images=num_images)
    }
```

:::

::::

## 4. Specify processing details
|
||||
|
||||
Afterwards, create a subclass of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor`
|
||||
to fill in the missing details about HF processing.
|
||||
|
||||
:::{seealso}
|
||||
[Multi-Modal Data Processing](#mm-processing)
|
||||
:::
|
||||
|
||||
### Multi-modal fields
|
||||
|
||||
Override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` to
|
||||
return a schema of the tensors outputted by the HF processor that are related to the input multi-modal items.
|
||||
|
||||
:::::{tab-set}
|
||||
::::{tab-item} Basic example: LLaVA
|
||||
:sync: llava
|
||||
|
||||
The output of `CLIPImageProcessor` is a simple tensor with shape
|
||||
`(num_images, num_channels, image_height, image_width)`:
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/image_processing_clip.py#L339-L345
|
||||
images = [
|
||||
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
|
||||
for image in all_images
|
||||
]
|
||||
|
||||
data = {"pixel_values": images}
|
||||
return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
```
|
||||
|
||||
So, we override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` as follows:
|
||||
|
||||
```python
|
||||
def _get_mm_fields_config(
|
||||
self,
|
||||
hf_inputs: BatchFeature,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
) -> Mapping[str, MultiModalFieldConfig]:
|
||||
return dict(
|
||||
pixel_values=MultiModalFieldConfig.batched("image"),
|
||||
)
|
||||
```
|
||||
|
||||
:::{note}
|
||||
Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports
|
||||
pre-computed image embeddings, which can be passed to the model via the `image_embeds` argument.
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} With postprocessing: Fuyu
|
||||
:sync: fuyu
|
||||
|
||||
The `image_patches` output of `FuyuImageProcessor.preprocess_with_tokenizer_info` concatenates
|
||||
the patches from each image belonging to an item in the batch:
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L673-L679
|
||||
image_input_ids.append(tensor_of_image_ids)
|
||||
image_patches.append(patches)
|
||||
else:
|
||||
image_input_ids.append(torch.tensor([], dtype=torch.int32, device=image_input.device))
|
||||
|
||||
batch_image_input_ids.append(image_input_ids)
|
||||
batch_image_patches.append(image_patches)
|
||||
```
|
||||
|
||||
The shape of `image_patches` outputted by `FuyuImageProcessor` is therefore
|
||||
`(1, num_images, num_patches, patch_width * patch_height * num_channels)`.
|
||||
|
||||
In order to support the use of {func}`MultiModalFieldConfig.batched` like in LLaVA,
|
||||
we remove the extra batch dimension by overriding {meth}`BaseMultiModalProcessor._call_hf_processor`:
|
||||
|
||||
```python
|
||||
def _call_hf_processor(
|
||||
self,
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
processed_outputs = super()._call_hf_processor(
|
||||
prompt=prompt,
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
)
|
||||
|
||||
image_patches = processed_outputs.get("image_patches")
|
||||
if image_patches is not None:
|
||||
images = mm_data["images"]
|
||||
assert isinstance(images, list)
|
||||
|
||||
# Original output: (1, num_images, Pn, Px * Py * C)
|
||||
# New output: (num_images, Pn, Px * Py * C)
|
||||
assert (isinstance(image_patches, list)
|
||||
and len(image_patches) == 1)
|
||||
assert (isinstance(image_patches[0], torch.Tensor)
|
||||
and len(image_patches[0]) == len(images))
|
||||
|
||||
processed_outputs["image_patches"] = image_patches[0]
|
||||
|
||||
return processed_outputs
|
||||
```
|
||||
|
||||
:::{note}
|
||||
Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling
|
||||
for text-only inputs to prevent unnecessary warnings from HF processor.
|
||||
:::
|
||||
|
||||
This lets us override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` as follows:
|
||||
|
||||
```python
|
||||
def _get_mm_fields_config(
|
||||
self,
|
||||
hf_inputs: BatchFeature,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
) -> Mapping[str, MultiModalFieldConfig]:
|
||||
return dict(image_patches=MultiModalFieldConfig.batched("image"))
|
||||
```
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
### Prompt updates
|
||||
|
||||
Override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates` to
|
||||
return a list of {class}`~vllm.multimodal.processing.PromptUpdate` instances.
|
||||
|
||||
Each {class}`~vllm.multimodal.processing.PromptUpdate` instance specifies an update operation
|
||||
(e.g.: insertion, replacement) performed by the HF processor.
|
||||
|
||||
::::{tab-set}
|
||||
:::{tab-item} Basic example: LLaVA
|
||||
:sync: llava
|
||||
|
||||
Looking at HF's `LlavaProcessor`:
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/processing_llava.py#L167-L170
|
||||
prompt_strings = []
|
||||
for sample in text:
|
||||
sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
|
||||
prompt_strings.append(sample)
|
||||
```
|
||||
|
||||
It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`).
|
||||
Based on this, we override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates` as follows:
|
||||
|
||||
```python
|
||||
def _get_prompt_updates(
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
out_mm_kwargs: MultiModalKwargs,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
hf_config = self.info.get_hf_config()
|
||||
image_token_id = hf_config.image_token_index
|
||||
|
||||
def get_replacement(item_idx: int):
|
||||
images = mm_items.get_items("image", ImageProcessorItems)
|
||||
|
||||
image_size = images.get_image_size(item_idx)
|
||||
num_image_tokens = self.info.get_num_image_tokens(
|
||||
image_width=image_size.width,
|
||||
image_height=image_size.height,
|
||||
)
|
||||
|
||||
return [image_token_id] * num_image_tokens
|
||||
|
||||
return [
|
||||
PromptReplacement(
|
||||
modality="image",
|
||||
target=[image_token_id],
|
||||
replacement=get_replacement,
|
||||
),
|
||||
]
|
||||
```
|
||||
|
||||
:::
|
||||
|
||||
:::{tab-item} Handling additional tokens: Fuyu
|
||||
:sync: fuyu
|
||||
|
||||
Recall the layout of feature tokens from Step 2:
|
||||
|
||||
```
|
||||
|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
|
||||
|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
|
||||
...
|
||||
|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
|
||||
```
|
||||
|
||||
We define a helper function to return `ncols` and `nrows` directly:
|
||||
|
||||
```python
|
||||
def get_image_feature_grid_size(
|
||||
self,
|
||||
*,
|
||||
image_width: int,
|
||||
image_height: int,
|
||||
) -> tuple[int, int]:
|
||||
image_processor = self.get_image_processor()
|
||||
target_width = image_processor.size["width"]
|
||||
target_height = image_processor.size["height"]
|
||||
patch_width = image_processor.patch_size["width"]
|
||||
patch_height = image_processor.patch_size["height"]
|
||||
|
||||
if not (image_width <= target_width and image_height <= target_height):
|
||||
height_scale_factor = target_height / image_height
|
||||
width_scale_factor = target_width / image_width
|
||||
optimal_scale_factor = min(height_scale_factor, width_scale_factor)
|
||||
|
||||
image_height = int(image_height * optimal_scale_factor)
|
||||
image_width = int(image_width * optimal_scale_factor)
|
||||
|
||||
ncols = math.ceil(image_width / patch_width)
|
||||
nrows = math.ceil(image_height / patch_height)
|
||||
return ncols, nrows
|
||||
```
|
||||
|
||||
Based on this, we can initially define our replacement tokens as:
|
||||
|
||||
```python
|
||||
def get_replacement(item_idx: int):
|
||||
images = mm_items.get_items("image", ImageProcessorItems)
|
||||
image_size = images.get_image_size(item_idx)
|
||||
|
||||
ncols, nrows = self.info.get_image_feature_grid_size(
|
||||
image_width=image_size.width,
|
||||
image_height=image_size.height,
|
||||
)
|
||||
|
||||
# `_IMAGE_TOKEN_ID` corresponds to `|SPEAKER|`
|
||||
# `_NEWLINE_TOKEN_ID` corresponds to `|NEWLINE|`
|
||||
return ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
|
||||
```
|
||||
|
||||
However, this is not entirely correct. After `FuyuImageProcessor.preprocess_with_tokenizer_info` is called,
|
||||
a BOS token (`<s>`) is also added to the prompt:
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L435
|
||||
model_image_input = self.image_processor.preprocess_with_tokenizer_info(
|
||||
image_input=tensor_batch_images,
|
||||
image_present=image_present,
|
||||
image_unpadded_h=image_unpadded_heights,
|
||||
image_unpadded_w=image_unpadded_widths,
|
||||
image_placeholder_id=image_placeholder_id,
|
||||
image_newline_id=image_newline_id,
|
||||
variable_sized=True,
|
||||
)
|
||||
prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
|
||||
tokenizer=self.tokenizer,
|
||||
prompts=prompts,
|
||||
scale_factors=scale_factors,
|
||||
max_tokens_to_generate=self.max_tokens_to_generate,
|
||||
max_position_embeddings=self.max_position_embeddings,
|
||||
add_BOS=True,
|
||||
add_beginning_of_answer_token=True,
|
||||
)
|
||||
```
|
||||
|
||||
To assign the vision embeddings to only the image tokens, instead of a string
|
||||
you can return an instance of {class}`~vllm.multimodal.processing.PromptUpdateDetails`:
|
||||
|
||||
```python
|
||||
hf_config = self.info.get_hf_config()
|
||||
bos_token_id = hf_config.bos_token_id # `<s>`
|
||||
assert isinstance(bos_token_id, int)
|
||||
|
||||
def get_replacement_fuyu(item_idx: int):
|
||||
images = mm_items.get_items("image", ImageProcessorItems)
|
||||
image_size = images.get_image_size(item_idx)
|
||||
|
||||
ncols, nrows = self.info.get_image_feature_grid_size(
|
||||
image_width=image_size.width,
|
||||
image_height=image_size.height,
|
||||
)
|
||||
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
|
||||
[_NEWLINE_TOKEN_ID]) * nrows
|
||||
|
||||
return PromptUpdateDetails.select_token_id(
|
||||
image_tokens + [bos_token_id],
|
||||
embed_token_id=_IMAGE_TOKEN_ID,
|
||||
)
|
||||
```
|
||||
|
||||
Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the tokenized prompt,
|
||||
we can search for it to conduct the replacement at the start of the string:
|
||||
|
||||
```python
|
||||
def _get_prompt_updates(
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
out_mm_kwargs: MultiModalKwargs,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
hf_config = self.info.get_hf_config()
|
||||
bos_token_id = hf_config.bos_token_id
|
||||
assert isinstance(bos_token_id, int)
|
||||
|
||||
tokenizer = self.info.get_tokenizer()
|
||||
eot_token_id = tokenizer.bos_token_id
|
||||
assert isinstance(eot_token_id, int)
|
||||
|
||||
def get_replacement_fuyu(item_idx: int):
|
||||
images = mm_items.get_items("image", ImageProcessorItems)
|
||||
image_size = images.get_image_size(item_idx)
|
||||
|
||||
ncols, nrows = self.info.get_image_feature_grid_size(
|
||||
image_width=image_size.width,
|
||||
image_height=image_size.height,
|
||||
)
|
||||
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
|
||||
[_NEWLINE_TOKEN_ID]) * nrows
|
||||
|
||||
return PromptUpdateDetails.select_token_id(
|
||||
image_tokens + [bos_token_id],
|
||||
embed_token_id=_IMAGE_TOKEN_ID,
|
||||
)
|
||||
|
||||
return [
|
||||
PromptReplacement(
|
||||
modality="image",
|
||||
target=[eot_token_id],
|
||||
replacement=get_replacement_fuyu,
|
||||
)
|
||||
]
|
||||
```
|
||||
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
## 5. Register processor-related classes
|
||||
|
||||
After you have defined {class}`~vllm.multimodal.processing.BaseProcessingInfo` (Step 2),
|
||||
{class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` (Step 3),
|
||||
and {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` (Step 4),
|
||||
decorate the model class with {meth}`MULTIMODAL_REGISTRY.register_processor <vllm.multimodal.registry.MultiModalRegistry.register_processor>`
|
||||
to register them to the multi-modal registry:
|
||||
|
||||
```diff
|
||||
from vllm.model_executor.models.interfaces import SupportsMultiModal
|
||||
+ from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
|
||||
+ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor,
|
||||
+ info=YourProcessingInfo,
|
||||
+ dummy_inputs=YourDummyInputsBuilder)
|
||||
class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
### Inserting feature tokens without replacement
|
||||
|
||||
Some HF processors directly insert feature tokens without replacing anything in the original prompt. In that case, you can use {class}`~vllm.multimodal.processing.PromptInsertion` instead of {class}`~vllm.multimodal.processing.PromptReplacement` inside {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates`.
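For instance, here is a minimal sketch of an insertion at the start of the prompt. The `PromptIndexTargets.start()` target and the fixed-length `get_insertion` helper are illustrative assumptions rather than code from a specific model:

```python
def _get_prompt_updates(
    self,
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    out_mm_kwargs: MultiModalKwargs,
) -> Sequence[PromptUpdate]:
    hf_config = self.info.get_hf_config()
    image_token_id = hf_config.image_token_index

    def get_insertion(item_idx: int):
        # Hypothetical: each image contributes a fixed number of feature tokens
        num_image_tokens = self.info.get_num_image_tokens()
        return [image_token_id] * num_image_tokens

    return [
        PromptInsertion(
            modality="image",
            target=PromptIndexTargets.start(),
            insertion=get_insertion,
        ),
    ]
```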
|
||||
|
||||
Examples:
|
||||
|
||||
- BLIP-2 (insert at start of prompt): <gh-file:vllm/model_executor/models/blip2.py>
|
||||
- Florence2 (insert at start of prompt): <gh-file:vllm/model_executor/models/florence2.py>
|
||||
- Molmo (insert after `<|endoftext|>` token): <gh-file:vllm/model_executor/models/molmo.py>
|
||||
|
||||
### Handling prompt updates unrelated to multi-modal data
|
||||
|
||||
{meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates` assumes that each application of prompt update corresponds to one multi-modal item. If the HF processor performs additional processing regardless of how many multi-modal items there are, you should override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_tokens_only` so that the processed token inputs are consistent with the result of applying the HF processor on text inputs. This is because token inputs bypass the HF processor according to [our design](#mm-processing).
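For instance, a hedged sketch of such an override for a processor that always appends a separator token to text inputs (the separator behaviour here is an assumption made for illustration; see the real implementations listed below):

```python
def _apply_hf_processor_tokens_only(
    self,
    prompt_tokens: list[int],
) -> list[int]:
    # Mirror what the HF processor does to text inputs:
    # it appends a separator token, so token inputs must get it too.
    tokenizer = self.info.get_tokenizer()
    sep_token_id = tokenizer.sep_token_id
    assert isinstance(sep_token_id, int)

    return prompt_tokens + [sep_token_id]
```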
|
||||
|
||||
Examples:
|
||||
|
||||
- Chameleon (appends `sep_token`): <gh-file:vllm/model_executor/models/chameleon.py>
|
||||
- Fuyu (appends `boa_token`): <gh-file:vllm/model_executor/models/fuyu.py>
|
||||
- Molmo (applies chat template which is not defined elsewhere): <gh-file:vllm/model_executor/models/molmo.py>
|
||||
|
||||
### Custom HF processor
|
||||
|
||||
Some models don't define a HF processor class on HF Hub. In that case, you can define a custom HF processor that has the same call signature as HF processors and pass it to {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor`.
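A minimal sketch of what such a processor could look like is shown below. The class name, constructor arguments, and internal calls are hypothetical; only the general call signature mirrors HF processors:

```python
from transformers import BatchFeature


class YourCustomProcessor:

    def __init__(self, config, tokenizer, image_processor) -> None:
        self.config = config
        self.tokenizer = tokenizer
        self.image_processor = image_processor

    def __call__(
        self,
        text=None,
        images=None,
        return_tensors=None,
        **kwargs,
    ) -> BatchFeature:
        # Tokenize the text and preprocess the images separately,
        # then return everything in a single BatchFeature,
        # just like a HF processor would.
        text_inputs = self.tokenizer(text) if text is not None else {}
        image_inputs = (self.image_processor(images)
                        if images is not None else {})

        return BatchFeature({**text_inputs, **image_inputs},
                            tensor_type=return_tensors)
```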
|
||||
|
||||
Examples:
|
||||
|
||||
- DeepSeek-VL2: <gh-file:vllm/model_executor/models/deepseek_vl2.py>
|
||||
- InternVL: <gh-file:vllm/model_executor/models/internvl.py>
|
||||
- Qwen-VL: <gh-file:vllm/model_executor/models/qwen_vl.py>
|
||||
@@ -1,55 +0,0 @@
|
||||
(new-model-registration)=
|
||||
|
||||
# Registering a Model to vLLM
|
||||
|
||||
vLLM relies on a model registry to determine how to run each model.
|
||||
A list of pre-registered architectures can be found [here](#supported-models).
|
||||
|
||||
If your model is not on this list, you must register it to vLLM.
|
||||
This page provides detailed instructions on how to do so.
|
||||
|
||||
## Built-in models
|
||||
|
||||
To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source](#build-from-source).
|
||||
This gives you the ability to modify the codebase and test your model.
|
||||
|
||||
After you have implemented your model (see [tutorial](#new-model-basic)), put it into the <gh-dir:vllm/model_executor/models> directory.
|
||||
Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM.
|
||||
Finally, update our [list of supported models](#supported-models) to promote your model!
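For illustration, a registry entry maps the architecture name (as it appears in the model's `config.json`) to the module and class that implement it. The dictionary name below is an assumption about the file's internal layout; check <gh-file:vllm/model_executor/models/registry.py> for the dictionary that matches your model type:

```python
_TEXT_GENERATION_MODELS = {
    # ... existing entries, kept in alphabetical order ...
    # "ArchitectureName": (module name under vllm/model_executor/models, class name)
    "YourModelForCausalLM": ("your_model", "YourModelForCausalLM"),
}
```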
|
||||
|
||||
:::{important}
|
||||
The list of models in each section should be maintained in alphabetical order.
|
||||
:::
|
||||
|
||||
## Out-of-tree models
|
||||
|
||||
You can load an external model using a plugin without modifying the vLLM codebase.
|
||||
|
||||
:::{seealso}
|
||||
[vLLM's Plugin System](#plugin-system)
|
||||
:::
|
||||
|
||||
To register the model, use the following code:
|
||||
|
||||
```python
|
||||
from vllm import ModelRegistry
|
||||
from your_code import YourModelForCausalLM
|
||||
ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM)
|
||||
```
|
||||
|
||||
If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`:
|
||||
|
||||
```python
|
||||
from vllm import ModelRegistry
|
||||
|
||||
ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM")
|
||||
```
|
||||
|
||||
:::{important}
|
||||
If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
|
||||
Read more about that [here](#supports-multimodal).
|
||||
:::
|
||||
|
||||
:::{note}
|
||||
Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server. A sketch of such a plugin's packaging follows this note.
|
||||
:::
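For example, here is a sketch of how such a plugin package might expose its registration function through an entry point. The package layout and entry point group are assumptions based on vLLM's plugin system; see that page for the authoritative details:

```python
# setup.py of a hypothetical out-of-tree package
from setuptools import setup

setup(
    name="vllm-your-model-plugin",
    version="0.1",
    packages=["your_code"],
    entry_points={
        "vllm.general_plugins": [
            # `your_code.register` would contain the
            # `ModelRegistry.register_model(...)` call shown above.
            "register_your_model = your_code:register",
        ],
    },
)
```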
|
||||
@@ -1,63 +0,0 @@
|
||||
(new-model-tests)=
|
||||
|
||||
# Writing Unit Tests
|
||||
|
||||
This page explains how to write unit tests to verify the implementation of your model.
|
||||
|
||||
## Required Tests
|
||||
|
||||
These tests are necessary to get your PR merged into vLLM library.
|
||||
Without them, the CI for your PR will fail.
|
||||
|
||||
### Model loading
|
||||
|
||||
Include an example HuggingFace repository for your model in <gh-file:tests/models/registry.py>.
|
||||
This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM.
|
||||
|
||||
:::{important}
|
||||
The list of models in each section should be maintained in alphabetical order.
|
||||
:::
|
||||
|
||||
:::{tip}
|
||||
If your model requires a development version of HF Transformers, you can set
|
||||
`min_transformers_version` to skip the test in CI until the model is released.
|
||||
:::
|
||||
|
||||
## Optional Tests
|
||||
|
||||
These tests are optional to get your PR merged into vLLM library.
|
||||
Passing these tests provides more confidence that your implementation is correct, and helps avoid future regressions.
|
||||
|
||||
### Model correctness
|
||||
|
||||
These tests compare the model outputs of vLLM against [HF Transformers](https://github.com/huggingface/transformers). You can add new tests under the subdirectories of <gh-dir:tests/models>.
|
||||
|
||||
#### Generative models
|
||||
|
||||
For [generative models](#generative-models), there are two levels of correctness tests, as defined in <gh-file:tests/models/utils.py>:
|
||||
|
||||
- Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF.
|
||||
- Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa. A sketch of such a test is shown below.
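The following is a rough sketch of a logprobs-similarity test, loosely modeled on existing model tests. The `hf_runner`/`vllm_runner` fixtures, the generation helpers, and the model name are assumptions; check the existing tests for the exact interfaces:

```python
import pytest

# Adjust the relative import to match your test file's location
from ..utils import check_logprobs_close


@pytest.mark.parametrize("model", ["your-org/your-model"])  # hypothetical repo
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, example_prompts, model, max_tokens,
                num_logprobs) -> None:
    with vllm_runner(model) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    with hf_runner(model) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
```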
|
||||
|
||||
#### Pooling models
|
||||
|
||||
For [pooling models](#pooling-models), we simply check the cosine similarity, as defined in <gh-file:tests/models/embedding/utils.py>.
|
||||
|
||||
(mm-processing-tests)=
|
||||
|
||||
### Multi-modal processing
|
||||
|
||||
#### Common tests
|
||||
|
||||
Adding your model to <gh-file:tests/models/multimodal/processing/test_common.py> verifies that the following input combinations result in the same outputs:
|
||||
|
||||
- Text + multi-modal data
|
||||
- Tokens + multi-modal data
|
||||
- Text + cached multi-modal data
|
||||
- Tokens + cached multi-modal data
|
||||
|
||||
#### Model-specific tests
|
||||
|
||||
You can add a new file under <gh-dir:tests/models/multimodal/processing> to run tests that only apply to your model.
|
||||
|
||||
For example, if the HF processor for your model accepts user-specified keyword arguments, you can verify that the keyword arguments are being applied correctly, such as in <gh-file:tests/models/multimodal/processing/test_phi3v.py>.
|
||||
@@ -1,180 +0,0 @@
|
||||
# Contributing to vLLM
|
||||
|
||||
Thank you for your interest in contributing to vLLM! Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. There are several ways you can contribute to the project:
|
||||
|
||||
- Identify and report any issues or bugs.
|
||||
- Request or add support for a new model.
|
||||
- Suggest or implement new features.
|
||||
- Improve documentation or contribute a how-to guide.
|
||||
|
||||
We also believe in the power of community support; thus, answering queries, offering PR reviews, and assisting others are also highly regarded and beneficial contributions.
|
||||
|
||||
Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository!
|
||||
|
||||
## Job Board
|
||||
|
||||
Unsure on where to start? Check out the following links for tasks to work on:
|
||||
|
||||
- [Good first issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22good%20first%20issue%22)
|
||||
- [Selected onboarding tasks](gh-project:6)
|
||||
- [New model requests](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22new-model%22)
|
||||
- [Models with multi-modal capabilities](gh-project:10)
|
||||
|
||||
## License
|
||||
|
||||
See <gh-file:LICENSE>.
|
||||
|
||||
## Developing
|
||||
|
||||
Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation.
|
||||
Check out the [building from source](#build-from-source) documentation for details.
|
||||
|
||||
## Testing
|
||||
|
||||
```bash
|
||||
pip install -r requirements/dev.txt
|
||||
|
||||
# Linting, formatting and static type checking
|
||||
pre-commit install --hook-type pre-commit --hook-type commit-msg
|
||||
|
||||
# You can manually run pre-commit with
|
||||
pre-commit run --all-files
|
||||
|
||||
# To manually run something from CI that does not run
|
||||
# locally by default, you can run:
|
||||
pre-commit run mypy-3.9 --hook-stage manual --all-files
|
||||
|
||||
# Unit tests
|
||||
pytest tests/
|
||||
```
|
||||
|
||||
:::{tip}
|
||||
Since the <gh-file:docker/Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12.
|
||||
|
||||
Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.
|
||||
:::
|
||||
|
||||
:::{note}
|
||||
Currently, the repository is not fully checked by `mypy`.
|
||||
:::
|
||||
|
||||
:::{note}
|
||||
Currently, not all unit tests pass when run on CPU platforms. If you don't have access to a GPU
|
||||
platform to run unit tests locally, rely on the continuous integration system to run the tests for
|
||||
now.
|
||||
:::
|
||||
|
||||
## Issues
|
||||
|
||||
If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
|
||||
|
||||
:::{important}
|
||||
If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability).
|
||||
:::
|
||||
|
||||
## Pull Requests & Code Reviews
|
||||
|
||||
Thank you for your contribution to vLLM! Before submitting the pull request,
|
||||
please ensure the PR meets the following criteria. This helps vLLM maintain the
|
||||
code quality and improve the efficiency of the review process.
|
||||
|
||||
### DCO and Signed-off-by
|
||||
|
||||
When contributing changes to this project, you must agree to the <gh-file:DCO>.
|
||||
Commits must include a `Signed-off-by:` header which certifies agreement with
|
||||
the terms of the DCO.
|
||||
|
||||
Using `-s` with `git commit` will automatically add this header.
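For example:

```bash
git commit -s -m "[Doc] Fix a typo in the multimodal processor guide"
```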
|
||||
|
||||
### PR Title and Classification
|
||||
|
||||
Only specific types of PRs will be reviewed. The PR title is prefixed
|
||||
appropriately to indicate the type of change. Please use one of the following:
|
||||
|
||||
- `[Bugfix]` for bug fixes.
|
||||
- `[CI/Build]` for build or continuous integration improvements.
|
||||
- `[Doc]` for documentation fixes and improvements.
|
||||
- `[Model]` for adding a new model or improving an existing model. Model name
|
||||
should appear in the title.
|
||||
- `[Frontend]` for changes to the vLLM frontend (e.g., OpenAI API server,
|
||||
`LLM` class, etc.)
|
||||
- `[Kernel]` for changes affecting CUDA kernels or other compute kernels.
|
||||
- `[Core]` for changes in the core vLLM logic (e.g., `LLMEngine`,
|
||||
`AsyncLLMEngine`, `Scheduler`, etc.)
|
||||
- `[Hardware][Vendor]` for hardware-specific changes. Vendor name should
|
||||
appear in the prefix (e.g., `[Hardware][AMD]`).
|
||||
- `[Misc]` for PRs that do not fit the above categories. Please use this
|
||||
sparingly.
|
||||
|
||||
:::{note}
|
||||
If the PR spans more than one category, please include all relevant prefixes.
|
||||
:::
|
||||
|
||||
### Code Quality
|
||||
|
||||
The PR needs to meet the following code quality standards:
|
||||
|
||||
- We adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html).
|
||||
- Pass all linter checks. Please use `pre-commit` to format your code. See
|
||||
<https://pre-commit.com/#usage> if `pre-commit` is new to you.
|
||||
- The code needs to be well-documented to ensure future contributors can easily
|
||||
understand the code.
|
||||
- Include sufficient tests to ensure the project stays correct and robust. This
|
||||
includes both unit tests and integration tests.
|
||||
- Please add documentation to `docs/source/` if the PR modifies the
|
||||
user-facing behaviors of vLLM. It helps vLLM users understand and utilize the
|
||||
new features or changes.
|
||||
|
||||
### Adding or Changing Kernels
|
||||
|
||||
Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.
|
||||
|
||||
- Make sure custom ops are registered following PyTorch guidelines:
|
||||
[Custom C++ and CUDA Operators](https://pytorch.org/tutorials/advanced/cpp_custom_ops.html#cpp-custom-ops-tutorial)
|
||||
and [The Custom Operators Manual](https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU).
|
||||
- Custom operations that return `Tensors` require meta-functions.
  Meta-functions should be implemented and registered in Python so that dynamic
  dims can be handled automatically. See the above documents for a description of
  meta-functions, and the sketch after this list for a minimal example.
|
||||
- Use [torch.library.opcheck()](https://pytorch.org/docs/stable/library.html#torch.library.opcheck)
|
||||
to test the function registration and meta-function for any registered ops.
|
||||
See `tests/kernels` for examples.
|
||||
- When changing the C++ signature of an existing op, the schema must be updated
|
||||
to reflect the changes.
|
||||
- If a new custom type is needed, see the following document:
|
||||
[Custom Class Support in PT2](https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA).
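To make the meta-function and `opcheck` workflow concrete, here is a self-contained sketch using a toy custom op defined in Python (a real vLLM kernel would be implemented in C++/CUDA; the op name and logic here are made up):

```python
import torch


# A toy custom op, defined in Python purely for illustration.
@torch.library.custom_op("myops::scale", mutates_args=())
def scale(x: torch.Tensor, factor: float) -> torch.Tensor:
    return x * factor


# The meta ("fake") implementation only computes output metadata,
# which lets dynamic dims be handled automatically during compilation.
@scale.register_fake
def _(x: torch.Tensor, factor: float) -> torch.Tensor:
    return torch.empty_like(x)


# Validate the schema and fake implementation against sample inputs.
torch.library.opcheck(scale, (torch.randn(4, 8), 2.0))
```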
|
||||
|
||||
### Notes for Large Changes
|
||||
|
||||
Please keep the changes as concise as possible. For major architectural changes
|
||||
(>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue
|
||||
(RFC) discussing the technical design and justification. Otherwise, we will tag
|
||||
it with `rfc-required` and might not go through the PR.
|
||||
|
||||
### What to Expect for the Reviews
|
||||
|
||||
The goal of the vLLM team is to be a *transparent reviewing machine*. We would
|
||||
like to make the review process transparent and efficient and make sure no
|
||||
contributor feels confused or frustrated. However, the vLLM team is small, so we
|
||||
need to prioritize some PRs over others. Here is what you can expect from the
|
||||
review process:
|
||||
|
||||
- After the PR is submitted, the PR will be assigned to a reviewer. Every
|
||||
reviewer will pick up the PRs based on their expertise and availability.
|
||||
- After the PR is assigned, the reviewer will provide status updates every 2-3
|
||||
days. If the PR is not reviewed within 7 days, please feel free to ping the
|
||||
reviewer or the vLLM team.
|
||||
- After the review, the reviewer will put an `action-required` label on the PR
|
||||
if there are changes required. The contributor should address the comments and
|
||||
ping the reviewer to re-review the PR.
|
||||
- Please respond to all comments within a reasonable time frame. If a comment
|
||||
isn't clear or you disagree with a suggestion, feel free to ask for
|
||||
clarification or discuss the suggestion.
|
||||
- Note that not all CI checks will be executed due to limited computational
|
||||
resources. The reviewer will add `ready` label to the PR when the PR is
|
||||
ready to merge or a full CI run is needed.
|
||||
|
||||
## Thank You
|
||||
|
||||
Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM.
|
||||
All of your contributions help make vLLM a great tool and community for everyone!
|
||||
@@ -1,175 +0,0 @@
|
||||
# Profiling vLLM
|
||||
|
||||
:::{warning}
|
||||
Profiling is only intended for vLLM developers and maintainers to understand the proportion of time spent in different parts of the codebase. **vLLM end-users should never turn on profiling** as it will significantly slow down the inference.
|
||||
:::
|
||||
|
||||
## Profile with PyTorch Profiler
|
||||
|
||||
We support tracing vLLM workers using the `torch.profiler` module. You can enable tracing by setting the `VLLM_TORCH_PROFILER_DIR` environment variable to the directory where you want to save the traces: `VLLM_TORCH_PROFILER_DIR=/mnt/traces/`
|
||||
|
||||
The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set.
|
||||
|
||||
When using `benchmarks/benchmark_serving.py`, you can enable profiling by passing the `--profile` flag.
|
||||
|
||||
Traces can be visualized using <https://ui.perfetto.dev/>.
|
||||
|
||||
:::{tip}
|
||||
Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, there is no need to untar the traces; they can be viewed directly.
|
||||
:::
|
||||
|
||||
:::{tip}
|
||||
When you stop the profiler, it flushes all the profile trace files to the directory. This takes time: for example, the data from about 100 requests on a Llama 70B model takes about 10 minutes to flush on an H100.
Set the environment variable `VLLM_RPC_TIMEOUT` to a large value (e.g. 30 minutes) before you start the server:
`export VLLM_RPC_TIMEOUT=1800000`
|
||||
:::
|
||||
|
||||
### Example commands and usage
|
||||
|
||||
#### Offline Inference
|
||||
|
||||
Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example.
|
||||
|
||||
#### OpenAI Server
|
||||
|
||||
```bash
|
||||
VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B
|
||||
```
|
||||
|
||||
benchmark_serving.py:
|
||||
|
||||
```bash
|
||||
python benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Meta-Llama-3-70B --dataset-name sharegpt --dataset-path sharegpt.json --profile --num-prompts 2
|
||||
```
|
||||
|
||||
## Profile with NVIDIA Nsight Systems
|
||||
|
||||
Nsight systems is an advanced tool that exposes more profiling details, such as register and shared memory usage, annotated code regions and low-level CUDA APIs and events.
|
||||
|
||||
[Install nsight-systems](https://docs.nvidia.com/nsight-systems/InstallationGuide/index.html) using your package manager.
|
||||
The following block is an example for Ubuntu.
|
||||
|
||||
```bash
|
||||
apt update
|
||||
apt install -y --no-install-recommends gnupg
|
||||
echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu$(source /etc/lsb-release; echo "$DISTRIB_RELEASE" | tr -d .)/$(dpkg --print-architecture) /" | tee /etc/apt/sources.list.d/nvidia-devtools.list
|
||||
apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
|
||||
apt update
|
||||
apt install nsight-systems-cli
|
||||
```
|
||||
|
||||
### Example commands and usage
|
||||
|
||||
#### Offline Inference
|
||||
|
||||
For basic usage, you can just append `nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node` before any existing script you would run for offline inference.
|
||||
|
||||
The following is an example using the `benchmarks/benchmark_latency.py` script:
|
||||
|
||||
```bash
|
||||
nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node python benchmarks/benchmark_latency.py --model meta-llama/Llama-3.1-8B-Instruct --num-iters-warmup 5 --num-iters 1 --batch-size 16 --input-len 512 --output-len 8
|
||||
```
|
||||
|
||||
#### OpenAI Server
|
||||
|
||||
To profile the server, prepend your `vllm serve` command with `nsys profile` just as for offline inference; however, you must specify the `--delay XX --duration YY` parameters according to the needs of your benchmark. After the duration has elapsed, the server will be killed.
|
||||
|
||||
```bash
|
||||
# server
|
||||
nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node --delay 30 --duration 60 vllm serve meta-llama/Llama-3.1-8B-Instruct
|
||||
|
||||
# client
|
||||
python benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Llama-3.1-8B-Instruct --num-prompts 1 --dataset-name random --random-input 1024 --random-output 512
|
||||
```
|
||||
|
||||
In practice, you should set the `--duration` argument to a large value. Whenever you want the server to stop profiling, run:
|
||||
|
||||
```bash
|
||||
nsys sessions list
|
||||
```
|
||||
|
||||
to get the session id in the form of `profile-XXXXX`, then run:
|
||||
|
||||
```bash
|
||||
nsys stop --session=profile-XXXXX
|
||||
```
|
||||
|
||||
to manually kill the profiler and generate your `nsys-rep` report.
|
||||
|
||||
#### Analysis
|
||||
|
||||
You can view these profiles either as summaries in the CLI, using `nsys stats [profile-file]`, or in the GUI by installing Nsight [locally following the directions here](https://developer.nvidia.com/nsight-systems/get-started).
|
||||
|
||||
CLI example:
|
||||
|
||||
```bash
|
||||
nsys stats report1.nsys-rep
|
||||
...
|
||||
** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):
|
||||
|
||||
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
|
||||
-------- --------------- --------- ----------- ----------- -------- --------- ----------- ----------------------------------------------------------------------------------------------------
|
||||
46.3 10,327,352,338 17,505 589,965.9 144,383.0 27,040 3,126,460 944,263.8 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize128x128x64_warpgroupsize1x1x1_execute_segment_k_of…
|
||||
14.8 3,305,114,764 5,152 641,520.7 293,408.0 287,296 2,822,716 867,124.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize256x128x64_warpgroupsize2x1x1_execute_segment_k_of…
|
||||
12.1 2,692,284,876 14,280 188,535.4 83,904.0 19,328 2,862,237 497,999.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize64x128x64_warpgroupsize1x1x1_execute_segment_k_off…
|
||||
9.5 2,116,600,578 33,920 62,399.8 21,504.0 15,326 2,532,285 290,954.1 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize64x64x64_warpgroupsize1x1x1_execute_segment_k_off_…
|
||||
5.0 1,119,749,165 18,912 59,208.4 9,056.0 6,784 2,578,366 271,581.7 void vllm::act_and_mul_kernel<c10::BFloat16, &vllm::silu_kernel<c10::BFloat16>, (bool)1>(T1 *, cons…
|
||||
4.1 916,662,515 21,312 43,011.6 19,776.0 8,928 2,586,205 199,790.1 void cutlass::device_kernel<flash::enable_sm90_or_later<flash::FlashAttnFwdSm90<flash::CollectiveMa…
|
||||
2.6 587,283,113 37,824 15,526.7 3,008.0 2,719 2,517,756 139,091.1 std::enable_if<T2>(int)0&&vllm::_typeConvert<T1>::exists, void>::type vllm::fused_add_rms_norm_kern…
|
||||
1.9 418,362,605 18,912 22,121.5 3,871.0 3,328 2,523,870 175,248.2 void vllm::rotary_embedding_kernel<c10::BFloat16, (bool)1>(const long *, T1 *, T1 *, const T1 *, in…
|
||||
0.7 167,083,069 18,880 8,849.7 2,240.0 1,471 2,499,996 101,436.1 void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0…
|
||||
...
|
||||
```
|
||||
|
||||
GUI example:
|
||||
|
||||
<img width="1799" alt="Screenshot 2025-03-05 at 11 48 42 AM" src="https://github.com/user-attachments/assets/c7cff1ae-6d6f-477d-a342-bd13c4fc424c" />
|
||||
|
||||
## Profiling vLLM Python Code
|
||||
|
||||
The Python standard library includes
|
||||
[cProfile](https://docs.python.org/3/library/profile.html) for profiling Python
|
||||
code. vLLM includes a couple of helpers that make it easy to apply it to a section of vLLM.
|
||||
Both the `vllm.utils.cprofile` and `vllm.utils.cprofile_context` functions can be
|
||||
used to profile a section of code.
|
||||
|
||||
### Example usage - decorator
|
||||
|
||||
The first helper is a Python decorator that can be used to profile a function.
|
||||
If a filename is specified, the profile will be saved to that file. If no filename is
|
||||
specified, profile data will be printed to stdout.
|
||||
|
||||
```python
|
||||
import vllm.utils
|
||||
|
||||
@vllm.utils.cprofile("expensive_function.prof")
|
||||
def expensive_function():
|
||||
# some expensive code
|
||||
pass
|
||||
```
|
||||
|
||||
### Example Usage - context manager
|
||||
|
||||
The second helper is a context manager that can be used to profile a block of
|
||||
code. Similar to the decorator, the filename is optional.
|
||||
|
||||
```python
|
||||
import vllm.utils
|
||||
|
||||
def another_function():
|
||||
# more expensive code
|
||||
pass
|
||||
|
||||
with vllm.utils.cprofile_context("another_function.prof"):
|
||||
another_function()
|
||||
```
|
||||
|
||||
### Analyzing Profile Results
|
||||
|
||||
There are multiple tools available that can help analyze the profile results.
|
||||
One example is [snakeviz](https://jiffyclub.github.io/snakeviz/).
|
||||
|
||||
```bash
|
||||
pip install snakeviz
|
||||
snakeviz expensive_function.prof
|
||||
```
|
||||
@@ -1,60 +0,0 @@
|
||||
# Vulnerability Management
|
||||
|
||||
## Reporting Vulnerabilities
|
||||
|
||||
As mentioned in the [security
|
||||
policy](https://github.com/vllm-project/vllm/tree/main/SECURITY.md), security
|
||||
vulnerabilities may be reported privately to the project via
|
||||
[GitHub](https://github.com/vllm-project/vllm/security/advisories/new).
|
||||
|
||||
## Vulnerability Management Team
|
||||
|
||||
Once a vulnerability has been reported to the project, the Vulnerability
|
||||
Management Team (VMT) is responsible for managing the vulnerability. The VMT is
|
||||
responsible for:
|
||||
|
||||
- Triaging the vulnerability.
|
||||
- Coordinating with reporters and project maintainers on vulnerability analysis
|
||||
and resolution.
|
||||
- Drafting of security advisories for confirmed vulnerabilities, as appropriate.
|
||||
- Coordination with project maintainers on a coordinated release of the fix and
|
||||
security advisory.
|
||||
|
||||
### Security Advisories
|
||||
|
||||
Advisories are published via GitHub through the same system used to report
|
||||
vulnerabilities. More information on the process can be found in the [GitHub
|
||||
documentation](https://docs.github.com/en/code-security/security-advisories/working-with-repository-security-advisories/about-repository-security-advisories).
|
||||
|
||||
### Team Members
|
||||
|
||||
We prefer to keep all vulnerability-related communication on the security report
|
||||
on GitHub. However, if you need to contact the VMT directly for an urgent issue,
|
||||
you may contact the following individuals:
|
||||
|
||||
- Simon Mo - simon.mo@hey.com
|
||||
- Russell Bryant - rbryant@redhat.com
|
||||
|
||||
## Slack Discussion
|
||||
|
||||
You may use the `#security` channel in the [vLLM Slack](https://slack.vllm.ai)
|
||||
to discuss security-related topics. However, please do not disclose any
|
||||
vulnerabilities in this channel. If you need to report a vulnerability, please
|
||||
use the GitHub security advisory system or contact a VMT member privately.
|
||||
|
||||
## Vulnerability Disclosure
|
||||
|
||||
The process for disclosing vulnerabilities is the following:
|
||||
|
||||
- The VMT will work with the project maintainers to develop a fix for the
|
||||
vulnerability.
|
||||
- The VMT will coordinate with the reporter and project maintainers to prepare a
|
||||
security advisory that adequately describes the vulnerability and its impact.
|
||||
- The VMT will coordinate with the project maintainers to publish a fix and
|
||||
release an update that includes that fix.
|
||||
- The VMT will publish the security advisory on GitHub. Release notes will be
|
||||
updated to include a reference to the security advisory.
|
||||
|
||||
The VMT and project maintainers will work to minimize the amount of time in
|
||||
between disclosing any public information about the vulnerability and making a
|
||||
release and advisory available.
|
||||
@@ -1,133 +0,0 @@
|
||||
(deployment-docker)=
|
||||
|
||||
# Using Docker
|
||||
|
||||
(deployment-docker-pre-built-image)=
|
||||
|
||||
## Use vLLM's Official Docker Image
|
||||
|
||||
vLLM offers an official Docker image for deployment.
|
||||
The image can be used to run OpenAI compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags).
|
||||
|
||||
```console
|
||||
$ docker run --runtime nvidia --gpus all \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \
|
||||
-p 8000:8000 \
|
||||
--ipc=host \
|
||||
vllm/vllm-openai:latest \
|
||||
--model mistralai/Mistral-7B-v0.1
|
||||
```
|
||||
|
||||
This image can also be used with other container engines such as [Podman](https://podman.io/).
|
||||
|
||||
```console
|
||||
$ podman run --gpus all \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
|
||||
-p 8000:8000 \
|
||||
--ipc=host \
|
||||
vllm/vllm-openai:latest \
|
||||
--model mistralai/Mistral-7B-v0.1
|
||||
```
|
||||
|
||||
You can add any other <project:#engine-args> you need after the image tag (`vllm/vllm-openai:latest`).
|
||||
|
||||
:::{note}
|
||||
You can use either the `--ipc=host` flag or the `--shm-size` flag to allow the
container to access the host's shared memory. vLLM uses PyTorch, which uses shared
memory to share data between processes under the hood, particularly for tensor parallel inference. An example using `--shm-size` is shown after this note.
|
||||
:::
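For example, if you prefer not to share the host IPC namespace, here is a sketch of the same command with an explicit shared-memory size (the `8g` value is an arbitrary example; size it to your workload):

```console
$ docker run --runtime nvidia --gpus all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
    -p 8000:8000 \
    --shm-size=8g \
    vllm/vllm-openai:latest \
    --model mistralai/Mistral-7B-v0.1
```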
|
||||
|
||||
:::{note}
|
||||
Optional dependencies are not included in order to avoid licensing issues (e.g. <gh-issue:8030>).
|
||||
|
||||
If you need to use those dependencies (having accepted the license terms),
|
||||
create a custom Dockerfile on top of the base image with an extra layer that installs them:
|
||||
|
||||
```Dockerfile
|
||||
FROM vllm/vllm-openai:v0.8.3
|
||||
|
||||
# e.g. install the `audio` optional dependencies
|
||||
# NOTE: Make sure the version of vLLM matches the base image!
|
||||
RUN uv pip install --system vllm[audio]==0.8.3
|
||||
```
|
||||
|
||||
:::
|
||||
|
||||
:::{tip}
|
||||
Some new models may only be available on the main branch of [HF Transformers](https://github.com/huggingface/transformers).
|
||||
|
||||
To use the development version of `transformers`, create a custom Dockerfile on top of the base image
|
||||
with an extra layer that installs their code from source:
|
||||
|
||||
```Dockerfile
|
||||
FROM vllm/vllm-openai:latest
|
||||
|
||||
RUN uv pip install --system git+https://github.com/huggingface/transformers.git
|
||||
```
|
||||
|
||||
:::
|
||||
|
||||
(deployment-docker-build-image-from-source)=
|
||||
|
||||
## Building vLLM's Docker Image from Source
|
||||
|
||||
You can build and run vLLM from source via the provided <gh-file:docker/Dockerfile>. To build vLLM:
|
||||
|
||||
```console
|
||||
# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
|
||||
DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai --file docker/Dockerfile
|
||||
```
|
||||
|
||||
:::{note}
|
||||
By default vLLM will build for all GPU types for widest distribution. If you are only building for the
current GPU type of the machine it is running on, you can add the argument `--build-arg torch_cuda_arch_list=""`
for vLLM to find the current GPU type and build for that (see the example after this note).
|
||||
|
||||
If you are using Podman instead of Docker, you might need to disable SELinux labeling by
|
||||
adding `--security-opt label=disable` when running `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184).
|
||||
:::
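For example, a build restricted to the GPU architecture of the current machine might look like this (the same command as above, with the extra build argument):

```console
$ DOCKER_BUILDKIT=1 docker build . \
    --target vllm-openai \
    --tag vllm/vllm-openai \
    --file docker/Dockerfile \
    --build-arg torch_cuda_arch_list=""
```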
|
||||
|
||||
## Building for Arm64/aarch64
|
||||
|
||||
A docker container can be built for aarch64 systems such as the NVIDIA Grace-Hopper. At the time of this writing, this requires the use
of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64.
|
||||
|
||||
:::{note}
|
||||
Multiple modules must be compiled, so this process can take a while. We recommend using the `--build-arg max_jobs=` and `--build-arg nvcc_threads=`
flags to speed up the build process. However, ensure that `max_jobs` is substantially larger than `nvcc_threads` to get the most benefit.
Keep an eye on memory usage with parallel jobs, as it can be substantial (see the example below).
|
||||
:::
|
||||
|
||||
```console
|
||||
# Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
|
||||
$ python3 use_existing_torch.py
|
||||
$ DOCKER_BUILDKIT=1 docker build . \
|
||||
--file docker/Dockerfile \
|
||||
--target vllm-openai \
|
||||
--platform "linux/arm64" \
|
||||
-t vllm/vllm-gh200-openai:latest \
|
||||
--build-arg max_jobs=66 \
|
||||
--build-arg nvcc_threads=2 \
|
||||
--build-arg torch_cuda_arch_list="9.0+PTX" \
|
||||
--build-arg vllm_fa_cmake_gpu_arches="90-real"
|
||||
```
|
||||
|
||||
## Use the custom-built vLLM Docker image
|
||||
|
||||
To run vLLM with the custom-built Docker image:
|
||||
|
||||
```console
|
||||
$ docker run --runtime nvidia --gpus all \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
-p 8000:8000 \
|
||||
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \
|
||||
vllm/vllm-openai <args...>
|
||||
```
|
||||
|
||||
The argument `vllm/vllm-openai` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command).
|
||||
|
||||
:::{note}
|
||||
**For version 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` .
|
||||
:::
|
||||
@@ -1,47 +0,0 @@
|
||||
(deployment-anything-llm)=
|
||||
|
||||
# Anything LLM
|
||||
|
||||
[Anything LLM](https://github.com/Mintplex-Labs/anything-llm) is a full-stack application that enables you to turn any document, resource, or piece of content into context that any LLM can use as references during chatting.
|
||||
|
||||
It allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Setup vLLM environment
|
||||
|
||||
## Deploy
|
||||
|
||||
- Start the vLLM server with the supported chat completion model, e.g.
|
||||
|
||||
```console
|
||||
vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096
|
||||
```
|
||||
|
||||
- Download and install [Anything LLM desktop](https://anythingllm.com/desktop).
|
||||
|
||||
- On the bottom left of open settings, AI Prooviders --> LLM:
|
||||
- LLM Provider: Generic OpenAI
|
||||
- Base URL: http://{vllm server host}:{vllm server port}/v1
|
||||
- Chat Model Name: `Qwen/Qwen1.5-32B-Chat-AWQ`
|
||||
|
||||
:::{image} /assets/deployment/anything-llm-provider.png
|
||||
:::
|
||||
|
||||
- Back to home page, New Workspace --> create `vllm` workspace, and start to chat:
|
||||
|
||||
:::{image} /assets/deployment/anything-llm-chat-without-doc.png
|
||||
:::
|
||||
|
||||
- Click the upload button:
|
||||
- upload the doc
|
||||
- select the doc and move to the workspace
|
||||
- save and embed
|
||||
|
||||
:::{image} /assets/deployment/anything-llm-upload-doc.png
|
||||
:::
|
||||
|
||||
- Chat again:
|
||||
|
||||
:::{image} /assets/deployment/anything-llm-chat-with-doc.png
|
||||
:::
|
||||
@@ -1,7 +0,0 @@
|
||||
(deployment-bentoml)=
|
||||
|
||||
# BentoML
|
||||
|
||||
[BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes.
|
||||
|
||||
For details, see the tutorial [vLLM inference in the BentoML documentation](https://docs.bentoml.com/en/latest/use-cases/large-language-models/vllm.html).
|
||||
@@ -1,109 +0,0 @@
|
||||
(deployment-cerebrium)=
|
||||
|
||||
# Cerebrium
|
||||
|
||||
:::{raw} html
|
||||
<p align="center">
|
||||
<img src="https://i.ibb.co/hHcScTT/Screenshot-2024-06-13-at-10-14-54.png" alt="vLLM_plus_cerebrium"/>
|
||||
</p>
|
||||
:::
|
||||
|
||||
vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebrium.ai/), a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI based applications.
|
||||
|
||||
To install the Cerebrium client, run:
|
||||
|
||||
```console
|
||||
pip install cerebrium
|
||||
cerebrium login
|
||||
```
|
||||
|
||||
Next, to create your Cerebrium project, run:
|
||||
|
||||
```console
|
||||
cerebrium init vllm-project
|
||||
```
|
||||
|
||||
Next, to install the required packages, add the following to your cerebrium.toml:
|
||||
|
||||
```toml
|
||||
[cerebrium.deployment]
|
||||
docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
|
||||
|
||||
[cerebrium.dependencies.pip]
|
||||
vllm = "latest"
|
||||
```
|
||||
|
||||
Next, add the code that handles inference for the LLM of your choice (`mistralai/Mistral-7B-Instruct-v0.1` for this example) to your `main.py`:
|
||||
|
||||
```python
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1")
|
||||
|
||||
def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95):
|
||||
|
||||
sampling_params = SamplingParams(temperature=temperature, top_p=top_p)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
# Print the outputs.
|
||||
results = []
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
results.append({"prompt": prompt, "generated_text": generated_text})
|
||||
|
||||
return {"results": results}
|
||||
```
|
||||
|
||||
Then, run the following code to deploy it to the cloud:
|
||||
|
||||
```console
|
||||
cerebrium deploy
|
||||
```
|
||||
|
||||
If the deployment is successful, you will be returned a curl command that you can use to call inference. Just remember to end the URL with the name of the function you are calling (in our case, `/run`).
|
||||
|
||||
```bash
|
||||
curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: <JWT TOKEN>' \
|
||||
--data '{
|
||||
"prompts": [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is"
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
You should get a response like:
|
||||
|
||||
```json
|
||||
{
|
||||
"run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
|
||||
"result": {
|
||||
"result": [
|
||||
{
|
||||
"prompt": "Hello, my name is",
|
||||
"generated_text": " Sarah, and I'm a teacher. I teach elementary school students. One of"
|
||||
},
|
||||
{
|
||||
"prompt": "The president of the United States is",
|
||||
"generated_text": " elected every four years. This is a democratic system.\n\n5. What"
|
||||
},
|
||||
{
|
||||
"prompt": "The capital of France is",
|
||||
"generated_text": " Paris.\n"
|
||||
},
|
||||
{
|
||||
"prompt": "The future of AI is",
|
||||
"generated_text": " bright, but it's important to approach it with a balanced and nuanced perspective."
|
||||
}
|
||||
]
|
||||
},
|
||||
"run_time_ms": 152.53663063049316
|
||||
}
|
||||
```
|
||||
|
||||
You now have an autoscaling endpoint where you only pay for the compute you use!
|
||||
@@ -1,36 +0,0 @@
|
||||
(deployment-chatbox)=
|
||||
|
||||
# Chatbox
|
||||
|
||||
[Chatbox](https://github.com/chatboxai/chatbox) is a desktop client for LLMs, available on Windows, Mac, Linux.
|
||||
|
||||
It allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Setup vLLM environment
|
||||
|
||||
## Deploy
|
||||
|
||||
- Start the vLLM server with the supported chat completion model, e.g.
|
||||
|
||||
```console
|
||||
vllm serve qwen/Qwen1.5-0.5B-Chat
|
||||
```
|
||||
|
||||
- Download and install [Chatbox desktop](https://chatboxai.app/en#download).
|
||||
|
||||
- In the bottom left of the settings page, click `Add Custom Provider`:
|
||||
- API Mode: `OpenAI API Compatible`
|
||||
- Name: vllm
|
||||
- API Host: `http://{vllm server host}:{vllm server port}/v1`
|
||||
- API Path: `/chat/completions`
|
||||
- Model: `qwen/Qwen1.5-0.5B-Chat`
|
||||
|
||||
:::{image} /assets/deployment/chatbox-settings.png
|
||||
:::
|
||||
|
||||
- Go to `Just chat`, and start to chat:
|
||||
|
||||
:::{image} /assets/deployment/chatbox-chat.png
|
||||
:::
|
||||
@@ -1,56 +0,0 @@
|
||||
(deployment-dify)=
|
||||
|
||||
# Dify
|
||||
|
||||
[Dify](https://github.com/langgenius/dify) is an open-source LLM app development platform. Its intuitive interface combines agentic AI workflow, RAG pipeline, agent capabilities, model management, observability features, and more, allowing you to quickly move from prototype to production.
|
||||
|
||||
It supports vLLM as a model provider to efficiently serve large language models.
|
||||
|
||||
This guide walks you through deploying Dify using a vLLM backend.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Set up the vLLM environment
|
||||
- Install [Docker](https://docs.docker.com/engine/install/) and [Docker Compose](https://docs.docker.com/compose/install/)
|
||||
|
||||
## Deploy
|
||||
|
||||
- Start the vLLM server with a supported chat completion model, e.g.
|
||||
|
||||
```console
|
||||
vllm serve Qwen/Qwen1.5-7B-Chat
|
||||
```
|
||||
|
||||
- Start the Dify server with docker compose ([details](https://github.com/langgenius/dify?tab=readme-ov-file#quick-start)):
|
||||
|
||||
```console
|
||||
git clone https://github.com/langgenius/dify.git
|
||||
cd dify
|
||||
cd docker
|
||||
cp .env.example .env
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
- Open `http://localhost/install` in your browser, configure the basic login information, and log in.
|
||||
|
||||
- In the top-right user menu (under the profile icon), go to Settings, then click `Model Provider`, and locate the `vLLM` provider to install it.
|
||||
|
||||
- Fill in the model provider details as follows:
|
||||
- **Model Type**: `LLM`
|
||||
- **Model Name**: `Qwen/Qwen1.5-7B-Chat`
|
||||
- **API Endpoint URL**: `http://{vllm_server_host}:{vllm_server_port}/v1`
|
||||
- **Model Name for API Endpoint**: `Qwen/Qwen1.5-7B-Chat`
|
||||
- **Completion Mode**: `Completion`
|
||||
|
||||
:::{image} /assets/deployment/dify-settings.png
|
||||
:::
|
||||
|
||||
- To create a test chatbot, go to `Studio → Chatbot → Create from Blank`, then select Chatbot as the type:
|
||||
|
||||
:::{image} /assets/deployment/dify-create-chatbot.png
|
||||
:::
|
||||
|
||||
- Click the chatbot you just created to open the chat interface and start interacting with the model:
|
||||
|
||||
:::{image} /assets/deployment/dify-chat.png
|
||||
:::
|
||||
@@ -1,102 +0,0 @@
|
||||
(deployment-dstack)=
|
||||
|
||||
# dstack
|
||||
|
||||
:::{raw} html
|
||||
<p align="center">
|
||||
<img src="https://i.ibb.co/71kx6hW/vllm-dstack.png" alt="vLLM_plus_dstack"/>
|
||||
</p>
|
||||
:::
|
||||
|
||||
vLLM can be run on a cloud-based GPU machine with [dstack](https://dstack.ai/), an open-source framework for running LLMs on any cloud. This tutorial assumes that you have already configured credentials, gateway, and GPU quotas on your cloud environment.
|
||||
|
||||
To install the dstack client and start the dstack server, run:
|
||||
|
||||
```console
|
||||
pip install "dstack[all]
|
||||
dstack server
|
||||
```
|
||||
|
||||
Next, to configure your dstack project, run:
|
||||
|
||||
```console
|
||||
mkdir -p vllm-dstack
|
||||
cd vllm-dstack
|
||||
dstack init
|
||||
```
|
||||
|
||||
Next, to provision a VM instance with the LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`:
|
||||
|
||||
```yaml
|
||||
type: service
|
||||
|
||||
python: "3.11"
|
||||
env:
|
||||
- MODEL=NousResearch/Llama-2-7b-chat-hf
|
||||
port: 8000
|
||||
resources:
|
||||
gpu: 24GB
|
||||
commands:
|
||||
- pip install vllm
|
||||
- vllm serve $MODEL --port 8000
|
||||
model:
|
||||
format: openai
|
||||
type: chat
|
||||
name: NousResearch/Llama-2-7b-chat-hf
|
||||
```
|
||||
|
||||
Then, run the following CLI for provisioning:
|
||||
|
||||
```console
|
||||
$ dstack run . -f serve.dstack.yml
|
||||
|
||||
⠸ Getting run plan...
|
||||
Configuration serve.dstack.yml
|
||||
Project deep-diver-main
|
||||
User deep-diver
|
||||
Min resources 2..xCPU, 8GB.., 1xGPU (24GB)
|
||||
Max price -
|
||||
Max duration -
|
||||
Spot policy auto
|
||||
Retry policy no
|
||||
|
||||
# BACKEND REGION INSTANCE RESOURCES SPOT PRICE
|
||||
1 gcp us-central1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804
|
||||
2 gcp us-east1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804
|
||||
3 gcp us-west1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804
|
||||
...
|
||||
Shown 3 of 193 offers, $5.876 max
|
||||
|
||||
Continue? [y/n]: y
|
||||
⠙ Submitting run...
|
||||
⠏ Launching spicy-treefrog-1 (pulling)
|
||||
spicy-treefrog-1 provisioning completed (running)
|
||||
Service is published at ...
|
||||
```
|
||||
|
||||
After the provisioning, you can interact with the model by using the OpenAI SDK:
|
||||
|
||||
```python
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI(
|
||||
base_url="https://gateway.<gateway domain>",
|
||||
api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>"
|
||||
)
|
||||
|
||||
completion = client.chat.completions.create(
|
||||
model="NousResearch/Llama-2-7b-chat-hf",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Compose a poem that explains the concept of recursion in programming.",
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
print(completion.choices[0].message.content)
|
||||
```
|
||||
|
||||
:::{note}
|
||||
dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision a dstack `Task` instead of a `Service`. The `Task` is for development purposes only. For more hands-on material on how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm).
|
||||
:::
|
||||
@@ -1,250 +0,0 @@
|
||||
(deployment-helm)=
|
||||
|
||||
# Helm
|
||||
|
||||
A Helm chart to deploy vLLM on Kubernetes.
|
||||
|
||||
Helm is a package manager for Kubernetes. It helps you deploy vLLM on Kubernetes and automate the deployment of vLLM Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variable values.
|
||||
|
||||
This guide walks you through the process of deploying vLLM with Helm, including the necessary prerequisites, the steps for installing the chart, and documentation on the architecture and the values file.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Before you begin, ensure that you have the following:
|
||||
|
||||
- A running Kubernetes cluster
|
||||
- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at [https://github.com/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin)
|
||||
- Available GPU resources in your cluster
|
||||
- An S3 bucket containing the model that will be deployed
|
||||
|
||||
## Installing the chart
|
||||
|
||||
To install the chart with the release name `test-vllm`:
|
||||
|
||||
```console
|
||||
helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f values.yaml --set secrets.s3endpoint=$ACCESS_POINT --set secrets.s3bucketname=$BUCKET --set secrets.s3accesskeyid=$ACCESS_KEY --set secrets.s3accesskey=$SECRET_KEY
|
||||
```
|
||||
|
||||
## Uninstalling the Chart
|
||||
|
||||
To uninstall the `test-vllm` deployment:
|
||||
|
||||
```console
|
||||
helm uninstall test-vllm --namespace=ns-vllm
|
||||
```
|
||||
|
||||
The command removes all the Kubernetes components associated with the
|
||||
chart **including persistent volumes** and deletes the release.
|
||||
|
||||
## Architecture
|
||||
|
||||
:::{image} /assets/deployment/architecture_helm_deployment.png
|
||||
:::
|
||||
|
||||
## Values
|
||||
|
||||
:::{list-table}
|
||||
:widths: 25 25 25 25
|
||||
:header-rows: 1
|
||||
|
||||
- * Key
|
||||
* Type
|
||||
* Default
|
||||
* Description
|
||||
- * autoscaling
|
||||
* object
|
||||
* {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80}
|
||||
* Autoscaling configuration
|
||||
- * autoscaling.enabled
|
||||
* bool
|
||||
* false
|
||||
* Enable autoscaling
|
||||
- * autoscaling.maxReplicas
|
||||
* int
|
||||
* 100
|
||||
* Maximum replicas
|
||||
- * autoscaling.minReplicas
|
||||
* int
|
||||
* 1
|
||||
* Minimum replicas
|
||||
- * autoscaling.targetCPUUtilizationPercentage
|
||||
* int
|
||||
* 80
|
||||
* Target CPU utilization for autoscaling
|
||||
- * configs
|
||||
* object
|
||||
* {}
|
||||
* Configmap
|
||||
- * containerPort
|
||||
* int
|
||||
* 8000
|
||||
* Container port
|
||||
- * customObjects
|
||||
* list
|
||||
* []
|
||||
* Custom Objects configuration
|
||||
- * deploymentStrategy
|
||||
* object
|
||||
* {}
|
||||
* Deployment strategy configuration
|
||||
- * externalConfigs
|
||||
* list
|
||||
* []
|
||||
* External configuration
|
||||
- * extraContainers
|
||||
* list
|
||||
* []
|
||||
* Additional containers configuration
|
||||
- * extraInit
|
||||
* object
|
||||
* {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true}
|
||||
* Additional configuration for the init container
|
||||
- * extraInit.pvcStorage
|
||||
* string
|
||||
* "50Gi"
|
||||
  * Storage size of the PVC used for the model downloaded from S3
|
||||
- * extraInit.s3modelpath
|
||||
* string
|
||||
* "relative_s3_model_path/opt-125m"
|
||||
  * Path of the model on S3, which hosts the model weights and config files
|
||||
- * extraInit.awsEc2MetadataDisabled
|
||||
* boolean
|
||||
* true
|
||||
* Disables the use of the Amazon EC2 instance metadata service
|
||||
- * extraPorts
|
||||
* list
|
||||
* []
|
||||
* Additional ports configuration
|
||||
- * gpuModels
|
||||
* list
|
||||
* ["TYPE_GPU_USED"]
|
||||
* Type of gpu used
|
||||
- * image
|
||||
* object
|
||||
* {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"}
|
||||
* Image configuration
|
||||
- * image.command
|
||||
* list
|
||||
* ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"]
|
||||
* Container launch command
|
||||
- * image.repository
|
||||
* string
|
||||
* "vllm/vllm-openai"
|
||||
* Image repository
|
||||
- * image.tag
|
||||
* string
|
||||
* "latest"
|
||||
* Image tag
|
||||
- * livenessProbe
|
||||
* object
|
||||
* {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10}
|
||||
* Liveness probe configuration
|
||||
- * livenessProbe.failureThreshold
|
||||
* int
|
||||
* 3
|
||||
* Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive
|
||||
- * livenessProbe.httpGet
|
||||
* object
|
||||
* {"path":"/health","port":8000}
|
||||
* Configuration of the Kubelet http request on the server
|
||||
- * livenessProbe.httpGet.path
|
||||
* string
|
||||
* "/health"
|
||||
* Path to access on the HTTP server
|
||||
- * livenessProbe.httpGet.port
|
||||
* int
|
||||
* 8000
|
||||
* Name or number of the port to access on the container, on which the server is listening
|
||||
- * livenessProbe.initialDelaySeconds
|
||||
* int
|
||||
* 15
|
||||
* Number of seconds after the container has started before liveness probe is initiated
|
||||
- * livenessProbe.periodSeconds
|
||||
* int
|
||||
* 10
|
||||
* How often (in seconds) to perform the liveness probe
|
||||
- * maxUnavailablePodDisruptionBudget
|
||||
* string
|
||||
* ""
|
||||
* Disruption Budget Configuration
|
||||
- * readinessProbe
|
||||
* object
|
||||
* {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5}
|
||||
* Readiness probe configuration
|
||||
- * readinessProbe.failureThreshold
|
||||
* int
|
||||
* 3
|
||||
* Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready
|
||||
- * readinessProbe.httpGet
|
||||
* object
|
||||
* {"path":"/health","port":8000}
|
||||
* Configuration of the Kubelet http request on the server
|
||||
- * readinessProbe.httpGet.path
|
||||
* string
|
||||
* "/health"
|
||||
* Path to access on the HTTP server
|
||||
- * readinessProbe.httpGet.port
|
||||
* int
|
||||
* 8000
|
||||
* Name or number of the port to access on the container, on which the server is listening
|
||||
- * readinessProbe.initialDelaySeconds
|
||||
* int
|
||||
* 5
|
||||
* Number of seconds after the container has started before readiness probe is initiated
|
||||
- * readinessProbe.periodSeconds
|
||||
* int
|
||||
* 5
|
||||
* How often (in seconds) to perform the readiness probe
|
||||
- * replicaCount
|
||||
* int
|
||||
* 1
|
||||
* Number of replicas
|
||||
- * resources
|
||||
* object
|
||||
* {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}}
|
||||
* Resource configuration
|
||||
- * resources.limits."nvidia.com/gpu"
|
||||
* int
|
||||
* 1
|
||||
* Number of gpus used
|
||||
- * resources.limits.cpu
|
||||
* int
|
||||
* 4
|
||||
* Number of CPUs
|
||||
- * resources.limits.memory
|
||||
* string
|
||||
* "16Gi"
|
||||
* CPU memory configuration
|
||||
- * resources.requests."nvidia.com/gpu"
|
||||
* int
|
||||
* 1
|
||||
* Number of gpus used
|
||||
- * resources.requests.cpu
|
||||
* int
|
||||
* 4
|
||||
* Number of CPUs
|
||||
- * resources.requests.memory
|
||||
* string
|
||||
* "16Gi"
|
||||
* CPU memory configuration
|
||||
- * secrets
|
||||
* object
|
||||
* {}
|
||||
* Secrets configuration
|
||||
- * serviceName
|
||||
* string
|
||||
*
|
||||
* Service name
|
||||
- * servicePort
|
||||
* int
|
||||
* 80
|
||||
* Service port
|
||||
- * labels.environment
|
||||
* string
|
||||
* test
|
||||
* Environment name
|
||||
- * labels.release
|
||||
* string
|
||||
* test
|
||||
* Release name
|
||||
:::
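To make the table concrete, below is a sketch of a `values.yaml` override that touches a few of the keys documented above. The model path, image command, and resource sizes are illustrative placeholders taken from the defaults in the table, not recommendations; adjust them to your model and cluster, and do not assume keys beyond those listed above exist.

```yaml
# Example values override (a sketch; keys correspond to the table above).
replicaCount: 1

image:
  repository: "vllm/vllm-openai"
  tag: "latest"
  command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--host", "0.0.0.0", "--port", "8000"]

containerPort: 8000
servicePort: 80

resources:
  requests:
    cpu: 4
    memory: "16Gi"
    nvidia.com/gpu: 1
  limits:
    cpu: 4
    memory: "16Gi"
    nvidia.com/gpu: 1

extraInit:
  pvcStorage: "50Gi"
  s3modelpath: "relative_s3_model_path/opt-125m"
  awsEc2MetadataDisabled: true

autoscaling:
  enabled: false
```

You would pass this file to the `helm upgrade --install ... -f values.yaml` command shown in the installation section above.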
|
||||
@@ -1,22 +0,0 @@
|
||||
# Using other frameworks
|
||||
|
||||
:::{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
anything-llm
|
||||
bentoml
|
||||
cerebrium
|
||||
chatbox
|
||||
dify
|
||||
dstack
|
||||
helm
|
||||
litellm
|
||||
lobe-chat
|
||||
lws
|
||||
modal
|
||||
open-webui
|
||||
retrieval_augmented_generation
|
||||
skypilot
|
||||
streamlit
|
||||
triton
|
||||
:::
|
||||
@@ -1,75 +0,0 @@
|
||||
(deployment-litellm)=
|
||||
|
||||
# LiteLLM
|
||||
|
||||
[LiteLLM](https://github.com/BerriAI/litellm) lets you call all LLM APIs using the OpenAI format (Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, Groq, etc.).
|
||||
|
||||
LiteLLM manages:
|
||||
|
||||
- Translate inputs to provider's `completion`, `embedding`, and `image_generation` endpoints
|
||||
- [Consistent output](https://docs.litellm.ai/docs/completion/output), text responses will always be available at `['choices'][0]['message']['content']`
|
||||
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
|
||||
- Set Budgets & Rate limits per project, api key, model [LiteLLM Proxy Server (LLM Gateway)](https://docs.litellm.ai/docs/simple_proxy)
|
||||
|
||||
LiteLLM supports all models served by vLLM.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Set up the vLLM and LiteLLM environment
|
||||
|
||||
```console
|
||||
pip install vllm litellm
|
||||
```
|
||||
|
||||
## Deploy
|
||||
|
||||
### Chat completion
|
||||
|
||||
- Start the vLLM server with a supported chat completion model, e.g.
|
||||
|
||||
```console
|
||||
vllm serve qwen/Qwen1.5-0.5B-Chat
|
||||
```
|
||||
|
||||
- Call it with litellm:
|
||||
|
||||
```python
|
||||
import litellm
|
||||
|
||||
messages = [{ "content": "Hello, how are you?","role": "user"}]
|
||||
|
||||
# The "hosted_vllm/" prefix is required so LiteLLM routes the request to a vLLM server
|
||||
response = litellm.completion(
|
||||
model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
|
||||
messages=messages,
|
||||
api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
|
||||
temperature=0.2,
|
||||
max_tokens=80)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
### Embeddings
|
||||
|
||||
- Start the vLLM server with a supported embedding model, e.g.
|
||||
|
||||
```console
|
||||
vllm serve BAAI/bge-base-en-v1.5
|
||||
```
|
||||
|
||||
- Call it with litellm:
|
||||
|
||||
```python
|
||||
from litellm import embedding
|
||||
import os
|
||||
|
||||
os.environ["HOSTED_VLLM_API_BASE"] = "http://{your-vllm-server-host}:{your-vllm-server-port}/v1"
|
||||
|
||||
# The "hosted_vllm/" prefix is required so LiteLLM routes the request to a vLLM server
|
||||
# pass the vllm model name
|
||||
response = embedding(model="hosted_vllm/BAAI/bge-base-en-v1.5", input=["Hello world"])
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
For details, see the tutorial [Using vLLM in LiteLLM](https://docs.litellm.ai/docs/providers/vllm).
|
||||
@@ -1,13 +0,0 @@
|
||||
(deployment-lobe-chat)=
|
||||
|
||||
# Lobe Chat
|
||||
|
||||
[Lobe Chat](https://github.com/lobehub/lobe-chat) is an open-source, modern-design ChatGPT/LLMs UI/Framework.
|
||||
|
||||
It supports speech synthesis, multi-modal interaction, and an extensible (function call) plugin system.
|
||||
|
||||
It offers one-click free deployment of your private OpenAI/ChatGPT/Claude/Gemini/Groq/Ollama chat application.
|
||||
|
||||
It supports vLLM as an AI model provider to efficiently serve large language models.
|
||||
|
||||
For details, see the tutorial [Using vLLM in LobeChat](https://lobehub.com/docs/usage/providers/vllm).
|
||||
@@ -1,198 +0,0 @@
|
||||
(deployment-lws)=
|
||||
|
||||
# LWS
|
||||
|
||||
LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads.
|
||||
A major use case is for multi-host/multi-node distributed inference.
|
||||
|
||||
vLLM can be deployed with [LWS](https://github.com/kubernetes-sigs/lws) on Kubernetes for distributed model serving.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
* At least two Kubernetes nodes, each with 8 GPUs, are required.
|
||||
* Install LWS by following the instructions found [here](https://lws.sigs.k8s.io/docs/installation/).
|
||||
|
||||
## Deploy and Serve
|
||||
|
||||
Deploy the following YAML file as `lws.yaml`:
|
||||
|
||||
```yaml
|
||||
apiVersion: leaderworkerset.x-k8s.io/v1
|
||||
kind: LeaderWorkerSet
|
||||
metadata:
|
||||
name: vllm
|
||||
spec:
|
||||
replicas: 2
|
||||
leaderWorkerTemplate:
|
||||
size: 2
|
||||
restartPolicy: RecreateGroupOnPodRestart
|
||||
leaderTemplate:
|
||||
metadata:
|
||||
labels:
|
||||
role: leader
|
||||
spec:
|
||||
containers:
|
||||
- name: vllm-leader
|
||||
image: docker.io/vllm/vllm-openai:latest
|
||||
env:
|
||||
- name: HUGGING_FACE_HUB_TOKEN
|
||||
value: <your-hf-token>
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
|
||||
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
memory: 1124Gi
|
||||
ephemeral-storage: 800Gi
|
||||
requests:
|
||||
ephemeral-storage: 800Gi
|
||||
cpu: 125
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
readinessProbe:
|
||||
tcpSocket:
|
||||
port: 8080
|
||||
initialDelaySeconds: 15
|
||||
periodSeconds: 10
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
sizeLimit: 15Gi
|
||||
workerTemplate:
|
||||
spec:
|
||||
containers:
|
||||
- name: vllm-worker
|
||||
image: docker.io/vllm/vllm-openai:latest
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
memory: 1124Gi
|
||||
ephemeral-storage: 800Gi
|
||||
requests:
|
||||
ephemeral-storage: 800Gi
|
||||
cpu: 125
|
||||
env:
|
||||
- name: HUGGING_FACE_HUB_TOKEN
|
||||
value: <your-hf-token>
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
sizeLimit: 15Gi
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: vllm-leader
|
||||
spec:
|
||||
ports:
|
||||
- name: http
|
||||
port: 8080
|
||||
protocol: TCP
|
||||
targetPort: 8080
|
||||
selector:
|
||||
leaderworkerset.sigs.k8s.io/name: vllm
|
||||
role: leader
|
||||
type: ClusterIP
|
||||
```
|
||||
|
||||
```bash
|
||||
kubectl apply -f lws.yaml
|
||||
```
|
||||
|
||||
Verify the status of the pods:
|
||||
|
||||
```bash
|
||||
kubectl get pods
|
||||
```
|
||||
|
||||
You should get an output similar to this:
|
||||
|
||||
```bash
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
vllm-0 1/1 Running 0 2s
|
||||
vllm-0-1 1/1 Running 0 2s
|
||||
vllm-1 1/1 Running 0 2s
|
||||
vllm-1-1 1/1 Running 0 2s
|
||||
```
|
||||
|
||||
Verify that the distributed tensor-parallel inference works:
|
||||
|
||||
```bash
|
||||
kubectl logs vllm-0 |grep -i "Loading model weights took"
|
||||
```
|
||||
|
||||
You should get something similar to this:
|
||||
|
||||
```text
|
||||
INFO 05-08 03:20:24 model_runner.py:173] Loading model weights took 0.1189 GB
|
||||
(RayWorkerWrapper pid=169, ip=10.20.0.197) INFO 05-08 03:20:28 model_runner.py:173] Loading model weights took 0.1189 GB
|
||||
```
|
||||
|
||||
## Access ClusterIP service
|
||||
|
||||
```bash
|
||||
# Listen on port 8080 locally, forwarding to the targetPort of the service's port 8080 in a pod selected by the service
|
||||
kubectl port-forward svc/vllm-leader 8080:8080
|
||||
```
|
||||
|
||||
The output should be similar to the following:
|
||||
|
||||
```text
|
||||
Forwarding from 127.0.0.1:8080 -> 8080
|
||||
Forwarding from [::1]:8080 -> 8080
|
||||
```
|
||||
|
||||
## Serve the model
|
||||
|
||||
Open another terminal and send a request:
|
||||
|
||||
```text
|
||||
curl http://localhost:8080/v1/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
|
||||
"prompt": "San Francisco is a",
|
||||
"max_tokens": 7,
|
||||
"temperature": 0
|
||||
}'
|
||||
```
|
||||
|
||||
The output should be similar to the following
|
||||
|
||||
```text
|
||||
{
|
||||
"id": "cmpl-1bb34faba88b43f9862cfbfb2200949d",
|
||||
"object": "text_completion",
|
||||
"created": 1715138766,
|
||||
"model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
|
||||
"choices": [
|
||||
{
|
||||
"index": 0,
|
||||
"text": " top destination for foodies, with",
|
||||
"logprobs": null,
|
||||
"finish_reason": "length",
|
||||
"stop_reason": null
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": 5,
|
||||
"total_tokens": 12,
|
||||
"completion_tokens": 7
|
||||
}
|
||||
}
|
||||
```
|
||||
@@ -1,7 +0,0 @@
|
||||
(deployment-modal)=
|
||||
|
||||
# Modal
|
||||
|
||||
vLLM can be run on cloud GPUs with [Modal](https://modal.com), a serverless computing platform designed for fast auto-scaling.
|
||||
|
||||
For details on how to deploy vLLM on Modal, see [this tutorial in the Modal documentation](https://modal.com/docs/examples/vllm_inference).
|
||||
@@ -1,29 +0,0 @@
|
||||
(deployment-open-webui)=
|
||||
|
||||
# Open WebUI
|
||||
|
||||
1. Install [Docker](https://docs.docker.com/engine/install/).
|
||||
|
||||
2. Start the vLLM server with a supported chat completion model, e.g.
|
||||
|
||||
```console
|
||||
vllm serve qwen/Qwen1.5-0.5B-Chat
|
||||
```
|
||||
|
||||
3. Start the [Open WebUI](https://github.com/open-webui/open-webui) Docker container (replace the vLLM server host and port):
|
||||
|
||||
```console
|
||||
docker run -d -p 3000:8080 \
|
||||
--name open-webui \
|
||||
-v open-webui:/app/backend/data \
|
||||
-e OPENAI_API_BASE_URL=http://<vllm serve host>:<vllm serve port>/v1 \
|
||||
--restart always \
|
||||
ghcr.io/open-webui/open-webui:main
|
||||
```
|
||||
|
||||
4. Open it in the browser: <http://open-webui-host:3000/>
|
||||
|
||||
At the top of the web page, you can see the model `qwen/Qwen1.5-0.5B-Chat`.
|
||||
|
||||
:::{image} /assets/deployment/open_webui.png
|
||||
:::
|
||||
@@ -1,84 +0,0 @@
|
||||
(deployment-retrieval-augmented-generation)=
|
||||
|
||||
# Retrieval-Augmented Generation
|
||||
|
||||
[Retrieval-augmented generation (RAG)](https://en.wikipedia.org/wiki/Retrieval-augmented_generation) is a technique that enables generative artificial intelligence (Gen AI) models to retrieve and incorporate new information. It modifies interactions with a large language model (LLM) so that the model responds to user queries with reference to a specified set of documents, using this information to supplement information from its pre-existing training data. This allows LLMs to use domain-specific and/or updated information. Use cases include providing chatbot access to internal company data or generating responses based on authoritative sources.
|
||||
|
||||
Here are the integrations:
|
||||
- vLLM + [langchain](https://github.com/langchain-ai/langchain) + [milvus](https://github.com/milvus-io/milvus)
|
||||
- vLLM + [llamaindex](https://github.com/run-llama/llama_index) + [milvus](https://github.com/milvus-io/milvus)
|
||||
|
||||
## vLLM + langchain
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- Set up the vLLM and langchain environment
|
||||
|
||||
```console
|
||||
pip install -U vllm \
|
||||
langchain_milvus langchain_openai \
|
||||
langchain_community beautifulsoup4 \
|
||||
langchain-text-splitters
|
||||
```
|
||||
|
||||
### Deploy
|
||||
|
||||
- Start the vLLM server with a supported embedding model, e.g.
|
||||
|
||||
```console
|
||||
# Start embedding service (port 8000)
|
||||
vllm serve ssmits/Qwen2-7B-Instruct-embed-base
|
||||
```
|
||||
|
||||
- Start the vLLM server with a supported chat completion model, e.g.
|
||||
|
||||
```console
|
||||
# Start chat service (port 8001)
|
||||
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
|
||||
```
|
||||
|
||||
- Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_langchain.py>
|
||||
|
||||
- Run the script
|
||||
|
||||
```console
|
||||
python retrieval_augmented_generation_with_langchain.py
|
||||
```
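To see the general idea of how such a pipeline connects to the two servers, here is a minimal sketch that points langchain's OpenAI-compatible clients at the embedding service on port 8000 and the chat service on port 8001. The class arguments are assumptions about the `langchain_openai` package and may not match the referenced example script exactly; treat it as an illustration only.

```python
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Chat model served by vLLM on port 8001 (OpenAI-compatible endpoint).
llm = ChatOpenAI(
    model="qwen/Qwen1.5-0.5B-Chat",
    base_url="http://localhost:8001/v1",
    api_key="EMPTY",  # vLLM does not require a real key by default.
)

# Embedding model served by vLLM on port 8000.
embeddings = OpenAIEmbeddings(
    model="ssmits/Qwen2-7B-Instruct-embed-base",
    base_url="http://localhost:8000/v1",
    api_key="EMPTY",
    check_embedding_ctx_length=False,  # Skip OpenAI-specific token-length checks.
)

print(llm.invoke("What is retrieval-augmented generation?").content)
print(len(embeddings.embed_query("Hello world")))
```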
|
||||
|
||||
## vLLM + llamaindex
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- Set up the vLLM and llamaindex environment
|
||||
|
||||
```console
|
||||
pip install vllm \
|
||||
llama-index llama-index-readers-web \
|
||||
llama-index-llms-openai-like \
|
||||
llama-index-embeddings-openai-like \
|
||||
    llama-index-vector-stores-milvus
|
||||
```
|
||||
|
||||
### Deploy
|
||||
|
||||
- Start the vLLM server with a supported embedding model, e.g.
|
||||
|
||||
```console
|
||||
# Start embedding service (port 8000)
|
||||
vllm serve ssmits/Qwen2-7B-Instruct-embed-base
|
||||
```
|
||||
|
||||
- Start the vLLM server with a supported chat completion model, e.g.
|
||||
|
||||
```console
|
||||
# Start chat service (port 8001)
|
||||
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
|
||||
```
|
||||
|
||||
- Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_llamaindex.py>
|
||||
|
||||
- Run the script
|
||||
|
||||
```console
|
||||
python retrieval_augmented_generation_with_llamaindex.py
|
||||
```
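Analogously to the langchain pipeline, the sketch below shows the rough idea of pointing LlamaIndex's OpenAI-like wrappers at the two vLLM servers. The class names and arguments are assumptions about the `llama-index-llms-openai-like` and `llama-index-embeddings-openai-like` packages installed above; refer to the example script for the full pipeline.

```python
from llama_index.llms.openai_like import OpenAILike
from llama_index.embeddings.openai_like import OpenAILikeEmbedding

# Chat model served by vLLM on port 8001.
llm = OpenAILike(
    model="qwen/Qwen1.5-0.5B-Chat",
    api_base="http://localhost:8001/v1",
    api_key="EMPTY",
    is_chat_model=True,
)

# Embedding model served by vLLM on port 8000.
embed_model = OpenAILikeEmbedding(
    model_name="ssmits/Qwen2-7B-Instruct-embed-base",
    api_base="http://localhost:8000/v1",
    api_key="EMPTY",
)

print(llm.complete("What is retrieval-augmented generation?"))
print(len(embed_model.get_text_embedding("Hello world")))
```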
|
||||
@@ -1,345 +0,0 @@
|
||||
(deployment-skypilot)=
|
||||
|
||||
# SkyPilot
|
||||
|
||||
:::{raw} html
|
||||
<p align="center">
|
||||
<img src="https://imgur.com/yxtzPEu.png" alt="vLLM"/>
|
||||
</p>
|
||||
:::
|
||||
|
||||
vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html).
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Go to the [HuggingFace model page](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and request access to the model `meta-llama/Meta-Llama-3-8B-Instruct`.
|
||||
- Check that you have installed SkyPilot ([docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)).
|
||||
- Check that `sky check` shows clouds or Kubernetes are enabled.
|
||||
|
||||
```console
|
||||
pip install skypilot-nightly
|
||||
sky check
|
||||
```
|
||||
|
||||
## Run on a single instance
|
||||
|
||||
See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm/serve.yaml).
|
||||
|
||||
```yaml
|
||||
resources:
|
||||
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
|
||||
use_spot: True
|
||||
disk_size: 512 # Ensure model checkpoints can fit.
|
||||
disk_tier: best
|
||||
ports: 8081 # Expose to internet traffic.
|
||||
|
||||
envs:
|
||||
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
|
||||
|
||||
setup: |
|
||||
conda create -n vllm python=3.10 -y
|
||||
conda activate vllm
|
||||
|
||||
pip install vllm==0.4.0.post1
|
||||
# Install Gradio for web UI.
|
||||
pip install gradio openai
|
||||
pip install flash-attn==2.5.7
|
||||
|
||||
run: |
|
||||
conda activate vllm
|
||||
echo 'Starting vllm api server...'
|
||||
python -u -m vllm.entrypoints.openai.api_server \
|
||||
--port 8081 \
|
||||
--model $MODEL_NAME \
|
||||
--trust-remote-code \
|
||||
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
|
||||
2>&1 | tee api_server.log &
|
||||
|
||||
echo 'Waiting for vllm api server to start...'
|
||||
while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
|
||||
|
||||
echo 'Starting gradio server...'
|
||||
git clone https://github.com/vllm-project/vllm.git || true
|
||||
python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
|
||||
-m $MODEL_NAME \
|
||||
--port 8811 \
|
||||
--model-url http://localhost:8081/v1 \
|
||||
--stop-token-ids 128009,128001
|
||||
```
|
||||
|
||||
Start serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...):
|
||||
|
||||
```console
|
||||
HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN
|
||||
```
|
||||
|
||||
Check the output of the command. There will be a shareable Gradio link (like the last line of the following). Open it in your browser to use the Llama model for text completion.
|
||||
|
||||
```console
|
||||
(task, pid=7431) Running on public URL: https://<gradio-hash>.gradio.live
|
||||
```
|
||||
|
||||
**Optional**: Serve the 70B model instead of the default 8B and use more GPUs:
|
||||
|
||||
```console
|
||||
HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct
|
||||
```
|
||||
|
||||
## Scale up to multiple replicas
|
||||
|
||||
SkyPilot can scale the service up to multiple replicas with built-in autoscaling, load balancing, and fault tolerance. You can do this by adding a `service` section to the YAML file.
|
||||
|
||||
```yaml
|
||||
service:
|
||||
replicas: 2
|
||||
# An actual request for readiness probe.
|
||||
readiness_probe:
|
||||
path: /v1/chat/completions
|
||||
post_data:
|
||||
model: $MODEL_NAME
|
||||
messages:
|
||||
- role: user
|
||||
content: Hello! What is your name?
|
||||
max_completion_tokens: 1
|
||||
```
|
||||
|
||||
:::{raw} html
|
||||
<details>
|
||||
<summary>Click to see the full recipe YAML</summary>
|
||||
:::
|
||||
|
||||
```yaml
|
||||
service:
|
||||
replicas: 2
|
||||
# An actual request for readiness probe.
|
||||
readiness_probe:
|
||||
path: /v1/chat/completions
|
||||
post_data:
|
||||
model: $MODEL_NAME
|
||||
messages:
|
||||
- role: user
|
||||
content: Hello! What is your name?
|
||||
max_completion_tokens: 1
|
||||
|
||||
resources:
|
||||
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
|
||||
use_spot: True
|
||||
disk_size: 512 # Ensure model checkpoints can fit.
|
||||
disk_tier: best
|
||||
ports: 8081 # Expose to internet traffic.
|
||||
|
||||
envs:
|
||||
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
|
||||
|
||||
setup: |
|
||||
conda create -n vllm python=3.10 -y
|
||||
conda activate vllm
|
||||
|
||||
pip install vllm==0.4.0.post1
|
||||
# Install Gradio for web UI.
|
||||
pip install gradio openai
|
||||
pip install flash-attn==2.5.7
|
||||
|
||||
run: |
|
||||
conda activate vllm
|
||||
echo 'Starting vllm api server...'
|
||||
python -u -m vllm.entrypoints.openai.api_server \
|
||||
--port 8081 \
|
||||
--model $MODEL_NAME \
|
||||
--trust-remote-code \
|
||||
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
|
||||
2>&1 | tee api_server.log
|
||||
```
|
||||
|
||||
:::{raw} html
|
||||
</details>
|
||||
:::
|
||||
|
||||
Start serving the Llama-3 8B model on multiple replicas:
|
||||
|
||||
```console
|
||||
HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN
|
||||
```
|
||||
|
||||
Wait until the service is ready:
|
||||
|
||||
```console
|
||||
watch -n10 sky serve status vllm
|
||||
```
|
||||
|
||||
:::{raw} html
|
||||
<details>
|
||||
<summary>Example outputs:</summary>
|
||||
:::
|
||||
|
||||
```console
|
||||
Services
|
||||
NAME VERSION UPTIME STATUS REPLICAS ENDPOINT
|
||||
vllm 1 35s READY 2/2 xx.yy.zz.100:30001
|
||||
|
||||
Service Replicas
|
||||
SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION
|
||||
vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4
|
||||
vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4
|
||||
```
|
||||
|
||||
:::{raw} html
|
||||
</details>
|
||||
:::
|
||||
|
||||
After the service is READY, you can find a single endpoint for the service and access it with that endpoint:
|
||||
|
||||
```console
|
||||
ENDPOINT=$(sky serve status --endpoint 8081 vllm)
|
||||
curl -L http://$ENDPOINT/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
||||
"messages": [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant."
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Who are you?"
|
||||
}
|
||||
],
|
||||
"stop_token_ids": [128009, 128001]
|
||||
}'
|
||||
```
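The same endpoint can also be queried with the OpenAI Python SDK instead of curl. The sketch below assumes `ENDPOINT` was exported from `sky serve status --endpoint 8081 vllm` as shown above; it is an illustration, not part of the SkyPilot recipe.

```python
import os
from openai import OpenAI

# ENDPOINT is "host:port", as returned by `sky serve status --endpoint 8081 vllm`.
client = OpenAI(base_url=f"http://{os.environ['ENDPOINT']}/v1", api_key="EMPTY")

completion = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Who are you?"},
    ],
    extra_body={"stop_token_ids": [128009, 128001]},
)
print(completion.choices[0].message.content)
```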
|
||||
|
||||
To enable autoscaling, you can replace `replicas` with the following configuration in the `service` section:
|
||||
|
||||
```yaml
|
||||
service:
|
||||
replica_policy:
|
||||
min_replicas: 2
|
||||
max_replicas: 4
|
||||
target_qps_per_replica: 2
|
||||
```
|
||||
|
||||
This will scale the service between 2 and 4 replicas, scaling up when the QPS exceeds 2 per replica.
|
||||
|
||||
:::{raw} html
|
||||
<details>
|
||||
<summary>Click to see the full recipe YAML</summary>
|
||||
:::
|
||||
|
||||
```yaml
|
||||
service:
|
||||
replica_policy:
|
||||
min_replicas: 2
|
||||
max_replicas: 4
|
||||
target_qps_per_replica: 2
|
||||
# An actual request for readiness probe.
|
||||
readiness_probe:
|
||||
path: /v1/chat/completions
|
||||
post_data:
|
||||
model: $MODEL_NAME
|
||||
messages:
|
||||
- role: user
|
||||
content: Hello! What is your name?
|
||||
max_completion_tokens: 1
|
||||
|
||||
resources:
|
||||
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
|
||||
use_spot: True
|
||||
disk_size: 512 # Ensure model checkpoints can fit.
|
||||
disk_tier: best
|
||||
ports: 8081 # Expose to internet traffic.
|
||||
|
||||
envs:
|
||||
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
|
||||
|
||||
setup: |
|
||||
conda create -n vllm python=3.10 -y
|
||||
conda activate vllm
|
||||
|
||||
pip install vllm==0.4.0.post1
|
||||
# Install Gradio for web UI.
|
||||
pip install gradio openai
|
||||
pip install flash-attn==2.5.7
|
||||
|
||||
run: |
|
||||
conda activate vllm
|
||||
echo 'Starting vllm api server...'
|
||||
python -u -m vllm.entrypoints.openai.api_server \
|
||||
--port 8081 \
|
||||
--model $MODEL_NAME \
|
||||
--trust-remote-code \
|
||||
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
|
||||
2>&1 | tee api_server.log
|
||||
```
|
||||
|
||||
:::{raw} html
|
||||
</details>
|
||||
:::
|
||||
|
||||
To update the service with the new config:
|
||||
|
||||
```console
|
||||
HF_TOKEN="your-huggingface-token" sky serve update vllm serving.yaml --env HF_TOKEN
|
||||
```
|
||||
|
||||
To stop the service:
|
||||
|
||||
```console
|
||||
sky serve down vllm
|
||||
```
|
||||
|
||||
### **Optional**: Connect a GUI to the endpoint
|
||||
|
||||
It is also possible to access the Llama-3 service with a separate GUI frontend, so that user requests sent to the GUI are load-balanced across replicas.
|
||||
|
||||
:::{raw} html
|
||||
<details>
|
||||
<summary>Click to see the full GUI YAML</summary>
|
||||
:::
|
||||
|
||||
```yaml
|
||||
envs:
|
||||
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm.
|
||||
|
||||
resources:
|
||||
cpus: 2
|
||||
|
||||
setup: |
|
||||
conda create -n vllm python=3.10 -y
|
||||
conda activate vllm
|
||||
|
||||
# Install Gradio for web UI.
|
||||
pip install gradio openai
|
||||
|
||||
run: |
|
||||
conda activate vllm
|
||||
export PATH=$PATH:/sbin
|
||||
|
||||
echo 'Starting gradio server...'
|
||||
git clone https://github.com/vllm-project/vllm.git || true
|
||||
python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
|
||||
-m $MODEL_NAME \
|
||||
--port 8811 \
|
||||
--model-url http://$ENDPOINT/v1 \
|
||||
--stop-token-ids 128009,128001 | tee ~/gradio.log
|
||||
```
|
||||
|
||||
:::{raw} html
|
||||
</details>
|
||||
:::
|
||||
|
||||
1. Start the chat web UI:
|
||||
|
||||
```console
|
||||
sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm)
|
||||
```
|
||||
|
||||
2. Then, we can access the GUI at the returned gradio link:
|
||||
|
||||
```console
|
||||
| INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live
|
||||
```
|
||||
@@ -1,42 +0,0 @@
|
||||
(deployment-streamlit)=
|
||||
|
||||
# Streamlit
|
||||
|
||||
[Streamlit](https://github.com/streamlit/streamlit) lets you transform Python scripts into interactive web apps in minutes, instead of weeks. Build dashboards, generate reports, or create chat apps.
|
||||
|
||||
It can be quickly integrated with vLLM as a backend API server, enabling powerful LLM inference via API calls.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Set up the vLLM environment
|
||||
|
||||
## Deploy
|
||||
|
||||
- Start the vLLM server with a supported chat completion model, e.g.
|
||||
|
||||
```console
|
||||
vllm serve qwen/Qwen1.5-0.5B-Chat
|
||||
```
|
||||
|
||||
- Install streamlit and openai:
|
||||
|
||||
```console
|
||||
pip install streamlit openai
|
||||
```
|
||||
|
||||
- Use the script: <gh-file:examples/online_serving/streamlit_openai_chatbot_webserver.py>
|
||||
|
||||
- Start the Streamlit web UI and start chatting:
|
||||
|
||||
```console
|
||||
streamlit run streamlit_openai_chatbot_webserver.py
|
||||
|
||||
# or specify the VLLM_API_BASE or VLLM_API_KEY
|
||||
VLLM_API_BASE="http://vllm-server-host:vllm-server-port/v1" streamlit run streamlit_openai_chatbot_webserver.py
|
||||
|
||||
# start with debug mode to view more details
|
||||
streamlit run streamlit_openai_chatbot_webserver.py --logger.level=debug
|
||||
```
|
||||
|
||||
:::{image} /assets/deployment/streamlit-chat.png
|
||||
:::
|
||||
@@ -1,5 +0,0 @@
|
||||
(deployment-triton)=
|
||||
|
||||
# NVIDIA Triton
|
||||
|
||||
The [Triton Inference Server](https://github.com/triton-inference-server) hosts a tutorial demonstrating how to quickly deploy a simple [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) model using vLLM. Please see [Deploying a vLLM model in Triton](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton) for more details.
|
||||
@@ -1,11 +0,0 @@
|
||||
# External Integrations
|
||||
|
||||
:::{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
kserve
|
||||
kubeai
|
||||
llamastack
|
||||
llmaz
|
||||
production-stack
|
||||
:::
|
||||
@@ -1,7 +0,0 @@
|
||||
(deployment-kserve)=
|
||||
|
||||
# KServe
|
||||
|
||||
vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.
|
||||
|
||||
Please see [this guide](https://kserve.github.io/website/latest/modelserving/v1beta1/llm/huggingface/) for more details on using vLLM with KServe.
|
||||
@@ -1,15 +0,0 @@
|
||||
(deployment-kubeai)=
|
||||
|
||||
# KubeAI
|
||||
|
||||
[KubeAI](https://github.com/substratusai/kubeai) is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies.
|
||||
|
||||
Please see the Installation Guides for environment specific instructions:
|
||||
|
||||
- [Any Kubernetes Cluster](https://www.kubeai.org/installation/any/)
|
||||
- [EKS](https://www.kubeai.org/installation/eks/)
|
||||
- [GKE](https://www.kubeai.org/installation/gke/)
|
||||
|
||||
Once you have KubeAI installed, you can
|
||||
[configure text generation models](https://www.kubeai.org/how-to/configure-text-generation-models/)
|
||||
using vLLM.
|
||||
@@ -1,38 +0,0 @@
|
||||
(deployment-llamastack)=
|
||||
|
||||
# Llama Stack
|
||||
|
||||
vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-stack).
|
||||
|
||||
To install Llama Stack, run
|
||||
|
||||
```console
|
||||
pip install llama-stack -q
|
||||
```
|
||||
|
||||
## Inference using OpenAI Compatible API
|
||||
|
||||
Then start the Llama Stack server, pointing it to your vLLM server with the following configuration:
|
||||
|
||||
```yaml
|
||||
inference:
|
||||
- provider_id: vllm0
|
||||
provider_type: remote::vllm
|
||||
config:
|
||||
url: http://127.0.0.1:8000
|
||||
```
|
||||
|
||||
Please refer to [this guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/remote-vllm.html) for more details on this remote vLLM provider.
|
||||
|
||||
## Inference via Embedded vLLM
|
||||
|
||||
An [inline vLLM provider](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/inline/inference/vllm)
|
||||
is also available. Here is a sample configuration using that method:
|
||||
|
||||
```yaml
|
||||
inference:
|
||||
- provider_type: vllm
|
||||
config:
|
||||
model: Llama3.1-8B-Instruct
|
||||
tensor_parallel_size: 4
|
||||
```
|
||||
@@ -1,7 +0,0 @@
|
||||
(deployment-llmaz)=
|
||||
|
||||
# llmaz
|
||||
|
||||
[llmaz](https://github.com/InftyAI/llmaz) is an easy-to-use and advanced inference platform for large language models on Kubernetes, aimed at production use. It uses vLLM as the default model serving backend.
|
||||
|
||||
Please refer to the [Quick Start](https://github.com/InftyAI/llmaz?tab=readme-ov-file#quick-start) for more details.
|
||||
@@ -1,154 +0,0 @@
|
||||
(deployment-production-stack)=
|
||||
|
||||
# Production stack
|
||||
|
||||
Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using the [vLLM production stack](https://github.com/vllm-project/production-stack). Born out of a Berkeley-UChicago collaboration, [vLLM production stack](https://github.com/vllm-project/production-stack) is an officially released, production-optimized codebase under the [vLLM project](https://github.com/vllm-project), designed for LLM deployment with:
|
||||
|
||||
* **Upstream vLLM compatibility** – It wraps around upstream vLLM without modifying its code.
|
||||
* **Ease of use** – Simplified deployment via Helm charts and observability through Grafana dashboards.
|
||||
* **High performance** – Optimized for LLM workloads with features like multi-model support, model-aware and prefix-aware routing, fast vLLM bootstrapping, and KV cache offloading with [LMCache](https://github.com/LMCache/LMCache), among others.
|
||||
|
||||
If you are new to Kubernetes, don't worry: in the vLLM production stack [repo](https://github.com/vllm-project/production-stack), we provide a step-by-step [guide](https://github.com/vllm-project/production-stack/blob/main/tutorials/00-install-kubernetes-env.md) and a [short video](https://www.youtube.com/watch?v=EsTJbQtzj0g) to set up everything and get started in **4 minutes**!
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Ensure that you have a running Kubernetes environment with GPUs (you can follow [this tutorial](https://github.com/vllm-project/production-stack/blob/main/tutorials/00-install-kubernetes-env.md) to install a Kubernetes environment on a bare-metal GPU machine).
|
||||
|
||||
## Deployment using vLLM production stack
|
||||
|
||||
The standard vLLM production stack is installed using a Helm chart. You can run this [bash script](https://github.com/vllm-project/production-stack/blob/main/utils/install-helm.sh) to install Helm on your GPU server.
|
||||
|
||||
To install the vLLM production stack, run the following commands on your desktop:
|
||||
|
||||
```bash
|
||||
sudo helm repo add vllm https://vllm-project.github.io/production-stack
|
||||
sudo helm install vllm vllm/vllm-stack -f tutorials/assets/values-01-minimal-example.yaml
|
||||
```
|
||||
|
||||
This will instantiate a vLLM-production-stack-based deployment named `vllm` that runs a small LLM (Facebook opt-125M model).
|
||||
|
||||
### Validate Installation
|
||||
|
||||
Monitor the deployment status using:
|
||||
|
||||
```bash
|
||||
sudo kubectl get pods
|
||||
```
|
||||
|
||||
You will see the pods for the `vllm` deployment transition to the `Running` state.
|
||||
|
||||
```text
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
vllm-deployment-router-859d8fb668-2x2b7 1/1 Running 0 2m38s
|
||||
vllm-opt125m-deployment-vllm-84dfc9bd7-vb9bs 1/1 Running 0 2m38s
|
||||
```
|
||||
|
||||
**NOTE**: It may take some time for the containers to download the Docker images and LLM weights.
|
||||
|
||||
### Send a Query to the Stack
|
||||
|
||||
Forward the `vllm-router-service` port to the host machine:
|
||||
|
||||
```bash
|
||||
sudo kubectl port-forward svc/vllm-router-service 30080:80
|
||||
```
|
||||
|
||||
Then you can send a query to the OpenAI-compatible API to check the available models:
|
||||
|
||||
```bash
|
||||
curl -o- http://localhost:30080/models
|
||||
```
|
||||
|
||||
Expected output:
|
||||
|
||||
```json
|
||||
{
|
||||
"object": "list",
|
||||
"data": [
|
||||
{
|
||||
"id": "facebook/opt-125m",
|
||||
"object": "model",
|
||||
"created": 1737428424,
|
||||
"owned_by": "vllm",
|
||||
"root": null
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
To send an actual chat request, you can issue a curl request to the OpenAI-compatible `/completions` endpoint:
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:30080/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "facebook/opt-125m",
|
||||
"prompt": "Once upon a time,",
|
||||
"max_tokens": 10
|
||||
}'
|
||||
```
|
||||
|
||||
Expected output:
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "completion-id",
|
||||
"object": "text_completion",
|
||||
"created": 1737428424,
|
||||
"model": "facebook/opt-125m",
|
||||
"choices": [
|
||||
{
|
||||
"text": " there was a brave knight who...",
|
||||
"index": 0,
|
||||
"finish_reason": "length"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
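The same request can be issued from Python with the OpenAI SDK. The sketch below assumes the port-forward from the previous step is still running and reuses the `/models` and `/completions` routes shown above; it is an illustration rather than part of the production stack itself.

```python
from openai import OpenAI

# The router was port-forwarded to localhost:30080 above; it exposes
# /models and /completions directly, so use that address as the base URL.
client = OpenAI(base_url="http://localhost:30080", api_key="EMPTY")

print([m.id for m in client.models.list()])  # Should include "facebook/opt-125m".

completion = client.completions.create(
    model="facebook/opt-125m",
    prompt="Once upon a time,",
    max_tokens=10,
)
print(completion.choices[0].text)
```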
|
||||
|
||||
### Uninstall
|
||||
|
||||
To remove the deployment, run:
|
||||
|
||||
```bash
|
||||
sudo helm uninstall vllm
|
||||
```
|
||||
|
||||
------
|
||||
|
||||
### (Advanced) Configuring vLLM production stack
|
||||
|
||||
The core vLLM production stack configuration is managed with YAML. Here is the example configuration used in the installation above:
|
||||
|
||||
```yaml
|
||||
servingEngineSpec:
|
||||
runtimeClassName: ""
|
||||
modelSpec:
|
||||
- name: "opt125m"
|
||||
repository: "vllm/vllm-openai"
|
||||
tag: "latest"
|
||||
modelURL: "facebook/opt-125m"
|
||||
|
||||
replicaCount: 1
|
||||
|
||||
requestCPU: 6
|
||||
requestMemory: "16Gi"
|
||||
requestGPU: 1
|
||||
|
||||
pvcStorage: "10Gi"
|
||||
```
|
||||
|
||||
In this YAML configuration:
|
||||
* **`modelSpec`** includes:
|
||||
* `name`: A nickname that you prefer to call the model.
|
||||
* `repository`: Docker repository of vLLM.
|
||||
* `tag`: Docker image tag.
|
||||
* `modelURL`: The LLM model that you want to use.
|
||||
* **`replicaCount`**: Number of replicas.
|
||||
* **`requestCPU` and `requestMemory`**: Specifies the CPU and memory resource requests for the pod.
|
||||
* **`requestGPU`**: Specifies the number of GPUs required.
|
||||
* **`pvcStorage`**: Allocates persistent storage for the model.
|
||||
|
||||
**NOTE:** If you intend to set up two pods, please refer to this [YAML file](https://github.com/vllm-project/production-stack/blob/main/tutorials/assets/values-01-2pods-minimal-example.yaml).
|
||||
|
||||
**NOTE:** vLLM production stack offers many more features (*e.g.* CPU offloading and a wide range of routing algorithms). Please check out these [examples and tutorials](https://github.com/vllm-project/production-stack/tree/main/tutorials) and our [repo](https://github.com/vllm-project/production-stack) for more details!
|
||||
@@ -1,355 +0,0 @@
|
||||
(deployment-k8s)=
|
||||
|
||||
# Using Kubernetes
|
||||
|
||||
Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using native Kubernetes.
|
||||
|
||||
* [Deployment with CPUs](#deployment-with-cpus)
|
||||
* [Deployment with GPUs](#deployment-with-gpus)
|
||||
|
||||
Alternatively, you can deploy vLLM to Kubernetes using any of the following:
|
||||
* [Helm](frameworks/helm.md)
|
||||
* [InftyAI/llmaz](integrations/llmaz.md)
|
||||
* [KServe](integrations/kserve.md)
|
||||
* [kubernetes-sigs/lws](frameworks/lws.md)
|
||||
* [meta-llama/llama-stack](integrations/llamastack.md)
|
||||
* [substratusai/kubeai](integrations/kubeai.md)
|
||||
* [vllm-project/aibrix](https://github.com/vllm-project/aibrix)
|
||||
* [vllm-project/production-stack](integrations/production-stack.md)
|
||||
|
||||
## Deployment with CPUs
|
||||
|
||||
:::{note}
|
||||
The use of CPUs here is for demonstration and testing purposes only and its performance will not be on par with GPUs.
|
||||
:::
|
||||
|
||||
First, create a Kubernetes PVC and Secret for downloading and storing the Hugging Face model:
|
||||
|
||||
```bash
|
||||
cat <<EOF |kubectl apply -f -
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: vllm-models
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
volumeMode: Filesystem
|
||||
resources:
|
||||
requests:
|
||||
storage: 50Gi
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: hf-token-secret
|
||||
type: Opaque
|
||||
stringData:
|
||||
  token: $HF_TOKEN
|
||||
EOF
|
||||
```
|
||||
|
||||
Next, start the vLLM server as a Kubernetes Deployment and Service:
|
||||
|
||||
```bash
|
||||
cat <<EOF |kubectl apply -f -
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: vllm-server
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: vllm
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: vllm
|
||||
spec:
|
||||
containers:
|
||||
- name: vllm
|
||||
image: vllm/vllm-openai:latest
|
||||
command: ["/bin/sh", "-c"]
|
||||
args: [
|
||||
"vllm serve meta-llama/Llama-3.2-1B-Instruct"
|
||||
]
|
||||
env:
|
||||
- name: HUGGING_FACE_HUB_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
volumeMounts:
|
||||
- name: llama-storage
|
||||
mountPath: /root/.cache/huggingface
|
||||
volumes:
|
||||
- name: llama-storage
|
||||
persistentVolumeClaim:
|
||||
claimName: vllm-models
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: vllm-server
|
||||
spec:
|
||||
selector:
|
||||
app.kubernetes.io/name: vllm
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 8000
|
||||
targetPort: 8000
|
||||
type: ClusterIP
|
||||
EOF
|
||||
```
|
||||
|
||||
We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model):
|
||||
|
||||
```console
|
||||
kubectl logs -l app.kubernetes.io/name=vllm
|
||||
...
|
||||
INFO: Started server process [1]
|
||||
INFO: Waiting for application startup.
|
||||
INFO: Application startup complete.
|
||||
INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
|
||||
```
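Once the server is up, a quick way to test it is to port-forward the Service created above and send an OpenAI-compatible completion request. This is a sketch that assumes the Service is named `vllm-server` and serves the `meta-llama/Llama-3.2-1B-Instruct` model as configured earlier:

```console
# In one terminal: forward the Service port to the local machine.
kubectl port-forward svc/vllm-server 8000:8000

# In another terminal: send a completion request to the OpenAI-compatible endpoint.
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "meta-llama/Llama-3.2-1B-Instruct",
        "prompt": "San Francisco is a",
        "max_tokens": 7,
        "temperature": 0
      }'
```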
|
||||
|
||||
## Deployment with GPUs
|
||||
|
||||
**Pre-requisite**: Ensure that you have a running [Kubernetes cluster with GPUs](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/).
|
||||
|
||||
1. Create a PVC, Secret and Deployment for vLLM
|
||||
|
||||
The PVC is used to store the model cache and is optional; you can use hostPath or other storage options instead.
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: mistral-7b
|
||||
namespace: default
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 50Gi
|
||||
storageClassName: default
|
||||
volumeMode: Filesystem
|
||||
```
|
||||
|
||||
The Secret is optional and only required for accessing gated models; you can skip this step if you are not using gated models.
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: hf-token-secret
|
||||
namespace: default
|
||||
type: Opaque
|
||||
stringData:
|
||||
token: "REPLACE_WITH_TOKEN"
|
||||
```
|
||||
|
||||
Next, create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model.
|
||||
|
||||
Here are two examples, one for NVIDIA GPUs and one for AMD GPUs.
|
||||
|
||||
NVIDIA GPU:
|
||||
|
||||
```yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: mistral-7b
|
||||
namespace: default
|
||||
labels:
|
||||
app: mistral-7b
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: mistral-7b
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: mistral-7b
|
||||
spec:
|
||||
volumes:
|
||||
- name: cache-volume
|
||||
persistentVolumeClaim:
|
||||
claimName: mistral-7b
|
||||
# vLLM needs to access the host's shared memory for tensor parallel inference.
|
||||
- name: shm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
sizeLimit: "2Gi"
|
||||
containers:
|
||||
- name: mistral-7b
|
||||
image: vllm/vllm-openai:latest
|
||||
command: ["/bin/sh", "-c"]
|
||||
args: [
|
||||
"vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
|
||||
]
|
||||
env:
|
||||
- name: HUGGING_FACE_HUB_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
resources:
|
||||
limits:
|
||||
cpu: "10"
|
||||
memory: 20G
|
||||
nvidia.com/gpu: "1"
|
||||
requests:
|
||||
cpu: "2"
|
||||
memory: 6G
|
||||
nvidia.com/gpu: "1"
|
||||
volumeMounts:
|
||||
- mountPath: /root/.cache/huggingface
|
||||
name: cache-volume
|
||||
- name: shm
|
||||
mountPath: /dev/shm
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 8000
|
||||
initialDelaySeconds: 60
|
||||
periodSeconds: 10
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 8000
|
||||
initialDelaySeconds: 60
|
||||
periodSeconds: 5
|
||||
```
|
||||
|
||||
AMD GPU:
|
||||
|
||||
You can refer to the `deployment.yaml` below if you are using an AMD ROCm GPU such as the MI300X.
|
||||
|
||||
```yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: mistral-7b
|
||||
namespace: default
|
||||
labels:
|
||||
app: mistral-7b
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: mistral-7b
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: mistral-7b
|
||||
spec:
|
||||
volumes:
|
||||
# PVC
|
||||
- name: cache-volume
|
||||
persistentVolumeClaim:
|
||||
claimName: mistral-7b
|
||||
# vLLM needs to access the host's shared memory for tensor parallel inference.
|
||||
- name: shm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
sizeLimit: "8Gi"
|
||||
hostNetwork: true
|
||||
hostIPC: true
|
||||
containers:
|
||||
- name: mistral-7b
|
||||
image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
|
||||
securityContext:
|
||||
seccompProfile:
|
||||
type: Unconfined
|
||||
runAsGroup: 44
|
||||
capabilities:
|
||||
add:
|
||||
- SYS_PTRACE
|
||||
command: ["/bin/sh", "-c"]
|
||||
args: [
|
||||
"vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
|
||||
]
|
||||
env:
|
||||
- name: HUGGING_FACE_HUB_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
resources:
|
||||
limits:
|
||||
cpu: "10"
|
||||
memory: 20G
|
||||
amd.com/gpu: "1"
|
||||
requests:
|
||||
cpu: "6"
|
||||
memory: 6G
|
||||
amd.com/gpu: "1"
|
||||
volumeMounts:
|
||||
- name: cache-volume
|
||||
mountPath: /root/.cache/huggingface
|
||||
- name: shm
|
||||
mountPath: /dev/shm
|
||||
```
|
||||
|
||||
You can get the full example with steps and sample yaml files from <https://github.com/ROCm/k8s-device-plugin/tree/master/example/vllm-serve>.
|
||||
|
||||
2. Create a Kubernetes Service for vLLM
|
||||
|
||||
Next, create a Kubernetes Service file to expose the `mistral-7b` deployment:
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: mistral-7b
|
||||
namespace: default
|
||||
spec:
|
||||
ports:
|
||||
- name: http-mistral-7b
|
||||
port: 80
|
||||
protocol: TCP
|
||||
targetPort: 8000
|
||||
# The label selector should match the deployment labels & it is useful for prefix caching feature
|
||||
selector:
|
||||
app: mistral-7b
|
||||
sessionAffinity: None
|
||||
type: ClusterIP
|
||||
```
|
||||
|
||||
3. Deploy and Test
|
||||
|
||||
Apply the deployment and service configurations using `kubectl apply -f <filename>`:
|
||||
|
||||
```console
|
||||
kubectl apply -f deployment.yaml
|
||||
kubectl apply -f service.yaml
|
||||
```
|
||||
|
||||
To test the deployment, run the following `curl` command:
|
||||
|
||||
```console
|
||||
curl http://mistral-7b.default.svc.cluster.local/v1/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "mistralai/Mistral-7B-Instruct-v0.3",
|
||||
"prompt": "San Francisco is a",
|
||||
"max_tokens": 7,
|
||||
"temperature": 0
|
||||
}'
|
||||
```
|
||||
|
||||
If the service is correctly deployed, you should receive a response from the vLLM model.
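
If you prefer to test from outside the cluster, the following is a minimal sketch. It assumes you are running `kubectl port-forward service/mistral-7b 8080:80` in another terminal; the local port and the model name are just the example values used in this guide.

```python
# Minimal sketch: query the port-forwarded vLLM Service from outside the cluster.
# Assumes `kubectl port-forward service/mistral-7b 8080:80` is running locally.
import json
import urllib.request

payload = {
    "model": "mistralai/Mistral-7B-Instruct-v0.3",
    "prompt": "San Francisco is a",
    "max_tokens": 7,
    "temperature": 0,
}
request = urllib.request.Request(
    "http://localhost:8080/v1/completions",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(request) as response:
    print(json.load(response)["choices"][0]["text"])
```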
|
||||
|
||||
## Conclusion
|
||||
|
||||
Deploying vLLM with Kubernetes allows for efficient scaling and management of ML models leveraging GPU resources. By following the steps outlined above, you should be able to set up and test a vLLM deployment within your Kubernetes cluster. If you encounter any issues or have suggestions, please feel free to contribute to the documentation.
|
||||
@@ -1,133 +0,0 @@
|
||||
(nginxloadbalancer)=
|
||||
|
||||
# Using Nginx
|
||||
|
||||
This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers.
|
||||
|
||||
Table of contents:
|
||||
|
||||
1. [Build Nginx Container](#nginxloadbalancer-nginx-build)
|
||||
2. [Create Simple Nginx Config file](#nginxloadbalancer-nginx-conf)
|
||||
3. [Build vLLM Container](#nginxloadbalancer-nginx-vllm-container)
|
||||
4. [Create Docker Network](#nginxloadbalancer-nginx-docker-network)
|
||||
5. [Launch vLLM Containers](#nginxloadbalancer-nginx-launch-container)
|
||||
6. [Launch Nginx](#nginxloadbalancer-nginx-launch-nginx)
|
||||
7. [Verify That vLLM Servers Are Ready](#nginxloadbalancer-nginx-verify-nginx)
|
||||
|
||||
(nginxloadbalancer-nginx-build)=
|
||||
|
||||
## Build Nginx Container
|
||||
|
||||
This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory.
|
||||
|
||||
```console
|
||||
export vllm_root=`pwd`
|
||||
```
|
||||
|
||||
Create a file named `Dockerfile.nginx`:
|
||||
|
||||
```console
|
||||
FROM nginx:latest
|
||||
RUN rm /etc/nginx/conf.d/default.conf
|
||||
EXPOSE 80
|
||||
CMD ["nginx", "-g", "daemon off;"]
|
||||
```
|
||||
|
||||
Build the container:
|
||||
|
||||
```console
|
||||
docker build . -f Dockerfile.nginx --tag nginx-lb
|
||||
```
|
||||
|
||||
(nginxloadbalancer-nginx-conf)=
|
||||
|
||||
## Create Simple Nginx Config file
|
||||
|
||||
Create a file named `nginx_conf/nginx.conf`. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another `server vllmN:8000 max_fails=3 fail_timeout=10000s;` entry to `upstream backend`.
|
||||
|
||||
```console
|
||||
upstream backend {
|
||||
least_conn;
|
||||
server vllm0:8000 max_fails=3 fail_timeout=10000s;
|
||||
server vllm1:8000 max_fails=3 fail_timeout=10000s;
|
||||
}
|
||||
server {
|
||||
listen 80;
|
||||
location / {
|
||||
proxy_pass http://backend;
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
(nginxloadbalancer-nginx-vllm-container)=
|
||||
|
||||
## Build vLLM Container
|
||||
|
||||
```console
|
||||
cd $vllm_root
|
||||
docker build -f docker/Dockerfile . --tag vllm
|
||||
```
|
||||
|
||||
If you are behind a proxy, you can pass the proxy settings to the `docker build` command as shown below:
|
||||
|
||||
```console
|
||||
cd $vllm_root
|
||||
docker build -f docker/Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy
|
||||
```
|
||||
|
||||
(nginxloadbalancer-nginx-docker-network)=
|
||||
|
||||
## Create Docker Network
|
||||
|
||||
```console
|
||||
docker network create vllm_nginx
|
||||
```
|
||||
|
||||
(nginxloadbalancer-nginx-launch-container)=
|
||||
|
||||
## Launch vLLM Containers
|
||||
|
||||
Notes:
|
||||
|
||||
- If you have your HuggingFace models cached somewhere else, update `hf_cache_dir` below.
|
||||
- If you don't have an existing HuggingFace cache, start `vllm0` first and wait for the model to finish downloading and the server to become ready. This ensures that `vllm1` can reuse the model you just downloaded instead of downloading it again.
- The example below assumes a GPU backend. If you are using the CPU backend, remove `--gpus device=ID` and add the `VLLM_CPU_KVCACHE_SPACE` and `VLLM_CPU_OMP_THREADS_BIND` environment variables to the `docker run` command.
- Adjust the model name in the commands below if you don't want to use `Llama-2-7b-chat-hf`.
|
||||
|
||||
```console
|
||||
mkdir -p ~/.cache/huggingface/hub/
|
||||
hf_cache_dir=~/.cache/huggingface/
|
||||
docker run -itd --ipc host --network vllm_nginx --gpus device=0 --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8081:8000 --name vllm0 vllm --model meta-llama/Llama-2-7b-chat-hf
|
||||
docker run -itd --ipc host --network vllm_nginx --gpus device=1 --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf
|
||||
```
|
||||
|
||||
:::{note}
|
||||
If you are behind a proxy, you can pass the proxy settings to the `docker run` command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`.
|
||||
:::
|
||||
|
||||
(nginxloadbalancer-nginx-launch-nginx)=
|
||||
|
||||
## Launch Nginx
|
||||
|
||||
```console
|
||||
docker run -itd -p 8000:80 --network vllm_nginx -v ./nginx_conf/:/etc/nginx/conf.d/ --name nginx-lb nginx-lb:latest
|
||||
```
|
||||
|
||||
(nginxloadbalancer-nginx-verify-nginx)=
|
||||
|
||||
## Verify That vLLM Servers Are Ready
|
||||
|
||||
```console
|
||||
docker logs vllm0 | grep Uvicorn
|
||||
docker logs vllm1 | grep Uvicorn
|
||||
```
|
||||
|
||||
Both outputs should look like this:
|
||||
|
||||
```console
|
||||
INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
|
||||
```
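
Once both servers report ready, you can send a request through the load balancer, which in this setup listens on port 8000 of the host. The sketch below only assumes the setup from this guide; it lists the served models via the OpenAI-compatible `/v1/models` endpoint, with each call routed to one of the backends by Nginx.

```python
# Minimal sketch: check that the API is reachable through the Nginx load balancer.
# Assumes the load balancer from this guide is listening on localhost:8000.
import json
import urllib.request

with urllib.request.urlopen("http://localhost:8000/v1/models") as response:
    models = json.load(response)

# Print the model IDs served by the vLLM backends behind the load balancer.
for model in models["data"]:
    print(model["id"])
```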
|
||||
@@ -1,97 +0,0 @@
|
||||
# Security Guide
|
||||
|
||||
## Inter-Node Communication
|
||||
|
||||
All communications between nodes in a multi-node vLLM deployment are **insecure by default** and must be protected by placing the nodes on an isolated network. This includes:
|
||||
|
||||
1. PyTorch Distributed communications
|
||||
2. KV cache transfer communications
|
||||
3. Tensor, Pipeline, and Data parallel communications
|
||||
|
||||
### Configuration Options for Inter-Node Communications
|
||||
|
||||
The following options control inter-node communications in vLLM:
|
||||
|
||||
1. **Environment Variables:**
|
||||
- `VLLM_HOST_IP`: Sets the IP address for vLLM processes to communicate on
|
||||
|
||||
2. **KV Cache Transfer Configuration:**
|
||||
- `--kv-ip`: The IP address for KV cache transfer communications (default: 127.0.0.1)
|
||||
- `--kv-port`: The port for KV cache transfer communications (default: 14579)
|
||||
|
||||
3. **Data Parallel Configuration:**
|
||||
- `data_parallel_master_ip`: IP of the data parallel master (default: 127.0.0.1)
|
||||
- `data_parallel_master_port`: Port of the data parallel master (default: 29500)
|
||||
|
||||
### Notes on PyTorch Distributed
|
||||
|
||||
vLLM uses PyTorch's distributed features for some inter-node communication. For
|
||||
detailed information about PyTorch Distributed security considerations, please
|
||||
refer to the [PyTorch Security
|
||||
Guide](https://github.com/pytorch/pytorch/security/policy#using-distributed-features).
|
||||
|
||||
Key points from the PyTorch security guide:
|
||||
- PyTorch Distributed features are intended for internal communication only
|
||||
- They are not built for use in untrusted environments or networks
|
||||
- No authorization protocol is included for performance reasons
|
||||
- Messages are sent unencrypted
|
||||
- Connections are accepted from anywhere without checks
|
||||
|
||||
### Security Recommendations
|
||||
|
||||
1. **Network Isolation:**
|
||||
- Deploy vLLM nodes on a dedicated, isolated network
|
||||
- Use network segmentation to prevent unauthorized access
|
||||
- Implement appropriate firewall rules
|
||||
|
||||
2. **Configuration Best Practices:**
|
||||
- Always set `VLLM_HOST_IP` to a specific IP address rather than using defaults
|
||||
- Configure firewalls to only allow necessary ports between nodes
|
||||
|
||||
3. **Access Control:**
|
||||
- Restrict physical and network access to the deployment environment
|
||||
- Implement proper authentication and authorization for management interfaces
|
||||
- Follow the principle of least privilege for all system components
|
||||
|
||||
## Security and Firewalls: Protecting Exposed vLLM Systems
|
||||
|
||||
While vLLM is designed to allow unsafe network services to be isolated to
|
||||
private networks, there are components—such as dependencies and underlying
|
||||
frameworks—that may open insecure services listening on all network interfaces,
|
||||
sometimes outside of vLLM's direct control.
|
||||
|
||||
A major concern is the use of `torch.distributed`, which vLLM leverages for
|
||||
distributed communication, including when using vLLM on a single host. When vLLM
|
||||
uses TCP initialization (see [PyTorch TCP Initialization
|
||||
documentation](https://docs.pytorch.org/docs/stable/distributed.html#tcp-initialization)),
|
||||
PyTorch creates a `TCPStore` that, by default, listens on all network
|
||||
interfaces. This means that unless additional protections are put in place,
|
||||
these services may be accessible to any host that can reach your machine via any
|
||||
network interface.
|
||||
|
||||
**From a PyTorch perspective, any use of `torch.distributed` should be
|
||||
considered insecure by default.** This is a known and intentional behavior from
|
||||
the PyTorch team.
|
||||
|
||||
### Firewall Configuration Guidance
|
||||
|
||||
The best way to protect your vLLM system is to carefully configure a firewall to
|
||||
expose only the minimum network surface area necessary. In most cases, this
|
||||
means:
|
||||
|
||||
- **Block all incoming connections except to the TCP port the API server is
|
||||
listening on.**
|
||||
|
||||
- Ensure that ports used for internal communication (such as those for
|
||||
`torch.distributed` and KV cache transfer) are only accessible from trusted
|
||||
hosts or networks.
|
||||
|
||||
- Never expose these internal ports to the public internet or untrusted
|
||||
networks.
|
||||
|
||||
Consult your operating system or application platform documentation for specific
|
||||
firewall configuration instructions.
|
||||
|
||||
## Reporting Security Vulnerabilities
|
||||
|
||||
If you believe you have found a security vulnerability in vLLM, please report it following the project's security policy. For more information on how to report security issues and the project's security policy, please see the [vLLM Security Policy](https://github.com/vllm-project/vllm/blob/main/SECURITY.md).
|
||||
@@ -1,252 +0,0 @@
|
||||
(arch-overview)=
|
||||
|
||||
# Architecture Overview
|
||||
|
||||
This document provides an overview of the vLLM architecture.
|
||||
|
||||
:::{contents} Table of Contents
|
||||
:depth: 2
|
||||
:local: true
|
||||
:::
|
||||
|
||||
## Entrypoints
|
||||
|
||||
vLLM provides a number of entrypoints for interacting with the system. The
|
||||
following diagram shows the relationship between them.
|
||||
|
||||
:::{image} /assets/design/arch_overview/entrypoints.excalidraw.png
|
||||
:alt: Entrypoints Diagram
|
||||
:::
|
||||
|
||||
### LLM Class
|
||||
|
||||
The LLM class provides the primary Python interface for doing offline inference,
|
||||
which is interacting with a model without using a separate model inference
|
||||
server.
|
||||
|
||||
Here is a sample of `LLM` class usage:
|
||||
|
||||
```python
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
# Define a list of input prompts
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The capital of France is",
|
||||
"The largest ocean is",
|
||||
]
|
||||
|
||||
# Define sampling parameters
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
# Initialize the LLM engine with the OPT-125M model
|
||||
llm = LLM(model="facebook/opt-125m")
|
||||
|
||||
# Generate outputs for the input prompts
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
# Print the generated outputs
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
```
|
||||
|
||||
More API details can be found in the [Offline Inference](#offline-inference-api)
section of the API docs.
|
||||
|
||||
The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>.
|
||||
|
||||
### OpenAI-Compatible API Server
|
||||
|
||||
The second primary interface to vLLM is via its OpenAI-compatible API server.
|
||||
This server can be started using the `vllm serve` command.
|
||||
|
||||
```bash
|
||||
vllm serve <model>
|
||||
```
|
||||
|
||||
The code for the `vllm` CLI can be found in <gh-file:vllm/entrypoints/cli/main.py>.
|
||||
|
||||
Sometimes you may see the API server entrypoint used directly instead of via the
|
||||
`vllm` CLI command. For example:
|
||||
|
||||
```bash
|
||||
python -m vllm.entrypoints.openai.api_server --model <model>
|
||||
```
|
||||
|
||||
That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.
|
||||
|
||||
More details on the API server can be found in the [OpenAI-Compatible Server](#openai-compatible-server) document.
|
||||
|
||||
## LLM Engine
|
||||
|
||||
The `LLMEngine` and `AsyncLLMEngine` classes are central to the functioning of
|
||||
the vLLM system, handling model inference and asynchronous request processing.
|
||||
|
||||
:::{image} /assets/design/arch_overview/llm_engine.excalidraw.png
|
||||
:alt: LLMEngine Diagram
|
||||
:::
|
||||
|
||||
### LLMEngine
|
||||
|
||||
The `LLMEngine` class is the core component of the vLLM engine. It is
|
||||
responsible for receiving requests from clients and generating outputs from the
|
||||
model. The `LLMEngine` includes input processing, model execution (possibly
|
||||
distributed across multiple hosts and/or GPUs), scheduling, and output
|
||||
processing.
|
||||
|
||||
- **Input Processing**: Handles tokenization of input text using the specified
|
||||
tokenizer.
|
||||
- **Scheduling**: Chooses which requests are processed in each step.
|
||||
- **Model Execution**: Manages the execution of the language model, including
|
||||
distributed execution across multiple GPUs.
|
||||
- **Output Processing**: Processes the outputs generated by the model, decoding the
|
||||
token IDs from a language model into human-readable text.
|
||||
|
||||
The code for `LLMEngine` can be found in <gh-file:vllm/engine/llm_engine.py>.
|
||||
|
||||
### AsyncLLMEngine
|
||||
|
||||
The `AsyncLLMEngine` class is an asynchronous wrapper for the `LLMEngine` class.
|
||||
It uses `asyncio` to create a background loop that continuously processes
|
||||
incoming requests. The `AsyncLLMEngine` is designed for online serving, where it
|
||||
can handle multiple concurrent requests and stream outputs to clients.
|
||||
|
||||
The OpenAI-compatible API server uses the `AsyncLLMEngine`. There is also a demo
|
||||
API server that serves as a simpler example in <gh-file:vllm/entrypoints/api_server.py>.
|
||||
|
||||
The code for `AsyncLLMEngine` can be found in <gh-file:vllm/engine/async_llm_engine.py>.
|
||||
|
||||
## Worker
|
||||
|
||||
A worker is a process that runs the model inference. vLLM follows the common
|
||||
practice of using one process to control one accelerator device, such as GPUs.
|
||||
For example, if we use tensor parallelism of size 2 and pipeline parallelism of
|
||||
size 2, we will have 4 workers in total. Workers are identified by their
|
||||
`rank` and `local_rank`. `rank` is used for global orchestration, while
|
||||
`local_rank` is mainly used for assigning the accelerator device and accessing
|
||||
local resources such as the file system and shared memory.
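
As a rough illustration of these identifiers (a sketch only; the exact rank layout and device assignment are implementation details and may differ), the mapping for the 2 x 2 example above on a single 4-GPU node could look like this:

```python
# Illustrative sketch, not vLLM's actual code: how 4 workers arise from
# tensor parallel size 2 x pipeline parallel size 2, and how local_rank
# might map to a device index on one 4-GPU node.
tensor_parallel_size = 2
pipeline_parallel_size = 2
world_size = tensor_parallel_size * pipeline_parallel_size  # 4 workers

gpus_per_node = 4
for rank in range(world_size):
    local_rank = rank % gpus_per_node  # device index on this node
    print(f"worker rank={rank} -> local_rank={local_rank} (cuda:{local_rank})")
```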
|
||||
|
||||
## Model Runner
|
||||
|
||||
Every worker has one model runner object, responsible for loading and running
|
||||
the model. Much of the model execution logic resides here, such as preparing
|
||||
input tensors and capturing cudagraphs.
|
||||
|
||||
## Model
|
||||
|
||||
Every model runner object has one model object, which is the actual
|
||||
`torch.nn.Module` instance. See [huggingface_integration](#huggingface-integration) for how various
|
||||
configurations affect the class we ultimately get.
|
||||
|
||||
## Class Hierarchy
|
||||
|
||||
The following figure shows the class hierarchy of vLLM:
|
||||
|
||||
> :::{figure} /assets/design/hierarchy.png
|
||||
> :align: center
|
||||
> :alt: query
|
||||
> :width: 100%
|
||||
> :::
|
||||
|
||||
There are several important design choices behind this class hierarchy:
|
||||
|
||||
1\. **Extensibility**: All classes in the hierarchy accept a configuration object
|
||||
containing all the necessary information. The [VllmConfig](https://github.com/vllm-project/vllm/blob/d1c6799b8870e513bf4f2305cbf6cda9fc3d773b/vllm/config.py#L2036)
|
||||
class is the main configuration object that is passed around. The class
|
||||
hierarchy is quite deep, and every class needs to read the configuration it is
|
||||
interested in. By encapsulating all configurations in one object, we can easily
|
||||
pass the configuration object around and access the configuration we need.
|
||||
Suppose we want to add a new feature (this is often the case given how fast the
|
||||
field of LLM inference is evolving) that only touches the model runner. We will
|
||||
have to add a new configuration option in the `VllmConfig` class. Since we pass
|
||||
the whole config object around, we only need to add the configuration option to
|
||||
the `VllmConfig` class, and the model runner can access it directly. We don't
|
||||
need to change the constructor of the engine, worker, or model class to pass the
|
||||
new configuration option.
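
To make this concrete, here is an illustrative sketch (not vLLM's actual classes; the `enable_my_feature` option is hypothetical): a new option is added to a single config object, the intermediate components just forward that object, and only the component that cares about the option reads it.

```python
from dataclasses import dataclass, field

# Illustrative sketch only; names like enable_my_feature are hypothetical.
@dataclass
class MyFeatureConfig:
    enable_my_feature: bool = False

@dataclass
class ExampleVllmConfig:
    # ... many other sub-configs would live here ...
    my_feature_config: MyFeatureConfig = field(default_factory=MyFeatureConfig)

class ExampleWorker:
    def __init__(self, vllm_config: ExampleVllmConfig):
        # The worker does not need to know about the new option;
        # it just forwards the whole config object.
        self.model_runner = ExampleModelRunner(vllm_config)

class ExampleModelRunner:
    def __init__(self, vllm_config: ExampleVllmConfig):
        # Only the component that cares about the option reads it.
        self.use_my_feature = vllm_config.my_feature_config.enable_my_feature

worker = ExampleWorker(ExampleVllmConfig(MyFeatureConfig(enable_my_feature=True)))
print(worker.model_runner.use_my_feature)  # True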
|
||||
|
||||
2\. **Uniformity**: The model runner needs a unified interface to create and
|
||||
initialize the model. vLLM supports more than 50 types of popular open-source
|
||||
models. Each model has its own initialization logic. If the constructor
|
||||
signature varies with models, the model runner does not know how to call the
|
||||
constructor accordingly, without complicated and error-prone inspection logic.
|
||||
By making the constructor of the model class uniform, the model runner can
|
||||
easily create and initialize the model without knowing the specific model type.
|
||||
This is also useful for composing models. Vision-language models often consist
|
||||
of a vision model and a language model. By making the constructor uniform, we
|
||||
can easily create a vision model and a language model and compose them into a
|
||||
vision-language model.
|
||||
|
||||
:::{note}
|
||||
To support this change, all vLLM models' signatures have been updated to:
|
||||
|
||||
```python
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
```
|
||||
|
||||
To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one:
|
||||
|
||||
```python
|
||||
class MyOldModel(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
config,
|
||||
cache_config: Optional[CacheConfig] = None,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
lora_config: Optional[LoRAConfig] = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
...
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
class MyNewModel(MyOldModel):
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
config = vllm_config.model_config.hf_config
|
||||
cache_config = vllm_config.cache_config
|
||||
quant_config = vllm_config.quant_config
|
||||
lora_config = vllm_config.lora_config
|
||||
super().__init__(config, cache_config, quant_config, lora_config, prefix)
|
||||
|
||||
if __version__ >= "0.6.4":
|
||||
MyModel = MyNewModel
|
||||
else:
|
||||
MyModel = MyOldModel
|
||||
```
|
||||
|
||||
This way, the model can work with both old and new versions of vLLM.
|
||||
:::
|
||||
|
||||
3\. **Sharding and Quantization at Initialization**: Certain features require
|
||||
changing the model weights. For example, tensor parallelism needs to shard the
|
||||
model weights, and quantization needs to quantize the model weights. There are
|
||||
two possible ways to implement this feature. One way is to change the model
|
||||
weights after the model is initialized. The other way is to change the model
|
||||
weights during the model initialization. vLLM chooses the latter. The first
|
||||
approach is not scalable to large models. Suppose we want to run a 405B model
|
||||
(with roughly 810GB weights) with 16 H100 80GB GPUs. Ideally, every GPU should
|
||||
only load 50GB weights. If we change the model weights after the model is
|
||||
initialized, we need to load the full 810GB weights to every GPU and then shard
|
||||
the weights, leading to a huge memory overhead. Instead, if we shard the weights
|
||||
during the model initialization, every layer will only create a shard of the
|
||||
weights it needs, leading to a much smaller memory overhead. The same idea
|
||||
applies to quantization. Note that we also add an additional argument `prefix`
|
||||
to the model's constructor so that the model can initialize itself differently
|
||||
based on the prefix. This is useful for non-uniform quantization, where
|
||||
different parts of the model are quantized differently. The `prefix` is
|
||||
usually an empty string for the top-level model and a string like `"vision"`
|
||||
or `"language"` for the sub-models. In general, it matches the name of the
|
||||
module's state dict in the checkpoint file.
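
The following is a minimal sketch of the "shard at initialization" idea (illustrative only, not vLLM's actual parallel layers): the layer allocates only its shard of the weight at construction time, so the full weight matrix never needs to be materialized on any single GPU.

```python
import torch
import torch.nn as nn

# Illustrative sketch only (not vLLM's layer classes): a column-parallel linear
# layer that allocates just its shard of the weight when it is constructed,
# instead of loading the full weight and slicing it afterwards.
class ShardedColumnLinear(nn.Module):
    def __init__(self, in_features: int, out_features: int,
                 tp_rank: int, tp_size: int, prefix: str = ""):
        super().__init__()
        assert out_features % tp_size == 0
        self.prefix = prefix  # e.g. "language.layers.0.mlp.up_proj"
        shard_size = out_features // tp_size
        # Only 1/tp_size of the full weight is ever allocated on this rank.
        self.weight = nn.Parameter(torch.empty(shard_size, in_features))
        self.tp_rank = tp_rank

layer = ShardedColumnLinear(4096, 11008, tp_rank=0, tp_size=2)
print(layer.weight.shape)  # torch.Size([5504, 4096])
```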
|
||||
|
||||
One disadvantage of this design is that it is hard to write unit tests for
|
||||
individual components in vLLM because every component needs to be initialized by
|
||||
a complete config object. We solve this problem by providing a default
|
||||
initialization function that creates a default config object with all fields set
|
||||
to `None`. If the component we want to test only cares about a few fields in
|
||||
the config object, we can create a default config object and set the fields we
|
||||
care about. This way, we can test the component in isolation. Note that many
|
||||
tests in vLLM are end-to-end tests that test the whole system, so this is not a
|
||||
big problem.
|
||||
|
||||
In summary, the complete config object `VllmConfig` can be treated as an
|
||||
engine-level global state that is shared among all vLLM classes.
|
||||
@@ -1,42 +0,0 @@
|
||||
(design-automatic-prefix-caching)=
|
||||
|
||||
# Automatic Prefix Caching
|
||||
|
||||
The core idea of [PagedAttention](https://blog.vllm.ai/2023/06/20/vllm.html) is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand.
|
||||
|
||||
To automatically cache the KV cache, we utilize the following key observation: Each KV block can be uniquely identified by the tokens within the block and the tokens in the prefix before the block.
|
||||
|
||||
```text
|
||||
Block 1 Block 2 Block 3
|
||||
[A gentle breeze stirred] [the leaves as children] [laughed in the distance]
|
||||
Block 1: |<--- block tokens ---->|
|
||||
Block 2: |<------- prefix ------>| |<--- block tokens --->|
|
||||
Block 3: |<------------------ prefix -------------------->| |<--- block tokens ---->|
|
||||
```
|
||||
|
||||
In the example above, the KV cache in the first block can be uniquely identified with the tokens “A gentle breeze stirred”. The third block can be uniquely identified with the tokens in the block “laughed in the distance”, along with the prefix tokens “A gentle breeze stirred the leaves as children”. Therefore, we can build the following one-to-one mapping:
|
||||
|
||||
```text
|
||||
hash(prefix tokens + block tokens) <--> KV Block
|
||||
```
|
||||
|
||||
With this mapping, we can add another indirection in vLLM’s KV cache management. Previously, each sequence in vLLM maintained a mapping from their logical KV blocks to physical blocks. To achieve automatic caching of KV blocks, we map the logical KV blocks to their hash value and maintain a global hash table of all the physical blocks. In this way, all the KV blocks sharing the same hash value (e.g., shared prefix blocks across two requests) can be mapped to the same physical block and share the memory space.
|
||||
|
||||
This design achieves automatic prefix caching without needing to maintain a tree structure among the KV blocks. More specifically, all of the blocks are independent of each other and can be allocated and freed on their own, which enables us to manage the KV cache like an ordinary cache in an operating system.
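
As a rough illustration of this mapping (a simplified sketch, not vLLM's actual block manager), each block's hash covers the prefix tokens plus the block's own tokens, and a global table maps hashes to physical blocks:

```python
# Simplified sketch of hash-based KV block reuse; not vLLM's implementation.
BLOCK_SIZE = 4
physical_blocks = {}  # hash -> physical KV block

def block_hash(token_ids, block_idx):
    # Hash of all prefix tokens plus the tokens inside this block.
    end = (block_idx + 1) * BLOCK_SIZE
    return hash(tuple(token_ids[:end]))

def map_sequence(token_ids):
    hashes = []
    for block_idx in range(len(token_ids) // BLOCK_SIZE):
        h = block_hash(token_ids, block_idx)
        # Sequences sharing the same prefix map to the same physical block.
        physical_blocks.setdefault(h, object())
        hashes.append(h)
    return hashes

a = map_sequence([1, 2, 3, 4, 5, 6, 7, 8])
b = map_sequence([1, 2, 3, 4, 9, 10, 11, 12])
print(a[0] == b[0])  # True: the shared first block is cached and reused
print(a[1] == b[1])  # False: the second blocks differ
```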
|
||||
|
||||
## Generalized Caching Policy
|
||||
|
||||
Keeping all the KV blocks in a hash table enables vLLM to cache KV blocks from earlier requests to save memory and accelerate the computation of future requests. For example, if a new request shares the system prompt with the previous request, the KV cache of the shared prompt can directly be used for the new request without recomputation. However, the total KV cache space is limited and we have to decide which KV blocks to keep or evict when the cache is full.
|
||||
|
||||
Managing KV cache with a hash table allows us to implement flexible caching policies. As an example, in current vLLM, we implement the following eviction policy:
|
||||
|
||||
* When there are no free blocks left, we evict a KV block whose reference count (i.e., the number of current requests using the block) equals 0.
* If there are multiple blocks with a reference count of 0, we prioritize evicting the least recently used (LRU) block.
* If there are multiple blocks whose last access times are the same, we prioritize evicting the block that is at the end of the longest prefix (i.e., has the maximum number of blocks before it). A sketch of how these rules compose is shown after the note below.
|
||||
|
||||
Note that, when applied to models with full attention, this eviction policy effectively implements the same policy as [RadixAttention](https://lmsys.org/blog/2024-01-17-sglang/), which prioritizes evicting leaf nodes in the prefix tree that have a reference count of zero and were least recently used.
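
The following is a minimal sketch of how the three eviction rules above could be composed into a single selection function. It is illustrative only and does not mirror vLLM's actual evictor code; `ref_count`, `last_accessed`, and `num_prefix_blocks` are hypothetical per-block attributes.

```python
from dataclasses import dataclass

# Illustrative sketch of the eviction order described above; not vLLM's evictor.
@dataclass
class BlockMeta:
    ref_count: int          # number of requests currently using the block
    last_accessed: float    # timestamp of last access (for LRU)
    num_prefix_blocks: int  # how many blocks precede this one in its sequence

def pick_block_to_evict(blocks):
    # 1) only blocks with ref_count == 0 are evictable
    candidates = [b for b in blocks if b.ref_count == 0]
    # 2) prefer the least recently used block,
    # 3) break ties by evicting the block at the end of the longest prefix
    return min(candidates, key=lambda b: (b.last_accessed, -b.num_prefix_blocks))

blocks = [
    BlockMeta(ref_count=0, last_accessed=1.0, num_prefix_blocks=2),
    BlockMeta(ref_count=0, last_accessed=1.0, num_prefix_blocks=5),
    BlockMeta(ref_count=1, last_accessed=0.5, num_prefix_blocks=0),
]
print(pick_block_to_evict(blocks).num_prefix_blocks)  # 5
```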
|
||||
|
||||
However, the hash-based KV cache management gives us the flexibility to handle more complicated serving scenarios and implement more complicated eviction policies beyond the policy above:
|
||||
|
||||
* Multi-LoRA serving. When serving requests for multiple LoRA adapters, we can simply include the LoRA ID that the request is querying for in the hash of each KV block to enable caching across all adapters. In this way, we can jointly manage the KV blocks for different adapters, which simplifies the system implementation and improves the global cache hit rate and efficiency.
|
||||
* Multi-modal models. When the user input includes more than just discrete tokens, we can use different hashing methods to handle the caching of inputs of different modalities. For example, perceptual hashing for images to cache similar input images.
|
||||
@@ -1,36 +0,0 @@
|
||||
(huggingface-integration)=
|
||||
|
||||
# Integration with HuggingFace
|
||||
|
||||
This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run `vllm serve`.
|
||||
|
||||
Let's say we want to serve the popular Qwen model by running `vllm serve Qwen/Qwen2-7B`.
|
||||
|
||||
1. The `model` argument is `Qwen/Qwen2-7B`. vLLM determines whether this model exists by checking for the corresponding config file `config.json`. See this [code snippet](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L162-L182) for the implementation. Within this process:
|
||||
|
||||
- If the `model` argument corresponds to an existing local path, vLLM will load the config file directly from this path.
|
||||
- If the `model` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the `model` argument as the model name and the `--revision` argument as the revision. See [their website](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhome) for more information on how the HuggingFace cache works.
|
||||
- If the `model` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to [this function](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L91) for the implementation. The input arguments include the `model` argument as the model name, the `--revision` argument as the revision, and the environment variable `HF_TOKEN` as the token to access the model hub. In our case, vLLM will download the [config.json](https://huggingface.co/Qwen/Qwen2-7B/blob/main/config.json) file.
|
||||
|
||||
2. After confirming the existence of the model, vLLM loads its config file and converts it into a dictionary. See this [code snippet](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L185-L186) for the implementation.
|
||||
|
||||
3. Next, vLLM [inspects](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L189) the `model_type` field in the config dictionary to [generate](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L190-L216) the config object to use. There are some `model_type` values that vLLM directly supports; see [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L48) for the list. If the `model_type` is not in the list, vLLM will use [AutoConfig.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoConfig.from_pretrained) to load the config class, with `model`, `--revision`, and `--trust_remote_code` as the arguments. Please note that:
|
||||
|
||||
- HuggingFace also has its own logic to determine the config class to use. It will again use the `model_type` field to search for the class name in the transformers library; see [here](https://github.com/huggingface/transformers/tree/main/src/transformers/models) for the list of supported models. If the `model_type` is not found, HuggingFace will use the `auto_map` field from the config JSON file to determine the class name. Specifically, it is the `AutoConfig` field under `auto_map`. See [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2.5/blob/main/config.json) for an example.
|
||||
- The `AutoConfig` field under `auto_map` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the `from_pretrained` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when `--trust_remote_code` is enabled.
|
||||
|
||||
4. Subsequently, vLLM applies some historical patches to the config object. These are mostly related to RoPE configuration; see [here](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/config.py#L244) for the implementation.
|
||||
|
||||
5. Finally, vLLM can reach the model class we want to initialize. vLLM uses the `architectures` field in the config object to determine the model class to initialize, as it maintains the mapping from architecture name to model class in [its registry](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/model_executor/models/registry.py#L80). If the architecture name is not found in the registry, it means this model architecture is not supported by vLLM. For `Qwen/Qwen2-7B`, the `architectures` field is `["Qwen2ForCausalLM"]`, which corresponds to the `Qwen2ForCausalLM` class in [vLLM's code](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/model_executor/models/qwen2.py#L364). This class will initialize itself depending on various configs.
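
The config-resolution part of this flow can be approximated with plain `transformers` calls. The sketch below is a simplified stand-in for what vLLM does internally and only covers the happy path where `AutoConfig` can resolve the model directly.

```python
# Simplified sketch of the config resolution described above, using plain
# HuggingFace transformers; vLLM's real logic adds caching, historical patches,
# and support for model types that transformers does not know about.
from transformers import AutoConfig

model = "Qwen/Qwen2-7B"
config = AutoConfig.from_pretrained(model, revision="main", trust_remote_code=False)

print(config.model_type)     # "qwen2"
print(config.architectures)  # ["Qwen2ForCausalLM"], used to pick the vLLM model class
```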
|
||||
|
||||
Beyond that, there are two more things vLLM depends on HuggingFace for.
|
||||
|
||||
1. **Tokenizer**: vLLM uses the tokenizer from HuggingFace to tokenize the input text. The tokenizer is loaded using [AutoTokenizer.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained) with the `model` argument as the model name and the `--revision` argument as the revision. It is also possible to use a tokenizer from another model by specifying the `--tokenizer` argument in the `vllm serve` command. Other relevant arguments are `--tokenizer-revision` and `--tokenizer-mode`. Please check HuggingFace's documentation for the meaning of these arguments. This part of the logic can be found in the [get_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L87) function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in [get_cached_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L24).
|
||||
|
||||
2. **Model weight**: vLLM downloads the model weight from the HuggingFace model hub using the `model` argument as the model name and the `--revision` argument as the revision. vLLM provides the argument `--load-format` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass `--load-format dummy` to skip downloading the weights.
|
||||
|
||||
- It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the [documentation](https://huggingface.co/docs/safetensors/en/index) for more information on the safetensors format. This part of the logic can be found [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/model_executor/model_loader/loader.py#L385).
|
||||
|
||||
This completes the integration between vLLM and HuggingFace.
|
||||
|
||||
In summary, vLLM reads the config file `config.json`, tokenizer, and model weight from the HuggingFace model hub or a local directory. It uses the config class from either vLLM, HuggingFace transformers, or loads the config class from the model's repository.
|
||||
@@ -1,529 +0,0 @@
|
||||
(design-paged-attention)=
|
||||
|
||||
# vLLM Paged Attention
|
||||
|
||||
- Currently, vLLM utilizes its own implementation of a multi-head query
  attention kernel (`csrc/attention/attention_kernels.cu`).
  This kernel is designed to be compatible with
  vLLM's paged KV caches, where the key and value cache are stored in
  separate blocks (note that this block concept differs from the GPU
  thread block; throughout this document, I will refer to the vLLM paged
  attention block as "block" and to the GPU thread block as
  "thread block").
|
||||
- To achieve high performance, this kernel relies on a specially
|
||||
designed memory layout and access method, specifically when threads
|
||||
read data from global memory to shared memory. The purpose of this
|
||||
document is to provide a high-level explanation of the kernel
|
||||
implementation step by step, aiding those who wish to learn about the
|
||||
vLLM multi-head query attention kernel. After going through this
|
||||
  document, users will likely have a better understanding and find it easier
  to follow the actual implementation.
|
||||
- Please note that this document may not cover all details, such as how
|
||||
to calculate the correct index for the corresponding data or the dot
|
||||
multiplication implementation. However, after reading this document
|
||||
and becoming familiar with the high-level logic flow, it should be
|
||||
easier for you to read the actual code and understand the details.
|
||||
|
||||
## Inputs
|
||||
|
||||
- The kernel function takes a list of arguments for the current thread
|
||||
to perform its assigned work. The three most important arguments are
|
||||
the input pointers `q`, `k_cache`, and `v_cache`, which point
|
||||
to query, key, and value data on global memory that need to be read
|
||||
and processed. The output pointer `out` points to global memory
|
||||
where the result should be written. These four pointers actually
|
||||
refer to multi-dimensional arrays, but each thread only accesses the
|
||||
portion of data assigned to it. I have omitted all other runtime
|
||||
parameters here for simplicity.
|
||||
|
||||
```cpp
|
||||
template<
|
||||
typename scalar_t,
|
||||
int HEAD_SIZE,
|
||||
int BLOCK_SIZE,
|
||||
int NUM_THREADS,
|
||||
int PARTITION_SIZE = 0>
|
||||
__device__ void paged_attention_kernel(
|
||||
... // Other side args.
|
||||
scalar_t* __restrict__ out,             // [num_seqs, num_heads, max_num_partitions, head_size]
|
||||
const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
|
||||
const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x]
|
||||
const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size]
|
||||
... // Other side args.
|
||||
)
|
||||
```
|
||||
|
||||
- There is also a list of template arguments above the function
|
||||
signature that are determined during compilation time. `scalar_t`
|
||||
represents the data type of the query, key, and value data elements,
|
||||
such as FP16. `HEAD_SIZE` indicates the number of elements in each
|
||||
head. `BLOCK_SIZE` refers to the number of tokens in each block.
|
||||
`NUM_THREADS` denotes the number of threads in each thread block.
|
||||
`PARTITION_SIZE` represents the number of tensor parallel GPUs (For
|
||||
simplicity, we assume this is 0 and tensor parallel is disabled).
|
||||
|
||||
- With these arguments, we need to perform a sequence of preparations.
|
||||
This includes calculating the current head index, block index, and
|
||||
other necessary variables. However, for now, we can ignore these
|
||||
preparations and proceed directly to the actual calculations. It will
|
||||
be easier to understand them once we grasp the entire flow.
|
||||
|
||||
## Concepts
|
||||
|
||||
- Just before we dive into the calculation flow, I want to describe a
|
||||
few concepts that are needed for later sections. However, you may
|
||||
skip this section and return later if you encounter any confusing
|
||||
terminologies.
|
||||
- **Sequence**: A sequence represents a client request. For example,
|
||||
the data pointed to by `q` has a shape of
|
||||
  `[num_seqs, num_heads, head_size]`. That means there are a total of
  `num_seqs` query sequences pointed to by `q`. Since this
  kernel is a single query attention kernel, each sequence only has one
  query token. Hence, `num_seqs` equals the total number of tokens
  that are processed in the batch.
|
||||
- **Context**: The context consists of the generated tokens from the
|
||||
sequence. For instance, `["What", "is", "your"]` are the context
|
||||
tokens, and the input query token is `"name"`. The model might
|
||||
generate the token `"?"`.
|
||||
- **Vec**: The vec is a list of elements that are fetched and
|
||||
calculated together. For query and key data, the vec size
|
||||
(`VEC_SIZE`) is determined so that each thread group can fetch and
|
||||
calculate 16 bytes of data at a time. For value data, the vec size
|
||||
(`V_VEC_SIZE`) is determined so that each thread can fetch and
|
||||
calculate 16 bytes of data at a time. For example, if the
|
||||
`scalar_t` is FP16 (2 bytes) and `THREAD_GROUP_SIZE` is 2, the
|
||||
  `VEC_SIZE` will be 4, while the `V_VEC_SIZE` will be 8 (see the sketch after this list).
|
||||
- **Thread group**: The thread group is a small group of
|
||||
threads(`THREAD_GROUP_SIZE`) that fetches and calculates one
|
||||
query token and one key token at a time. Each thread handles only a
|
||||
portion of the token data. The total number of elements processed by
|
||||
one thread group is referred as `x`. For example, if the thread
|
||||
group contains 2 threads and the head size is 8, then thread 0
|
||||
handles the query and key elements at index 0, 2, 4, 6, while thread
|
||||
1 handles the elements at index 1, 3, 5, 7.
|
||||
- **Block**: The key and value cache data in vLLM are split into
|
||||
blocks. Each block stores data for a fixed number(`BLOCK_SIZE`)
|
||||
of tokens at one head. Each block may contain only a portion of the
|
||||
whole context tokens. For example, if the block size is 16 and the
|
||||
head size is 128, then for one head, one block can store 16 * 128 =
|
||||
2048 elements.
|
||||
- **Warp**: A warp is a group of 32 threads(`WARP_SIZE`) that
|
||||
execute simultaneously on a stream multiprocessor (SM). In this
|
||||
kernel, each warp processes the calculation between one query token
|
||||
and key tokens of one entire block at a time (it may process multiple
|
||||
blocks in multiple iterations). For example, if there are 4 warps and
|
||||
6 blocks for one context, the assignment would be like warp 0 handles
|
||||
the 0th, 4th blocks, warp 1 handles the 1st, 5th blocks, warp 2
|
||||
handles the 2nd block and warp 3 handles the 3rd block.
|
||||
- **Thread block**: A thread block is a group of
|
||||
threads(`NUM_THREADS`) that can access the same shared memory.
|
||||
Each thread block contains multiple warps(`NUM_WARPS`), and in
|
||||
this kernel, each thread block processes the calculation between one
|
||||
query token and key tokens of a whole context.
|
||||
- **Grid**: A grid is a collection of thread blocks and defines the
|
||||
shape of the collection. In this kernel, the shape is
|
||||
`(num_heads, num_seqs, max_num_partitions)`. Therefore, each thread
|
||||
block only handles the calculation for one head, one sequence, and
|
||||
one partition.
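
As a quick worked example of the vec sizes mentioned in the **Vec** concept above (a sketch of the sizing arithmetic only, not the kernel's actual code):

```python
# Worked example of the vec sizing described in the "Vec" concept above.
# Assumes FP16 elements (2 bytes) and a thread group of 2 threads.
scalar_bytes = 2        # sizeof(scalar_t) for FP16
THREAD_GROUP_SIZE = 2

# Query/key: each *thread group* fetches 16 bytes at a time.
VEC_SIZE = 16 // (THREAD_GROUP_SIZE * scalar_bytes)  # -> 4 elements per vec
# Value: each *thread* fetches 16 bytes at a time.
V_VEC_SIZE = 16 // scalar_bytes                      # -> 8 elements per vec

print(VEC_SIZE, V_VEC_SIZE)  # 4 8
```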
|
||||
|
||||
## Query
|
||||
|
||||
- This section will introduce how query data is stored in memory and
|
||||
fetched by each thread. As mentioned above, each thread group fetches
|
||||
one query token data, while each thread itself only handles a part of
|
||||
one query token data. Within each warp, every thread group will fetch
|
||||
the same query token data, but will multiply it with different key
|
||||
token data.
|
||||
|
||||
```cpp
|
||||
const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
|
||||
```
|
||||
|
||||
:::{figure} ../../assets/kernel/query.png
|
||||
:align: center
|
||||
:alt: query
|
||||
:width: 70%
|
||||
|
||||
Query data of one token at one head
|
||||
:::
|
||||
|
||||
- Each thread defines its own `q_ptr` which points to the assigned
|
||||
query token data on global memory. For example, if `VEC_SIZE` is 4
|
||||
and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains
|
||||
total of 128 elements divided into 128 / 4 = 32 vecs.
|
||||
|
||||
:::{figure} ../../assets/kernel/q_vecs.png
|
||||
:align: center
|
||||
:alt: q_vecs
|
||||
:width: 70%
|
||||
|
||||
`q_vecs` for one thread group
|
||||
:::
|
||||
|
||||
```cpp
|
||||
__shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
|
||||
```
|
||||
|
||||
- Next, we need to read the global memory data pointed to by `q_ptr`
|
||||
into shared memory as `q_vecs`. It is important to note that each
|
||||
  vec is assigned to a different row. For example, if the
|
||||
`THREAD_GROUP_SIZE` is 2, thread 0 will handle the 0th row vecs,
|
||||
while thread 1 handles the 1st row vecs. By reading the query data in
|
||||
this way, neighboring threads like thread 0 and thread 1 can read
|
||||
  neighboring memory, achieving memory coalescing to improve
|
||||
performance.
|
||||
|
||||
## Key
|
||||
|
||||
- Similar to the "Query" section, this section introduces memory layout
|
||||
  and assignment for keys. While each thread group only handles one
  query token per kernel run, it may handle multiple key tokens across
|
||||
multiple iterations. Meanwhile, each warp will process multiple blocks
|
||||
of key tokens in multiple iterations, ensuring that all context
|
||||
tokens are processed by the entire thread group after the kernel run.
|
||||
In this context, "handle" refers to performing the dot multiplication
|
||||
between query data and key data.
|
||||
|
||||
```cpp
|
||||
const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride
|
||||
+ kv_head_idx * kv_head_stride
|
||||
+ physical_block_offset * x;
|
||||
```
|
||||
|
||||
- Unlike `q_ptr`, `k_ptr` in each thread will point to a different
  key token at different iterations. As shown above, `k_ptr`
  points to key token data in `k_cache` at the assigned block,
  assigned head, and assigned token.
|
||||
|
||||
:::{figure} ../../assets/kernel/key.png
|
||||
:align: center
|
||||
:alt: key
|
||||
:width: 70%
|
||||
|
||||
Key data of all context tokens at one head
|
||||
:::
|
||||
|
||||
- The diagram above illustrates the memory layout for key data. It
|
||||
assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is
|
||||
8, `THREAD_GROUP_SIZE` is 2, and there are a total of 4 warps. Each
|
||||
rectangle represents all the elements for one key token at one head,
|
||||
which will be processed by one thread group. The left half shows the
|
||||
total 16 blocks of key token data for warp 0, while the right half
|
||||
represents the remaining key token data for other warps or
|
||||
iterations. Inside each rectangle, there are a total 32 vecs (128
|
||||
elements for one token) that will be processed by 2 threads (one
|
||||
thread group) separately.
|
||||
|
||||
:::{figure} ../../assets/kernel/k_vecs.png
|
||||
:align: center
|
||||
:alt: k_vecs
|
||||
:width: 70%
|
||||
|
||||
`k_vecs` for one thread
|
||||
:::
|
||||
|
||||
```cpp
|
||||
K_vec k_vecs[NUM_VECS_PER_THREAD]
|
||||
```
|
||||
|
||||
- Next, we need to read the key token data from `k_ptr` and store
|
||||
them on register memory as `k_vecs`. We use register memory for
|
||||
`k_vecs` because it will only be accessed by one thread once,
|
||||
whereas `q_vecs` will be accessed by multiple threads multiple
|
||||
times. Each `k_vecs` will contain multiple vectors for later
|
||||
calculation. Each vec will be set at each inner iteration. The
|
||||
assignment of vecs allows neighboring threads in a warp to read
|
||||
neighboring memory together, which again promotes the memory
|
||||
coalescing. For instance, thread 0 will read vec 0, while thread 1
|
||||
will read vec 1. In the next inner loop, thread 0 will read vec 2,
|
||||
while thread 1 will read vec 3, and so on.
|
||||
|
||||
- You may still be a little confused about the overall flow. Don't
|
||||
worry, please keep reading the next "QK" section. It will illustrate
|
||||
the query and key calculation flow in a clearer and higher-level
|
||||
manner.
|
||||
|
||||
## QK
|
||||
|
||||
- As shown in the pseudo code below, before the entire for loop block, we
|
||||
fetch the query data for one token and store it in `q_vecs`. Then,
|
||||
in the outer for loop, we iterate through different `k_ptrs` that
|
||||
point to different tokens and prepare the `k_vecs` in the inner for
|
||||
loop. Finally, we perform the dot multiplication between the
|
||||
`q_vecs` and each `k_vecs`.
|
||||
|
||||
```cpp
|
||||
q_vecs = ...
|
||||
for ... {
|
||||
k_ptr = ...
|
||||
for ... {
|
||||
k_vecs[i] = ...
|
||||
}
|
||||
...
|
||||
float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs[thread_group_offset], k_vecs);
|
||||
}
|
||||
```
|
||||
|
||||
- As mentioned before, each thread only fetches part of the
  query and key token data at a time. However, a cross
  thread group reduction happens in `Qk_dot<>::dot`, so the `qk`
  returned here is not just the dot product of a part of the query and key
  token data, but actually the full result over the entire query and
  key token data.
|
||||
|
||||
- For example, if the value of `HEAD_SIZE` is 128 and
|
||||
`THREAD_GROUP_SIZE` is 2, each thread's `k_vecs` will contain
|
||||
  a total of 64 elements. However, the returned `qk` is actually the
|
||||
result of dot multiplication between 128 query elements and 128 key
|
||||
elements. If you want to learn more about the details of the dot
|
||||
multiplication and reduction, you may refer to the implementation of
|
||||
`Qk_dot<>::dot`. However, for the sake of simplicity, I will not
|
||||
cover it in this document.
|
||||
|
||||
## Softmax
|
||||
|
||||
- Next, we need to calculate the normalized softmax for all `qk`s,
|
||||
  as shown below, where each $x$ represents a `qk`. To do this,
|
||||
we must obtain the reduced value of `qk_max`($m(x)$) and
|
||||
the `exp_sum`($\ell(x)$) of all `qk`s. The reduction
|
||||
should be performed across the entire thread block, encompassing
|
||||
results between the query token and all context key tokens.
|
||||
|
||||
:::{math}
|
||||
:nowrap: true
|
||||
|
||||
\begin{gather*}
|
||||
m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\
|
||||
\quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)}
|
||||
\end{gather*}
|
||||
:::
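
As a small numerical illustration of these formulas (plain Python, independent of the kernel code):

```python
import math

# Numerically stable softmax, following the formulas above:
# m(x) = max_i x_i, f(x) = exp(x - m(x)), l(x) = sum_i f(x)_i.
qks = [2.0, 1.0, 0.5]            # example qk values for one sequence
qk_max = max(qks)                # m(x), reduced across the thread block
exps = [math.exp(x - qk_max) for x in qks]   # f(x)
exp_sum = sum(exps)              # l(x)
softmax = [v / exp_sum for v in exps]        # softmax(x)

print([round(v, 3) for v in softmax])  # [0.629, 0.231, 0.14]
```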
|
||||
|
||||
### `qk_max` and `logits`
|
||||
|
||||
- Right after we get the `qk` result, we can set the temporary
  `logits` result to `qk` (in the end, `logits` should
  store the normalized softmax result). We can also compare and collect
  the `qk_max` over all `qk`s that are calculated by the current
  thread group.
|
||||
|
||||
```cpp
|
||||
if (thread_group_offset == 0) {
|
||||
const bool mask = token_idx >= context_len;
|
||||
logits[token_idx - start_token_idx] = mask ? 0.f : qk;
|
||||
qk_max = mask ? qk_max : fmaxf(qk_max, qk);
|
||||
}
|
||||
```
|
||||
|
||||
- Please note that the `logits` here is on shared memory, so each
|
||||
thread group will set the fields for its own assigned context tokens.
|
||||
Overall, the size of logits should be number of context tokens.
|
||||
|
||||
```cpp
|
||||
for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) {
|
||||
qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
|
||||
}
|
||||
|
||||
if (lane == 0) {
|
||||
red_smem[warp_idx] = qk_max;
|
||||
}
|
||||
```
|
||||
|
||||
- Then we need to get the reduced `qk_max` across each warp. The main
|
||||
  idea is to let the threads in a warp communicate with each other and
  get the final max `qk`.
|
||||
|
||||
```cpp
|
||||
for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
|
||||
qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
|
||||
}
|
||||
qk_max = VLLM_SHFL_SYNC(qk_max, 0);
|
||||
```
|
||||
|
||||
- Finally, we can get the reduced `qk_max` for the whole thread block by
  comparing the `qk_max` from all warps in this thread block. Then we
  need to broadcast the final result to each thread.
|
||||
|
||||
### `exp_sum`
|
||||
|
||||
- Similar to `qk_max`, we need to get the reduced sum value from the
|
||||
entire thread block too.
|
||||
|
||||
```cpp
|
||||
for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
|
||||
float val = __expf(logits[i] - qk_max);
|
||||
logits[i] = val;
|
||||
exp_sum += val;
|
||||
}
|
||||
...
|
||||
exp_sum = block_sum<NUM_WARPS>(&red_smem[NUM_WARPS], exp_sum);
|
||||
```
|
||||
|
||||
- First, sum all the exp values within each thread group and, at the same time,
  convert each entry of `logits` from `qk` to `exp(qk - qk_max)`.
  Note that the `qk_max` here is already the max `qk` across the
  whole thread block. Then we can reduce `exp_sum`
  across the whole thread block, just like `qk_max`.
|
||||
|
||||
```cpp
|
||||
const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f);
|
||||
for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
|
||||
logits[i] *= inv_sum;
|
||||
}
|
||||
```
|
||||
|
||||
- Finally, with the reduced `qk_max` and `exp_sum`, we can obtain
|
||||
the final normalized softmax result as `logits`. This `logits`
|
||||
variable will be used for dot multiplication with the value data in
|
||||
later steps. Now, it should store the normalized softmax result of
|
||||
`qk` for all assigned context tokens.
|
||||
|
||||
## Value
|
||||
|
||||
:::{figure} ../../assets/kernel/value.png
|
||||
:align: center
|
||||
:alt: value
|
||||
:width: 70%
|
||||
|
||||
Value data of all context tokens at one head
|
||||
:::
|
||||
|
||||
:::{figure} ../../assets/kernel/logits_vec.png
|
||||
:align: center
|
||||
:alt: logits_vec
|
||||
:width: 50%
|
||||
|
||||
`logits_vec` for one thread
|
||||
:::
|
||||
|
||||
:::{figure} ../../assets/kernel/v_vec.png
|
||||
:align: center
|
||||
:alt: v_vec
|
||||
:width: 70%
|
||||
|
||||
List of `v_vec` for one thread
|
||||
:::
|
||||
|
||||
- Now we need to retrieve the value data and perform dot multiplication
  with `logits`. Unlike query and key, there is no thread-group
  concept for value data. As shown in the diagram, unlike the key token
  memory layout, elements from the same column correspond to the same
  value token. One block of value data has `HEAD_SIZE` rows and
  `BLOCK_SIZE` columns, which are split into multiple `v_vec`s.
|
||||
|
||||
- Each thread always fetches `V_VEC_SIZE` elements from the same
  `V_VEC_SIZE` tokens at a time. As a result, a single thread
  retrieves multiple `v_vec`s from different rows and the same
  columns through multiple inner iterations. Each `v_vec` needs to be
  dot multiplied with the corresponding `logits_vec`, which is also
  `V_VEC_SIZE` elements from `logits`. Overall, with multiple inner
  iterations, each warp processes one block of value tokens, and with
  multiple outer iterations, the value tokens of the whole context are
  processed.
|
||||
|
||||
```cpp
|
||||
float accs[NUM_ROWS_PER_THREAD];
|
||||
for ... { // Iteration over different blocks.
|
||||
logits_vec = ...
|
||||
for ... { // Iteration over different rows.
|
||||
v_vec = ...
|
||||
...
|
||||
accs[i] += dot(logits_vec, v_vec);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
- As shown in the above pseudocode, in the outer loop, similar to
|
||||
`k_ptr`, `logits_vec` iterates over different blocks and reads
|
||||
`V_VEC_SIZE` elements from `logits`. In the inner loop, each
|
||||
thread reads `V_VEC_SIZE` elements from the same tokens as a
|
||||
`v_vec` and performs dot multiplication. It is important to note
|
||||
that in each inner iteration, the thread fetches different head
|
||||
position elements for the same tokens. The dot result is then
|
||||
accumulated in `accs`. Therefore, each entry of `accs` is mapped
|
||||
to a head position assigned to the current thread.
|
||||
|
||||
- For example, if `BLOCK_SIZE` is 16 and `V_VEC_SIZE` is 8, each
  thread fetches 8 value elements for 8 tokens at a time. Each element
  is from a different token at the same head position. If `HEAD_SIZE`
  is 128 and `WARP_SIZE` is 32, each inner loop lets a warp fetch
  `WARP_SIZE * V_VEC_SIZE = 256` elements, so a warp needs a total of
  128 * 16 / 256 = 8 inner iterations to handle a whole block of value
  tokens. Each `accs` in each thread then contains 8 elements
  accumulated at 8 different head positions. For thread 0, `accs` will
  have 8 elements, the 0th, 32nd, …, 224th elements of a value head,
  each accumulated from the 8 assigned tokens.
|
||||
|
||||
## LV
|
||||
|
||||
- Now, we need to perform reduction for `accs` within each warp. This
|
||||
process allows each thread to accumulate the `accs` for the
|
||||
assigned head positions of all tokens in one block.
|
||||
|
||||
```cpp
|
||||
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
|
||||
float acc = accs[i];
|
||||
for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) {
|
||||
acc += VLLM_SHFL_XOR_SYNC(acc, mask);
|
||||
}
|
||||
accs[i] = acc;
|
||||
}
|
||||
```
|
||||
|
||||
- Next, we perform reduction for `accs` across all warps, allowing
  each thread to have the accumulation of `accs` for the assigned
  head positions of all context tokens. Please note that each `accs`
  in every thread only stores the accumulation for a portion of the
  elements of the entire head for all context tokens. Overall, though,
  all results for the output have been calculated; they are just stored
  in different threads' register memory.
|
||||
|
||||
```cpp
|
||||
float* out_smem = reinterpret_cast<float*>(shared_mem);
|
||||
for (int i = NUM_WARPS; i > 1; i /= 2) {
|
||||
// Upper warps write to shared memory.
|
||||
...
|
||||
float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
|
||||
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
|
||||
...
|
||||
dst[row_idx] = accs[i];
|
||||
}
|
||||
|
||||
// Lower warps update the output.
|
||||
const float* src = &out_smem[warp_idx * HEAD_SIZE];
|
||||
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
|
||||
...
|
||||
accs[i] += src[row_idx];
|
||||
}
|
||||
|
||||
// Write out the accs.
|
||||
}
|
||||
```
|
||||
|
||||
## Output
|
||||
|
||||
- Now we can write all of the calculated results from local register
  memory to the final output in global memory.
|
||||
|
||||
```cpp
|
||||
scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
|
||||
+ head_idx * max_num_partitions * HEAD_SIZE
|
||||
+ partition_idx * HEAD_SIZE;
|
||||
```
|
||||
|
||||
- First, we need to define the `out_ptr` variable, which points to
|
||||
the start address of the assigned sequence and assigned head.
|
||||
|
||||
```cpp
|
||||
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
|
||||
const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
|
||||
if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
|
||||
from_float(*(out_ptr + row_idx), accs[i]);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
- Finally, we need to iterate over different assigned head positions
|
||||
and write out the corresponding accumulated result based on the
|
||||
`out_ptr`.
|
||||
|
||||
(mm-processing)=
|
||||
|
||||
# Multi-Modal Data Processing
|
||||
|
||||
To enable various optimizations in vLLM such as [chunked prefill](#chunked-prefill) and [prefix caching](#automatic-prefix-caching), we use {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` to provide the correspondence between placeholder feature tokens (e.g. `<image>`) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor.
|
||||
|
||||
Here are the main features of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor`:
|
||||
|
||||
## Prompt Update Detection
|
||||
|
||||
One of the main responsibilities of HF processor is to update the prompt with placeholder tokens. For example:
|
||||
|
||||
- Insert feature placeholder tokens (e.g. `<image><image>...<image>`, the number of which equals to the feature size) at the start of the string.
|
||||
- Replace existing input placeholder tokens (e.g. `<image>` for a single image) with feature placeholder tokens (e.g. `<image><image>...<image>`, the number of which equals to the feature size).
|
||||
|
||||
The information about which tokens have been updated is key to finding the correspondence between placeholder feature tokens and multi-modal inputs.
|
||||
|
||||
In vLLM, this information is specified using {class}`~vllm.multimodal.processing.PromptUpdate` in {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates`. We can automatically detect whether HF has updated the prompt by checking the existence of the updated tokens.
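
As a rough, framework-agnostic sketch of the idea (this is not the actual vLLM or HF API), the snippet below expands a single `<image>` placeholder into `feature_size` feature tokens and then detects whether that expansion has already been applied:

```python
FEATURE_TOKEN = "<image>"

def update_prompt(tokens: list[str], feature_size: int) -> list[str]:
    """Replace each input placeholder with `feature_size` feature placeholder tokens."""
    updated: list[str] = []
    for tok in tokens:
        updated.extend([FEATURE_TOKEN] * feature_size if tok == FEATURE_TOKEN else [tok])
    return updated

def is_already_updated(tokens: list[str], feature_size: int) -> bool:
    """Detect the update by looking for a run of `feature_size` placeholder tokens."""
    run = 0
    for tok in tokens:
        run = run + 1 if tok == FEATURE_TOKEN else 0
        if run >= feature_size:
            return True
    return False
```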
|
||||
|
||||
## Tokenized Prompt Inputs
|
||||
|
||||
To enable tokenization in a separate process, we support passing input token IDs alongside multi-modal data.
|
||||
|
||||
### The problem
|
||||
|
||||
Consider that HF processors follow these main steps:
|
||||
|
||||
1. Tokenize the text
|
||||
2. Process multi-modal inputs
|
||||
3. Perform prompt updates
|
||||
|
||||
And we require that:
|
||||
|
||||
- For text + multi-modal inputs, apply all steps 1--3.
|
||||
- For tokenized + multi-modal inputs, apply only steps 2--3.
|
||||
|
||||
How can we achieve this without rewriting HF processors? We can try to call the HF processor several times on different inputs:
|
||||
|
||||
- For text + multi-modal inputs, simply call the HF processor directly.
|
||||
- For tokenized + multi-modal inputs, call the processor only on the multi-modal inputs.
|
||||
|
||||
While HF processors support text + multi-modal inputs natively, this is not so for tokenized + multi-modal inputs: an error is thrown if the number of input placeholder tokens does not correspond to the number of multi-modal inputs.
|
||||
|
||||
Moreover, since the tokenized text has not passed through the HF processor, we have to apply Step 3 by ourselves to keep the output tokens and multi-modal data consistent with each other.
|
||||
|
||||
(mm-dummy-text)=
|
||||
|
||||
### Dummy text
|
||||
|
||||
We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text`. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data.
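
A minimal sketch of such a builder is shown below; the method name follows `get_dummy_text`, but the exact base-class signature here is an assumption rather than the real interface:

```python
from collections.abc import Mapping

class MyDummyInputsBuilder:  # hypothetical; real models subclass BaseDummyInputsBuilder
    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        # One input placeholder per multi-modal item, so the HF processor sees a
        # prompt that is consistent with the number of multi-modal inputs.
        return "<image>" * mm_counts.get("image", 0)
```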
|
||||
|
||||
(mm-automatic-prompt-updating)=
|
||||
|
||||
### Automatic prompt updating
|
||||
|
||||
We address the second issue by implementing model-agnostic code in
|
||||
{meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_prompt_updates` to automatically update the prompt with feature placeholder tokens based on the specification outputted by {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates`.
|
||||
|
||||
### Summary
|
||||
|
||||
With the help of dummy text and automatic prompt updating, our multi-modal processor can finally accept both text and token prompts with multi-modal data. The detailed logic is shown in {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_main`.
|
||||
|
||||
## Processor Output Caching
|
||||
|
||||
Some HF processors, such as the one for Qwen2-VL, are [very slow](gh-issue:9238). To alleviate this problem, we cache the multi-modal outputs of HF processor to avoid processing the same multi-modal input (e.g. image) again.
|
||||
|
||||
When new data is passed in, we first check which items are in the cache, and which ones are missing. The missing items are passed into the HF processor in a single batch and cached, before being merged with the existing items in the cache.
|
||||
|
||||
Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of the multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text](#mm-dummy-text) to avoid HF errors. Since this skips HF's prompt updating code, we apply [automatic prompt updating](#mm-automatic-prompt-updating) afterwards to keep the output tokens and multi-modal data consistent with each other.
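
Conceptually, the caching step looks like the sketch below. The `hash_item` helper and `hf_process_batch` callable are simplified stand-ins for vLLM's real cache keys and HF processor call:

```python
import hashlib
import pickle

processor_cache: dict[str, dict] = {}

def hash_item(item) -> str:
    # Simplified stand-in for vLLM's real multi-modal cache key computation.
    return hashlib.sha256(pickle.dumps(item)).hexdigest()

def process_with_cache(items: list, hf_process_batch) -> list[dict]:
    """Process only the items missing from the cache, then merge with cached outputs."""
    keys = [hash_item(item) for item in items]
    missing = [(k, item) for k, item in zip(keys, items) if k not in processor_cache]
    if missing:
        # Run the (slow) HF processor once over all missing items.
        outputs = hf_process_batch([item for _, item in missing])
        for (k, _), out in zip(missing, outputs):
            processor_cache[k] = out
    # Return outputs in the original request order, mixing cached and fresh results.
    return [processor_cache[k] for k in keys]
```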
|
||||
|
||||
# Python Multiprocessing
|
||||
|
||||
## Debugging
|
||||
|
||||
Please see the [Troubleshooting](#troubleshooting-python-multiprocessing)
|
||||
page for information on known issues and how to solve them.
|
||||
|
||||
## Introduction
|
||||
|
||||
:::{important}
|
||||
The source code references are to the state of the code at the time of writing in December, 2024.
|
||||
:::
|
||||
|
||||
The use of Python multiprocessing in vLLM is complicated by:
|
||||
|
||||
- The use of vLLM as a library and the inability to control the code using vLLM
|
||||
- Varying levels of incompatibilities between multiprocessing methods and vLLM
|
||||
dependencies
|
||||
|
||||
This document describes how vLLM deals with these challenges.
|
||||
|
||||
## Multiprocessing Methods
|
||||
|
||||
[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include:
|
||||
|
||||
- `spawn` - spawn a new Python process. This will be the default as of Python
|
||||
3.14. On macOS, this is already the default.
|
||||
|
||||
- `fork` - Use `os.fork()` to fork the Python interpreter. This is the default
|
||||
in Python versions prior to 3.14.
|
||||
|
||||
- `forkserver` - Spawn a server process that will fork a new process on request.
|
||||
|
||||
### Tradeoffs
|
||||
|
||||
`fork` is the fastest method, but is incompatible with dependencies that use
threads. On macOS, using `fork` may cause the process to crash.
|
||||
|
||||
`spawn` is more compatible with dependencies, but can be problematic when vLLM
|
||||
is used as a library. If the consuming code does not use a `__main__` guard (`if
|
||||
__name__ == "__main__":`), the code will be inadvertently re-executed when vLLM
|
||||
spawns a new process. This can lead to infinite recursion, among other problems.
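
For example, a script that uses vLLM as a library should wrap its entry point in a `__main__` guard so that the guarded code is not re-executed in spawned subprocesses (the model name below is chosen only for illustration):

```python
from vllm import LLM, SamplingParams

def main():
    llm = LLM(model="facebook/opt-125m")
    outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
    print(outputs[0].outputs[0].text)

if __name__ == "__main__":
    # Without this guard, `spawn` would re-import this module and re-run the
    # code above in every worker process.
    main()
```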
|
||||
|
||||
`forkserver` will spawn a new server process that will fork new processes on
|
||||
demand. This unfortunately has the same problem as `spawn` when vLLM is used as
|
||||
a library. The server process is created as a spawned new process, which will
|
||||
re-execute code not protected by a `__main__` guard.
|
||||
|
||||
For both `spawn` and `forkserver`, the process must not depend on inheriting any
|
||||
global state as would be the case with `fork`.
|
||||
|
||||
## Compatibility with Dependencies
|
||||
|
||||
Multiple vLLM dependencies indicate either a preference or requirement for using
|
||||
`spawn`:
|
||||
|
||||
- <https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing>
|
||||
- <https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors>
|
||||
- <https://docs.habana.ai/en/latest/PyTorch/Getting_Started_with_PyTorch_and_Gaudi/Getting_Started_with_PyTorch.html?highlight=multiprocessing#torch-multiprocessing-for-dataloaders>
|
||||
|
||||
It is perhaps more accurate to say that there are known problems with using
|
||||
`fork` after initializing these dependencies.
|
||||
|
||||
## Current State (v0)
|
||||
|
||||
The environment variable `VLLM_WORKER_MULTIPROC_METHOD` can be used to control which method is used by vLLM. The current default is `fork`.
|
||||
|
||||
- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/envs.py#L339-L342>
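
Consumers who hit problems with the default can override the method explicitly; a minimal sketch, assuming the variable is set before vLLM creates any worker processes:

```python
import os

# Must be set before vLLM starts its workers; "spawn" or "fork" are typical values.
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

from vllm import LLM  # imported after the override so workers pick it up
```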
|
||||
|
||||
When we know we own the process because the `vllm` command was used, we use
|
||||
`spawn` because it's the most widely compatible.
|
||||
|
||||
- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/scripts.py#L123-L140>
|
||||
|
||||
The `multiproc_xpu_executor` forces the use of `spawn`.
|
||||
|
||||
- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/executor/multiproc_xpu_executor.py#L14-L18>
|
||||
|
||||
There are other miscellaneous places hard-coding the use of `spawn`:
|
||||
|
||||
- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/distributed/device_communicators/custom_all_reduce_utils.py#L135>
|
||||
- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/entrypoints/openai/api_server.py#L184>
|
||||
|
||||
Related PRs:
|
||||
|
||||
- <gh-pr:8823>
|
||||
|
||||
## Prior State in v1
|
||||
|
||||
There was an environment variable to control whether multiprocessing is used in
|
||||
the v1 engine core, `VLLM_ENABLE_V1_MULTIPROCESSING`. This defaulted to off.
|
||||
|
||||
- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/envs.py#L452-L454>
|
||||
|
||||
When it was enabled, the v1 `LLMEngine` would create a new process to run the
|
||||
engine core.
|
||||
|
||||
- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/v1/engine/llm_engine.py#L93-L95>
|
||||
- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/v1/engine/llm_engine.py#L70-L77>
|
||||
- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/v1/engine/core_client.py#L44-L45>
|
||||
|
||||
It was off by default for all the reasons mentioned above - compatibility with
|
||||
dependencies and code using vLLM as a library.
|
||||
|
||||
### Changes Made in v1
|
||||
|
||||
There is not an easy solution with Python's `multiprocessing` that will work
|
||||
everywhere. As a first step, we can get v1 into a state where it does "best
|
||||
effort" choice of multiprocessing method to maximize compatibility.
|
||||
|
||||
- Default to `fork`.
|
||||
- Use `spawn` when we know we control the main process (`vllm` was executed).
|
||||
- If we detect `cuda` was previously initialized, force `spawn` and emit a
|
||||
warning. We know `fork` will break, so this is the best we can do.
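
A minimal sketch of this best-effort selection (a hypothetical helper, not the actual vLLM code):

```python
import multiprocessing
import torch

def choose_mp_method(owns_process: bool) -> str:
    """Best-effort choice of multiprocessing start method."""
    if owns_process:
        # The `vllm` command was used, so re-executing the entrypoint is safe.
        return "spawn"
    if torch.cuda.is_initialized():
        # `fork` is known to break after CUDA init, so warn and force `spawn`.
        print("WARNING: CUDA was already initialized; forcing the `spawn` method.")
        return "spawn"
    return "fork"

multiprocessing.set_start_method(choose_mp_method(owns_process=False), force=True)
```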
|
||||
|
||||
The case that is known to still break in this scenario is code using vLLM as a
|
||||
library that initializes `cuda` before calling vLLM. The warning we emit should
|
||||
instruct users to either add a `__main__` guard or to disable multiprocessing.
|
||||
|
||||
If that known-failure case occurs, the user will see two messages that explain
|
||||
what is happening. First, a log message from vLLM:
|
||||
|
||||
```console
|
||||
WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously
|
||||
initialized. We must use the `spawn` multiprocessing start method. Setting
|
||||
VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See
|
||||
https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing
|
||||
for more information.
|
||||
```
|
||||
|
||||
Second, Python itself will raise an exception with a nice explanation:
|
||||
|
||||
```console
|
||||
RuntimeError:
|
||||
An attempt has been made to start a new process before the
|
||||
current process has finished its bootstrapping phase.
|
||||
|
||||
This probably means that you are not using fork to start your
|
||||
child processes and you have forgotten to use the proper idiom
|
||||
in the main module:
|
||||
|
||||
if __name__ == '__main__':
|
||||
freeze_support()
|
||||
...
|
||||
|
||||
The "freeze_support()" line can be omitted if the program
|
||||
is not going to be frozen to produce an executable.
|
||||
|
||||
To fix this issue, refer to the "Safe importing of main module"
|
||||
section in https://docs.python.org/3/library/multiprocessing.html
|
||||
```
|
||||
|
||||
## Alternatives Considered
|
||||
|
||||
### Detect if a `__main__` guard is present
|
||||
|
||||
It has been suggested that we could behave better if we could detect whether
|
||||
code using vLLM as a library has a `__main__` guard in place. This [post on
|
||||
stackoverflow](https://stackoverflow.com/questions/77220442/multiprocessing-pool-in-a-python-class-without-name-main-guard)
|
||||
was from a library author facing the same question.
|
||||
|
||||
It is possible to detect whether we are in the original, `__main__` process, or
|
||||
a subsequent spawned process. However, it does not appear to be straightforward
|
||||
to detect whether a `__main__` guard is present in the code.
|
||||
|
||||
This option has been discarded as impractical.
|
||||
|
||||
### Use `forkserver`
|
||||
|
||||
At first it appears that `forkserver` is a nice solution to the problem.
|
||||
However, the way it works presents the same challenges that `spawn` does when
|
||||
vLLM is used as a library.
|
||||
|
||||
### Force `spawn` all the time
|
||||
|
||||
One way to clean this up is to just force the use of `spawn` all the time and
|
||||
document that the use of a `__main__` guard is required when using vLLM as a
|
||||
library. This would unfortunately break existing code and make vLLM harder to
|
||||
use, violating the desire to make the `LLM` class as easy as possible to use.
|
||||
|
||||
Instead of pushing this on our users, we will retain the complexity to do our
|
||||
best to make things work.
|
||||
|
||||
## Future Work
|
||||
|
||||
We may want to consider a different worker management approach in the future
|
||||
that works around these challenges.
|
||||
|
||||
1. We could implement something `forkserver`-like, but have the process manager
|
||||
be something we initially launch by running our own subprocess and a custom
|
||||
entrypoint for worker management (launch a `vllm-manager` process).
|
||||
|
||||
2. We can explore other libraries that may better suit our needs. Examples to
|
||||
consider:
|
||||
|
||||
- <https://github.com/joblib/loky>
|
||||
|
||||
(plugin-system)=
|
||||
|
||||
# vLLM's Plugin System
|
||||
|
||||
The community frequently requests the ability to extend vLLM with custom features. To facilitate this, vLLM includes a plugin system that allows users to add custom features without modifying the vLLM codebase. This document explains how plugins work in vLLM and how to create a plugin for vLLM.
|
||||
|
||||
## How Plugins Work in vLLM
|
||||
|
||||
Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see [](#arch-overview)), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the [load_general_plugins](https://github.com/vllm-project/vllm/blob/c76ac49d266e27aa3fea84ef2df1f813d24c91c7/vllm/plugins/__init__.py#L16) function in the `vllm.plugins` module. This function is called for every process created by vLLM before it starts any work.
|
||||
|
||||
## How vLLM Discovers Plugins
|
||||
|
||||
vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin:
|
||||
|
||||
```python
|
||||
# inside `setup.py` file
|
||||
from setuptools import setup
|
||||
|
||||
setup(name='vllm_add_dummy_model',
|
||||
version='0.1',
|
||||
packages=['vllm_add_dummy_model'],
|
||||
entry_points={
|
||||
'vllm.general_plugins':
|
||||
["register_dummy_model = vllm_add_dummy_model:register"]
|
||||
})
|
||||
|
||||
# inside `vllm_add_dummy_model.py` file
|
||||
def register():
|
||||
from vllm import ModelRegistry
|
||||
|
||||
if "MyLlava" not in ModelRegistry.get_supported_archs():
|
||||
ModelRegistry.register_model("MyLlava",
|
||||
"vllm_add_dummy_model.my_llava:MyLlava")
|
||||
```
|
||||
|
||||
For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html).
|
||||
|
||||
Every plugin has three parts:
|
||||
|
||||
1. **Plugin group**: The name of the entry point group. vLLM uses the entry point group `vllm.general_plugins` to register general plugins. This is the key of `entry_points` in the `setup.py` file. Always use `vllm.general_plugins` for vLLM's general plugins.
|
||||
2. **Plugin name**: The name of the plugin, i.e. the left-hand side of the entry in the `entry_points` list. In the example above, the plugin name is `register_dummy_model`. Plugins can be filtered by name using the `VLLM_PLUGINS` environment variable. To load only a specific plugin, set `VLLM_PLUGINS` to the plugin name.
|
||||
3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is `vllm_add_dummy_model:register`, which refers to a function named `register` in the `vllm_add_dummy_model` module.
|
||||
|
||||
## Types of supported plugins
|
||||
|
||||
- **General plugins** (with group name `vllm.general_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model inside the plugin function.
|
||||
|
||||
- **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported.
|
||||
|
||||
## Guidelines for Writing Plugins
|
||||
|
||||
- **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes.
|
||||
|
||||
## Compatibility Guarantee
|
||||
|
||||
vLLM guarantees the interface of documented plugins, such as `ModelRegistry.register_model`, will always be available for plugins to register models. However, it is the responsibility of plugin developers to ensure their plugins are compatible with the version of vLLM they are targeting. For example, `"vllm_add_dummy_model.my_llava:MyLlava"` should be compatible with the version of vLLM that the plugin targets. The interface for the model may change during vLLM's development.
|
||||
|
||||
# Metrics
|
||||
|
||||
Ensure the v1 LLM Engine exposes a superset of the metrics available in v0.
|
||||
|
||||
## Objectives
|
||||
|
||||
- Achieve parity of metrics between v0 and v1.
|
||||
- The priority use case is accessing these metrics via Prometheus as this is what we expect to be used in production environments.
|
||||
- Logging support - i.e. printing metrics to the info log - is provided for more ad-hoc testing, debugging, development, and exploratory use cases.
|
||||
|
||||
## Background
|
||||
|
||||
Metrics in vLLM can be categorized as follows:
|
||||
|
||||
1. Server-level metrics: these are global metrics that track the state and performance of the LLM engine. These are typically exposed as Gauges or Counters in Prometheus.
|
||||
2. Request-level metrics: these are metrics that track the characteristics - e.g. size and timing - of individual requests. These are typically exposed as Histograms in Prometheus, and are often the SLO that an SRE monitoring vLLM will be tracking.
|
||||
|
||||
The mental model is that the "Server-level Metrics" explain why the "Request-level Metrics" are what they are.
|
||||
|
||||
### v0 Metrics
|
||||
|
||||
In v0, the following metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix:
|
||||
|
||||
- `vllm:num_requests_running` (Gauge)
|
||||
- `vllm:num_requests_swapped` (Gauge)
|
||||
- `vllm:num_requests_waiting` (Gauge)
|
||||
- `vllm:gpu_cache_usage_perc` (Gauge)
|
||||
- `vllm:cpu_cache_usage_perc` (Gauge)
|
||||
- `vllm:gpu_prefix_cache_hit_rate` (Gauge)
|
||||
- `vllm:cpu_prefix_cache_hit_rate` (Gauge)
|
||||
- `vllm:prompt_tokens_total` (Counter)
|
||||
- `vllm:generation_tokens_total` (Counter)
|
||||
- `vllm:request_success_total` (Counter)
|
||||
- `vllm:request_prompt_tokens` (Histogram)
|
||||
- `vllm:request_generation_tokens` (Histogram)
|
||||
- `vllm:time_to_first_token_seconds` (Histogram)
|
||||
- `vllm:time_per_output_token_seconds` (Histogram)
|
||||
- `vllm:e2e_request_latency_seconds` (Histogram)
|
||||
- `vllm:request_queue_time_seconds` (Histogram)
|
||||
- `vllm:request_inference_time_seconds` (Histogram)
|
||||
- `vllm:request_prefill_time_seconds` (Histogram)
|
||||
- `vllm:request_decode_time_seconds` (Histogram)
|
||||
- `vllm:request_max_num_generation_tokens` (Histogram)
|
||||
- `vllm:num_preemptions_total` (Counter)
|
||||
- `vllm:cache_config_info` (Gauge)
|
||||
- `vllm:lora_requests_info` (Gauge)
|
||||
- `vllm:tokens_total` (Counter)
|
||||
- `vllm:iteration_tokens_total` (Histogram)
|
||||
- `vllm:time_in_queue_requests` (Histogram)
|
||||
- `vllm:model_forward_time_milliseconds` (Histogram)
|
||||
- `vllm:model_execute_time_milliseconds` (Histogram)
|
||||
- `vllm:request_params_n` (Histogram)
|
||||
- `vllm:request_params_max_tokens` (Histogram)
|
||||
- `vllm:spec_decode_draft_acceptance_rate` (Gauge)
|
||||
- `vllm:spec_decode_efficiency` (Gauge)
|
||||
- `vllm:spec_decode_num_accepted_tokens_total` (Counter)
|
||||
- `vllm:spec_decode_num_draft_tokens_total` (Counter)
|
||||
- `vllm:spec_decode_num_emitted_tokens_total` (Counter)
|
||||
|
||||
These are documented under [Inferencing and Serving -> Production Metrics](project:../../serving/metrics.md).
|
||||
|
||||
### Grafana Dashboard
|
||||
|
||||
vLLM also provides [a reference example](https://docs.vllm.ai/en/latest/getting_started/examples/prometheus_grafana.html) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard.
|
||||
|
||||
The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important:
|
||||
|
||||
- `vllm:e2e_request_latency_seconds_bucket` - End to end request latency measured in seconds
|
||||
- `vllm:prompt_tokens_total` - Prompt Tokens
|
||||
- `vllm:generation_tokens_total` - Generation Tokens
|
||||
- `vllm:time_per_output_token_seconds` - Inter token latency (Time Per Output Token, TPOT) in second.
|
||||
- `vllm:time_to_first_token_seconds` - Time to First Token (TTFT) latency in seconds.
|
||||
- `vllm:num_requests_running` (also, `_swapped` and `_waiting`) - Number of requests in RUNNING, WAITING, and SWAPPED state
|
||||
- `vllm:gpu_cache_usage_perc` - Percentage of used cache blocks by vLLM.
|
||||
- `vllm:request_prompt_tokens` - Request prompt length
|
||||
- `vllm:request_generation_tokens` - request generation length
|
||||
- `vllm:request_success_total` - Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached
|
||||
- `vllm:request_queue_time_seconds` - Queue Time
|
||||
- `vllm:request_prefill_time_seconds` - Requests Prefill Time
|
||||
- `vllm:request_decode_time_seconds` - Requests Decode Time
|
||||
- `vllm:request_max_num_generation_tokens` - Max Generation Token in Sequence Group
|
||||
|
||||
See [the PR which added this Dashboard](gh-pr:2316) for interesting and useful background on the choices made here.
|
||||
|
||||
### Prometheus Client Library
|
||||
|
||||
Prometheus support was initially added [using the aioprometheus library](gh-pr:1890), but a switch was made quickly to [prometheus_client](gh-pr:2730). The rationale is discussed in both linked PRs.
|
||||
|
||||
With the switch away from `aioprometheus`, we lost a `MetricsMiddleware` to track HTTP metrics, but this was reinstated [using prometheus_fastapi_instrumentator](gh-pr:15657):
|
||||
|
||||
```bash
|
||||
$ curl http://0.0.0.0:8000/metrics 2>/dev/null | grep -P '^http_(?!.*(_bucket|_created|_sum)).*'
|
||||
http_requests_total{handler="/v1/completions",method="POST",status="2xx"} 201.0
|
||||
http_request_size_bytes_count{handler="/v1/completions"} 201.0
|
||||
http_response_size_bytes_count{handler="/v1/completions"} 201.0
|
||||
http_request_duration_highr_seconds_count 201.0
|
||||
http_request_duration_seconds_count{handler="/v1/completions",method="POST"} 201.0
|
||||
```
|
||||
|
||||
### Multi-process Mode
|
||||
|
||||
In v0, metrics are collected in the engine core process and we use multi-process mode to make them available in the API server process. See <gh-pr:7279>.
|
||||
|
||||
### Built in Python/Process Metrics
|
||||
|
||||
The following metrics are supported by default by `prometheus_client`, but they are not exposed when multiprocess mode is used:
|
||||
|
||||
- `python_gc_objects_collected_total`
|
||||
- `python_gc_objects_uncollectable_total`
|
||||
- `python_gc_collections_total`
|
||||
- `python_info`
|
||||
- `process_virtual_memory_bytes`
|
||||
- `process_resident_memory_bytes`
|
||||
- `process_start_time_seconds`
|
||||
- `process_cpu_seconds_total`
|
||||
- `process_open_fds`
|
||||
- `process_max_fds`
|
||||
|
||||
This is relevant because if we move away from multiprocess mode in v1,
we get these back. However, it's questionable how useful they are
if they don't aggregate stats across all the processes that make up a
vLLM instance.
|
||||
|
||||
### v0 PRs and Issues
|
||||
|
||||
For background, these are some of the relevant PRs which added the v0 metrics:
|
||||
|
||||
- <gh-pr:1890>
|
||||
- <gh-pr:2316>
|
||||
- <gh-pr:2730>
|
||||
- <gh-pr:4464>
|
||||
- <gh-pr:7279>
|
||||
|
||||
Also note the ["Even Better Observability"](gh-issue:3616) feature where e.g. [a detailed roadmap was laid out](gh-issue:3616#issuecomment-2030858781).
|
||||
|
||||
## v1 Design
|
||||
|
||||
### v1 PRs
|
||||
|
||||
For background, here are the relevant v1 PRs relating to the v1
|
||||
metrics issue <gh-issue:10582>:
|
||||
|
||||
- <gh-pr:11962>
|
||||
- <gh-pr:11973>
|
||||
- <gh-pr:10907>
|
||||
- <gh-pr:12416>
|
||||
- <gh-pr:12478>
|
||||
- <gh-pr:12516>
|
||||
- <gh-pr:12530>
|
||||
- <gh-pr:12561>
|
||||
- <gh-pr:12579>
|
||||
- <gh-pr:12592>
|
||||
- <gh-pr:12644>
|
||||
|
||||
### Metrics Collection
|
||||
|
||||
In v1, we wish to move computation and overhead out of the engine core
|
||||
process to minimize the time between each forward pass.
|
||||
|
||||
The overall idea of the V1 EngineCore design is:

- EngineCore is the inner loop. Performance is most critical here.
- AsyncLLM is the outer loop. This is (ideally) overlapped with GPU
  execution, so this is where any "overheads" should live if possible.
  `AsyncLLM.output_handler_loop` is therefore the ideal place for the
  metrics bookkeeping.
|
||||
|
||||
We will achieve this by collecting metrics in the frontend API server,
|
||||
and base these metrics on information we can glean from the
|
||||
`EngineCoreOutputs` returned by the engine core process to the
|
||||
frontend.
|
||||
|
||||
### Interval Calculations
|
||||
|
||||
Many of our metrics are the time interval between various events in
|
||||
the processing of a request. It is best practice to use timestamps
|
||||
based on "monotonic time" (`time.monotonic()`) rather than "wall-clock
|
||||
time" (`time.time()`) to calculate intervals as the former is
|
||||
unaffected by system clock changes (e.g. from NTP).
|
||||
|
||||
It's also important to note that monotonic clocks differ between
processes - each process has its own reference point. So it is
meaningless to compare monotonic timestamps from different processes.
|
||||
|
||||
Therefore, in order to calculate an interval, we must compare two
|
||||
monotonic timestamps from the same process.
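
For example, per-request timing state kept within a single process
might look like the following sketch (not vLLM's actual stats classes):

```python
import time

class RequestTimes:
    """Timestamps recorded with one process's monotonic clock."""

    def __init__(self) -> None:
        self.queued = time.monotonic()
        self.scheduled: float | None = None
        self.first_token: float | None = None

    def queue_interval(self) -> float:
        assert self.scheduled is not None
        return self.scheduled - self.queued

    def prefill_interval(self) -> float:
        assert self.scheduled is not None and self.first_token is not None
        return self.first_token - self.scheduled
```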
|
||||
|
||||
### Scheduler Stats
|
||||
|
||||
The engine core process will collect some key statistics from the
|
||||
scheduler - e.g. the number of requests that were scheduled or waiting
|
||||
after the last scheduler pass - and include those statistics in
|
||||
`EngineCoreOutputs`.
|
||||
|
||||
### Engine Core Events
|
||||
|
||||
The engine core will also record the timestamp of certain per-request
|
||||
events so that the frontend can calculate the interval between these
|
||||
events.
|
||||
|
||||
The events are:
|
||||
|
||||
- `QUEUED` - when the request was received by the engine core and
|
||||
added to the scheduler queue.
|
||||
- `SCHEDULED` - when the request was first scheduled for execution.
|
||||
- `PREEMPTED` - the request has been put back in the waiting queue
|
||||
in order to make room for other requests to complete. It will be
|
||||
re-scheduled in future and re-start its prefill phase.
|
||||
- `NEW_TOKENS` - when the output included in `EngineCoreOutput` was
|
||||
generated. Since this is common to all requests in a given
|
||||
iteration, we use a single timestamp on `EngineCoreOutputs` to
|
||||
record this event.
|
||||
|
||||
And the calculated intervals are:
|
||||
|
||||
- Queue interval - between `QUEUED` and most recent `SCHEDULED`.
|
||||
- Prefill interval - between most recent `SCHEDULED` and the subsequent
|
||||
first `NEW_TOKENS`.
|
||||
- Decode interval - between first (after the most recent `SCHEDULED`) and
|
||||
last `NEW_TOKENS`.
|
||||
- Inference interval - between most recent `SCHEDULED` and last `NEW_TOKENS`.
|
||||
- Inter-token interval - between successive `NEW_TOKENS`.
|
||||
|
||||
Put another way:
|
||||
|
||||
:::{image} /assets/design/v1/metrics/intervals-1.png
|
||||
:alt: Interval calculations - common case
|
||||
:::
|
||||
|
||||
We explored the possibility of having the frontend calculate these
|
||||
intervals using the timing of events visible to the frontend. However,
|
||||
the frontend does not have visibility into the timing of the `QUEUED`
|
||||
and `SCHEDULED` events and, since we need to calculate intervals based
|
||||
on monotonic timestamps from the same process ... we need the engine
|
||||
core to record timestamps for all of these events.
|
||||
|
||||
#### Interval Calculations vs Preemptions
|
||||
|
||||
When a preemption occurs during decode, since any already generated
|
||||
tokens are reused, we consider the preemption as affecting the
|
||||
inter-token, decode, and inference intervals.
|
||||
|
||||
:::{image} /assets/design/v1/metrics/intervals-2.png
|
||||
:alt: Interval calculations - preempted decode
|
||||
:::
|
||||
|
||||
When a preemption occurs during prefill (assuming such an event
|
||||
is possible), we consider the preemption as affecting the
|
||||
time-to-first-token and prefill intervals.
|
||||
|
||||
:::{image} /assets/design/v1/metrics/intervals-3.png
|
||||
:alt: Interval calculations - preempted prefill
|
||||
:::
|
||||
|
||||
### Frontend Stats Collection
|
||||
|
||||
As the frontend processes a single `EngineCoreOutputs` - i.e. the
|
||||
output from a single engine core iteration - it collects various
|
||||
statistics relating to that iteration:
|
||||
|
||||
- The total number of new tokens generated in this iteration.
|
||||
- The total number of prompt tokens processed by the prefills that
|
||||
completed in this iteration.
|
||||
- The queue intervals for any requests that were scheduled in this
|
||||
iteration.
|
||||
- The prefill intervals for any requests that completed prefill in
|
||||
this iteration.
|
||||
- The inter-token intervals (Time Per Output Token, TPOT), for all
|
||||
requests included in this iteration.
|
||||
- The Time-To-First-Token (TTFT) for any requests that completed
|
||||
prefill in this iteration. However, we calculate this interval
|
||||
relative to when the request was first received by the frontend
|
||||
(`arrival_time`) in order to account for input processing time.
|
||||
|
||||
For any requests that were completed in a given iteration, we also
|
||||
record:
|
||||
|
||||
- The inference and decode intervals - relative to the scheduled and
|
||||
first token events, as described above.
|
||||
- End-to-end latency - the interval between frontend `arrival_time`
|
||||
and the frontend receiving the final token.
|
||||
|
||||
### Metrics Publishing - Logging
|
||||
|
||||
The `LoggingStatLogger` metrics publisher outputs a log `INFO` message
|
||||
every 5 seconds with some key metrics:
|
||||
|
||||
- The current number of running/waiting requests
|
||||
- The current GPU cache usage
|
||||
- The number of prompt tokens processed per second over the past 5
|
||||
seconds
|
||||
- The number of new tokens generated per second over the past 5
|
||||
seconds
|
||||
- The prefix cache hit rate over the most recent 1k kv-cache block queries
|
||||
|
||||
### Metrics Publishing - Prometheus
|
||||
|
||||
The `PrometheusStatLogger` metrics publisher makes the metrics
|
||||
available via a `/metrics` HTTP endpoint in a Prometheus-compatible
|
||||
format. A Prometheus instance can then be configured to poll this
|
||||
endpoint (e.g. every second) and record the values in its time-series
|
||||
database. Prometheus is often used via Grafana, allowing these metrics
|
||||
to be graphed over time.
|
||||
|
||||
Prometheus supports the following metric types:
|
||||
|
||||
- Counter: a value that will increase over time, never reducing, and
|
||||
generally reset to zero when the vLLM instance restarts. For
|
||||
example, the number of tokens generated over the lifetime of the
|
||||
instance.
|
||||
- Gauge: a value that goes up and down, for example the number of
|
||||
requests currently scheduled for execution.
|
||||
- Histogram: a count of metric samples, recorded in buckets. For
|
||||
example, the number of requests whose TTFT was <1ms, <5ms, <10ms,
|
||||
<20ms, and so on.
|
||||
|
||||
Prometheus metrics can also be labelled, allowing metrics to be
|
||||
combined according to matching labels. In vLLM, we add a `model_name`
|
||||
label to every metric which includes the name of the model served by
|
||||
that instance.
|
||||
|
||||
Example output:
|
||||
|
||||
```bash
|
||||
$ curl http://0.0.0.0:8000/metrics
|
||||
# HELP vllm:num_requests_running Number of requests in model execution batches.
|
||||
# TYPE vllm:num_requests_running gauge
|
||||
vllm:num_requests_running{model_name="meta-llama/Llama-3.1-8B-Instruct"} 8.0
|
||||
...
|
||||
# HELP vllm:generation_tokens_total Number of generation tokens processed.
|
||||
# TYPE vllm:generation_tokens_total counter
|
||||
vllm:generation_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 27453.0
|
||||
...
|
||||
# HELP vllm:request_success_total Count of successfully processed requests.
|
||||
# TYPE vllm:request_success_total counter
|
||||
vllm:request_success_total{finished_reason="stop",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
|
||||
vllm:request_success_total{finished_reason="length",model_name="meta-llama/Llama-3.1-8B-Instruct"} 131.0
|
||||
vllm:request_success_total{finished_reason="abort",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
|
||||
...
|
||||
# HELP vllm:time_to_first_token_seconds Histogram of time to first token in seconds.
|
||||
# TYPE vllm:time_to_first_token_seconds histogram
|
||||
vllm:time_to_first_token_seconds_bucket{le="0.001",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
|
||||
vllm:time_to_first_token_seconds_bucket{le="0.005",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
|
||||
vllm:time_to_first_token_seconds_bucket{le="0.01",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
|
||||
vllm:time_to_first_token_seconds_bucket{le="0.02",model_name="meta-llama/Llama-3.1-8B-Instruct"} 13.0
|
||||
vllm:time_to_first_token_seconds_bucket{le="0.04",model_name="meta-llama/Llama-3.1-8B-Instruct"} 97.0
|
||||
vllm:time_to_first_token_seconds_bucket{le="0.06",model_name="meta-llama/Llama-3.1-8B-Instruct"} 123.0
|
||||
vllm:time_to_first_token_seconds_bucket{le="0.08",model_name="meta-llama/Llama-3.1-8B-Instruct"} 138.0
|
||||
vllm:time_to_first_token_seconds_bucket{le="0.1",model_name="meta-llama/Llama-3.1-8B-Instruct"} 140.0
|
||||
vllm:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 140.0
|
||||
```
|
||||
|
||||
Note - the choice of histogram buckets to be most useful to users
|
||||
across a broad set of use cases is not straightforward and will
|
||||
require refinement over time.
|
||||
|
||||
### Cache Config Info
|
||||
|
||||
`prometheus_client` has support for [Info
|
||||
metrics](https://prometheus.github.io/client_python/instrumenting/info/)
|
||||
which are equivalent to a `Gauge` whose value is permanently set to 1,
|
||||
but exposes interesting key/value pair information via labels. This is
|
||||
used for information about an instance that does not change - so it
|
||||
only needs to be observed at startup - and allows comparing across
|
||||
instances in Prometheus.
|
||||
|
||||
We use this concept for the `vllm:cache_config_info` metric:
|
||||
|
||||
```
|
||||
# HELP vllm:cache_config_info Information of the LLMEngine CacheConfig
|
||||
# TYPE vllm:cache_config_info gauge
|
||||
vllm:cache_config_info{block_size="16",cache_dtype="auto",calculate_kv_scales="False",cpu_offload_gb="0",enable_prefix_caching="False",gpu_memory_utilization="0.9",...} 1.0
|
||||
|
||||
```
|
||||
|
||||
However, `prometheus_client` has [never supported Info metrics in
|
||||
multiprocessing
|
||||
mode](https://github.com/prometheus/client_python/pull/300) - for
|
||||
[unclear
|
||||
reasons](gh-pr:7279#discussion_r1710417152). We
|
||||
simply use a `Gauge` metric set to 1 and
|
||||
`multiprocess_mode="mostrecent"` instead.
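
A sketch of that workaround using `prometheus_client` directly, assuming
a recent client version that supports the `mostrecent` mode (the label
set here is abbreviated):

```python
from prometheus_client import Gauge

cache_config_info = Gauge(
    "vllm:cache_config_info",
    "Information of the LLMEngine CacheConfig",
    labelnames=["block_size", "cache_dtype", "gpu_memory_utilization"],
    multiprocess_mode="mostrecent",
)

# Observed once at startup; the value is always 1 and the labels carry the info.
cache_config_info.labels(
    block_size="16",
    cache_dtype="auto",
    gpu_memory_utilization="0.9",
).set(1)
```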
|
||||
|
||||
### LoRA Metrics
|
||||
|
||||
The `vllm:lora_requests_info` `Gauge` is somewhat similar, except the
|
||||
value is the current wall-clock time, and is updated every iteration.
|
||||
|
||||
The label names used are:
|
||||
|
||||
- `running_lora_adapters`: a per-adapter count of the number of requests
  running using that adapter, formatted as a comma-separated string.
- `waiting_lora_adapters`: similar, except counting requests that are
  waiting to be scheduled.
- `max_lora`: the static "max number of LoRAs in a single batch"
  configuration.
|
||||
|
||||
Encoding the running/waiting counts for multiple adapters in a
comma-separated string seems quite misguided - we could use labels to
distinguish per-adapter counts instead. This should be revisited.
|
||||
|
||||
Note that `multiprocess_mode="livemostrecent"` is used - the most
|
||||
recent metric is used, but only from currently running processes.
|
||||
|
||||
This was added in
|
||||
<gh-pr:9477> and there is
|
||||
[at least one known
|
||||
user](https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/54). If
|
||||
we revisit this design and deprecate the old metric, we should reduce
|
||||
the need for a significant deprecation period by making the change in
|
||||
v0 also and asking this project to move to the new metric.
|
||||
|
||||
### Prefix Cache metrics
|
||||
|
||||
The discussion in <gh-issue:10582> about adding prefix cache metrics yielded
|
||||
some interesting points which may be relevant to how we approach
|
||||
future metrics.
|
||||
|
||||
Every time the prefix cache is queried, we record the number of tokens
|
||||
queried and the number of queried tokens present in the cache
|
||||
(i.e. hits).
|
||||
|
||||
However, the metric of interest is the hit rate - i.e. the number of
|
||||
hits per query.
|
||||
|
||||
In the case of logging, we expect the user is best served by
|
||||
calculating the hit rate over a fixed number of the most recent
|
||||
queries (the interval is fixed to 1k most recent queries for now).
|
||||
|
||||
In the case of Prometheus though, we should take advantage of the
|
||||
time-series nature of Prometheus and allow the user to calculate the
|
||||
hit rate over an interval of their choosing. For example, a PromQL
|
||||
query to calculate the hit rate over the past 5 minutes:
|
||||
|
||||
```text
|
||||
rate(cache_query_hit[5m]) / rate(cache_query_total[5m])
|
||||
```
|
||||
|
||||
To achieve this, we should record the queries and hits as counters in
|
||||
Prometheus, rather than recording the hit rate as a gauge.
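
In `prometheus_client` terms, that means something like the following
sketch (the metric names are illustrative):

```python
from prometheus_client import Counter

prefix_cache_queries = Counter(
    "vllm:prefix_cache_queries", "Number of prefix-cache tokens queried.")
prefix_cache_hits = Counter(
    "vllm:prefix_cache_hits", "Number of queried tokens found in the cache.")

def record_prefix_cache_lookup(num_queried_tokens: int, num_hit_tokens: int) -> None:
    # Record raw counts; the hit rate is derived in PromQL over any interval.
    prefix_cache_queries.inc(num_queried_tokens)
    prefix_cache_hits.inc(num_hit_tokens)
```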
|
||||
|
||||
## Deprecated Metrics
|
||||
|
||||
### How To Deprecate
|
||||
|
||||
Deprecating metrics shouldn't be taken lightly. Users may not notice a
metric has been deprecated, and may be quite inconvenienced when it is
(from their perspective) suddenly removed, even if there is
an equivalent metric for them to use.
|
||||
|
||||
As an example, see how `vllm:avg_prompt_throughput_toks_per_s` was
|
||||
[deprecated](gh-pr:2764) (with a
|
||||
comment in the code),
|
||||
[removed](gh-pr:12383), and then
|
||||
[noticed by a
|
||||
user](gh-issue:13218).
|
||||
|
||||
In general:
|
||||
|
||||
1) We should be cautious about deprecating metrics, especially since
|
||||
it can be hard to predict the user impact.
|
||||
2) We should include a prominent deprecation notice in the help string
|
||||
that is included in the `/metrics` output.
|
||||
3) We should list deprecated metrics in user-facing documentation and
|
||||
release notes.
|
||||
4) We should consider hiding deprecated metrics behind a CLI argument
|
||||
in order to give administrators [an escape
|
||||
hatch](https://kubernetes.io/docs/concepts/cluster-administration/system-metrics/#show-hidden-metrics)
|
||||
for some time before deleting them.
|
||||
|
||||
See the [deprecation policy](project:../../contributing/deprecation_policy.md) for
|
||||
the project-wide deprecation policy.
|
||||
|
||||
### Unimplemented - `vllm:tokens_total`
|
||||
|
||||
Added by <gh-pr:4464>, but apparently never implemented. This can just be
|
||||
removed.
|
||||
|
||||
### Duplicated - Queue Time
|
||||
|
||||
The `vllm:time_in_queue_requests` Histogram metric was added by
|
||||
<gh-pr:9659> and its calculation is:
|
||||
|
||||
```
|
||||
self.metrics.first_scheduled_time = now
|
||||
self.metrics.time_in_queue = now - self.metrics.arrival_time
|
||||
```
|
||||
|
||||
Two weeks later, <gh-pr:4464> added `vllm:request_queue_time_seconds` leaving
|
||||
us with:
|
||||
|
||||
```
|
||||
if seq_group.is_finished():
|
||||
if (seq_group.metrics.first_scheduled_time is not None and
|
||||
seq_group.metrics.first_token_time is not None):
|
||||
time_queue_requests.append(
|
||||
seq_group.metrics.first_scheduled_time -
|
||||
seq_group.metrics.arrival_time)
|
||||
...
|
||||
if seq_group.metrics.time_in_queue is not None:
|
||||
time_in_queue_requests.append(
|
||||
seq_group.metrics.time_in_queue)
|
||||
```
|
||||
|
||||
This seems duplicative, and one of them should be removed. The latter
|
||||
is used by the Grafana dashboard, so we should deprecate or remove the
|
||||
former from v0.
|
||||
|
||||
### Prefix Cache Hit Rate
|
||||
|
||||
See above - we now expose 'queries' and 'hits' counters rather than a
|
||||
'hit rate' gauge.
|
||||
|
||||
### KV Cache Offloading
|
||||
|
||||
Two v0 metrics relate to a "swapped" preemption mode that is no
|
||||
longer relevant in v1:
|
||||
|
||||
- `vllm:num_requests_swapped`
|
||||
- `vllm:cpu_cache_usage_perc`
|
||||
|
||||
In this mode, when a request is preempted (e.g. to make room in KV
|
||||
cache to complete other requests), we swap kv cache blocks out to CPU
|
||||
memory. This is also known as "KV cache offloading" and is configured
|
||||
with `--swap-space` and `--preemption-mode`.
|
||||
|
||||
In v0, [vLLM has long supported beam
|
||||
search](gh-issue:6226). The
|
||||
SequenceGroup encapsulated the idea of N Sequences which
|
||||
all shared the same prompt kv blocks. This enabled KV cache block
|
||||
sharing between requests, and copy-on-write to do branching. CPU
|
||||
swapping was intended for these beam search like cases.
|
||||
|
||||
Later, the concept of prefix caching was introduced, which allowed KV
|
||||
cache blocks to be shared implicitly. This proved to be a better
|
||||
option than CPU swapping since blocks can be evicted slowly on demand
|
||||
and the part of the prompt that was evicted can be recomputed.
|
||||
|
||||
SequenceGroup was removed in V1, although a replacement will be
|
||||
required for "parallel sampling" (`n>1`). [Beam search was moved out of
|
||||
the core (in
|
||||
V0)](gh-issue:8306). There was a
|
||||
lot of complex code for a very uncommon feature.
|
||||
|
||||
In V1, with prefix caching being better (zero overhead) and therefore
|
||||
on by default, the preemption and recompute strategy should work
|
||||
better.
|
||||
|
||||
## Future Work
|
||||
|
||||
### Parallel Sampling
|
||||
|
||||
Some v0 metrics are only relevant in the context of "parallel
|
||||
sampling". This is where the `n` parameter in a request is used to
|
||||
request multiple completions from the same prompt.
|
||||
|
||||
As part of adding parallel sampling support in <gh-pr:10980> we should
|
||||
also add these metrics.
|
||||
|
||||
- `vllm:request_params_n` (Histogram)
|
||||
|
||||
Observes the value of the 'n' parameter of every finished request.
|
||||
|
||||
- `vllm:request_max_num_generation_tokens` (Histogram)
|
||||
|
||||
Observes the maximum output length of all sequences in every finished
|
||||
sequence group. In the absence of parallel sampling, this is
|
||||
equivalent to `vllm:request_generation_tokens`.
|
||||
|
||||
### Speculative Decoding
|
||||
|
||||
Some v0 metrics are specific to "speculative decoding". This is where
|
||||
we generate candidate tokens using a faster, approximate method or
|
||||
model and then validate those tokens with the larger model.
|
||||
|
||||
- `vllm:spec_decode_draft_acceptance_rate` (Gauge)
|
||||
- `vllm:spec_decode_efficiency` (Gauge)
|
||||
- `vllm:spec_decode_num_accepted_tokens_total` (Counter)
|
||||
- `vllm:spec_decode_num_draft_tokens_total` (Counter)
|
||||
- `vllm:spec_decode_num_emitted_tokens_total` (Counter)
|
||||
|
||||
There is a PR under review (<gh-pr:12193>) to add "prompt lookup (ngram)"
|
||||
speculative decoding to v1. Other techniques will follow. We should
|
||||
revisit the v0 metrics in this context.
|
||||
|
||||
Note - we should probably expose acceptance rate as separate accepted
|
||||
and draft counters, like we do for prefix caching hit rate. Efficiency
|
||||
likely also needs similar treatment.
|
||||
|
||||
### Autoscaling and Load-balancing
|
||||
|
||||
A common use case for our metrics is to support automated scaling of
|
||||
vLLM instances.
|
||||
|
||||
For related discussion from the [Kubernetes Serving Working
|
||||
Group](https://github.com/kubernetes/community/tree/master/wg-serving),
|
||||
see:
|
||||
|
||||
- [Standardizing Large Model Server Metrics in
|
||||
Kubernetes](https://docs.google.com/document/d/1SpSp1E6moa4HSrJnS4x3NpLuj88sMXr2tbofKlzTZpk)
|
||||
- [Benchmarking LLM Workloads for Performance Evaluation and
|
||||
Autoscaling in
|
||||
Kubernetes](https://docs.google.com/document/d/1k4Q4X14hW4vftElIuYGDu5KDe2LtV1XammoG-Xi3bbQ)
|
||||
- [Inference
|
||||
Perf](https://github.com/kubernetes-sigs/wg-serving/tree/main/proposals/013-inference-perf)
|
||||
- <gh-issue:5041> and <gh-pr:12726>.
|
||||
|
||||
This is a non-trivial topic. Consider this comment from Rob:
|
||||
|
||||
> I think this metric should focus on trying to estimate what the max
|
||||
> concurrency that will cause the average request length > queries per
|
||||
> second ... since this is really what will "saturate" the server.
|
||||
|
||||
A clear goal is that we should expose the metrics required to detect
|
||||
this saturation point, so administrators can implement auto-scaling
|
||||
rules based on those. However, in order to do so, we need to have a
|
||||
clear view on how an administrator (and automated monitoring system)
|
||||
should judge an instance as approaching saturation:
|
||||
|
||||
> To identify, what is the saturation point for model server compute
|
||||
> (the inflection point where we cannot get more throughput with a
|
||||
> higher request rate, but start to incur additional latency) so we
|
||||
> can autoscale effectively?
|
||||
|
||||
### Metric Naming
|
||||
|
||||
Our approach to naming metrics probably deserves to be revisited:
|
||||
|
||||
1. The use of colons in metric names seems contrary to ["colons are
|
||||
reserved for user defined recording
|
||||
rules"](https://prometheus.io/docs/concepts/data_model/#metric-names-and-labels)
|
||||
2. Most of our metrics follow the convention of ending with units, but
|
||||
not all do.
|
||||
3. Some of our metric names end with `_total`:
|
||||
|
||||
```
|
||||
If there is a suffix of `_total` on the metric name, it will be removed. When
|
||||
exposing the time series for counter, a `_total` suffix will be added. This is
|
||||
for compatibility between OpenMetrics and the Prometheus text format, as OpenMetrics
|
||||
requires the `_total` suffix.
|
||||
```
|
||||
|
||||
### Adding More Metrics
|
||||
|
||||
There is no shortage of ideas for new metrics:
|
||||
|
||||
- Examples from other projects like
|
||||
[TGI](https://github.com/IBM/text-generation-inference?tab=readme-ov-file#metrics)
|
||||
- Proposals arising from specific use cases, like the Kubernetes
|
||||
auto-scaling topic above
|
||||
- Proposals that might arise out of standardisation efforts like
|
||||
[OpenTelemetry Semantic Conventions for Gen
|
||||
AI](https://github.com/open-telemetry/semantic-conventions/tree/main/docs/gen-ai).
|
||||
|
||||
We should be cautious in our approach to adding new metrics. While
|
||||
metrics are often relatively straightforward to add:
|
||||
|
||||
1. They can be difficult to remove - see the section on deprecation
|
||||
above.
|
||||
2. They can have a meaningful performance impact when enabled. And
|
||||
metrics are usually of very limited use unless they can be enabled
|
||||
by default and in production.
|
||||
3. They have an impact on development and maintenance of the
|
||||
project. Every metric added to v0 has made this v1 effort more
|
||||
time-consuming, and perhaps not all metrics justify this ongoing
|
||||
investment in their maintenance.
|
||||
|
||||
## Tracing - OpenTelemetry
|
||||
|
||||
Metrics provide an aggregated view over time of the system's
|
||||
performance and health. Tracing, on the other hand, tracks individual
|
||||
requests as they move through different services and components. Both
|
||||
fall under the more general heading of "Observability".
|
||||
|
||||
v0 has support for OpenTelemetry tracing:
|
||||
|
||||
- Added by <gh-pr:4687>
|
||||
- Configured with `--otlp-traces-endpoint` and
|
||||
`--collect-detailed-traces`
|
||||
- [OpenTelemetry blog
|
||||
post](https://opentelemetry.io/blog/2024/llm-observability/)
|
||||
- [User-facing
|
||||
docs](https://docs.vllm.ai/en/latest/getting_started/examples/opentelemetry.html)
|
||||
- [Blog
|
||||
post](https://medium.com/@ronen.schaffer/follow-the-trail-supercharging-vllm-with-opentelemetry-distributed-tracing-aa655229b46f)
|
||||
- [IBM product
|
||||
docs](https://www.ibm.com/docs/en/instana-observability/current?topic=mgaa-monitoring-large-language-models-llms-vllm-public-preview)
|
||||
|
||||
OpenTelemetry has a [Gen AI Working
|
||||
Group](https://github.com/open-telemetry/community/blob/main/projects/gen-ai.md).
|
||||
|
||||
Since metrics is a big enough topic on its own, we are going to tackle
|
||||
the topic of tracing in v1 separately.
|
||||
|
||||
### OpenTelemetry Model Forward vs Execute Time
|
||||
|
||||
In v0, we have the following two metrics:
|
||||
|
||||
- `vllm:model_forward_time_milliseconds` (Histogram) - The time spent
|
||||
in the model forward pass when this request was in the batch.
|
||||
- `vllm:model_execute_time_milliseconds` (Histogram) - The time spent
|
||||
in the model execute function. This will include model forward,
|
||||
block/sync across workers, cpu-gpu sync time and sampling time.
|
||||
|
||||
These metrics are only enabled when OpenTelemetry tracing is enabled
|
||||
and if `--collect-detailed-traces=all/model/worker` is used. The
|
||||
documentation for this option states:
|
||||
|
||||
> collect detailed traces for the specified modules. This involves
|
||||
> use of possibly costly and or blocking operations and hence might
|
||||
> have a performance impact.
|
||||
|
||||
The metrics were added by <gh-pr:7089> and show up in an OpenTelemetry trace
|
||||
as:
|
||||
|
||||
```
|
||||
-> gen_ai.latency.time_in_scheduler: Double(0.017550230026245117)
|
||||
-> gen_ai.latency.time_in_model_forward: Double(3.151565277099609)
|
||||
-> gen_ai.latency.time_in_model_execute: Double(3.6468167304992676)
|
||||
```
|
||||
|
||||
We already have `inference_time` and `decode_time` metrics, so the
|
||||
question is whether there are sufficiently common use cases for the
|
||||
higher-resolution timings to justify the overhead.
|
||||
|
||||
Since we are going to treat the question of OpenTelemetry support
|
||||
separately, we will include these particular metrics under that topic.
|
||||
@@ -1,247 +0,0 @@
|
||||
# Automatic Prefix Caching
|
||||
|
||||
Prefix caching kv-cache blocks is a popular optimization in LLM inference to avoid redundant prompt computations. The core idea is simple – we cache the kv-cache blocks of processed requests, and reuse these blocks when a new request comes in with the same prefix as previous requests. Since prefix caching is almost a free lunch and won’t change model outputs, it has been widely used by many public endpoints (e.g., OpenAI, Anthropic, etc) and most open source LLM inference frameworks (e.g., SGLang).
|
||||
|
||||
While there are many ways to implement prefix caching, vLLM chooses a hash-based approach. Specifically, we hash each kv-cache block by the tokens in the block and the tokens in the prefix before the block:
|
||||
|
||||
```text
|
||||
Block 1 Block 2 Block 3
|
||||
[A gentle breeze stirred] [the leaves as children] [laughed in the distance]
|
||||
Block 1: |<--- block tokens ---->|
|
||||
Block 2: |<------- prefix ------>| |<--- block tokens --->|
|
||||
Block 3: |<------------------ prefix -------------------->| |<--- block tokens ---->|
|
||||
```
|
||||
|
||||
In the example above, the KV cache in the first block can be uniquely identified by the tokens “A gentle breeze stirred”. The third block can be uniquely identified by the tokens in the block “laughed in the distance”, together with the prefix tokens “A gentle breeze stirred the leaves as children”. Therefore, we can build the block hash as `hash(tuple[components])` (a minimal sketch follows the component list), where the components are:
|
||||
|
||||
* Parent hash value: The hash value of the parent block.
|
||||
* Block tokens: A tuple of the tokens in this block. The exact tokens are included to reduce the chance of hash collisions.
|
||||
* Extra hashes: Other values required to make this block unique, such as LoRA IDs, multi-modality input hashes (see the example below), and cache salts to isolate caches in multi-tenant environments.
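A minimal, self-contained sketch of this chained hashing (the class and function names here are illustrative only and are not vLLM's internal API):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass(frozen=True)
class BlockHash:
    # Illustrative only: a block hash plus the tokens it covers.
    value: int
    token_ids: tuple


def hash_block(parent: Optional[BlockHash],
               token_ids: tuple,
               extra: tuple = ()) -> BlockHash:
    # The parent hash chains in the whole prefix; `extra` carries LoRA IDs,
    # multi-modal input hashes, cache salts, etc.
    parent_value = parent.value if parent is not None else None
    return BlockHash(hash((parent_value, token_ids, extra)), token_ids)


# Chaining the three blocks from the example above:
b1 = hash_block(None, ("A", "gentle", "breeze", "stirred"))
b2 = hash_block(b1, ("the", "leaves", "as", "children"))
b3 = hash_block(b2, ("laughed", "in", "the", "distance"))
```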
|
||||
|
||||
> **Note 1:** We only cache full blocks.
|
||||
|
||||
> **Note 2:** The above hash key structure is not 100% collision free. Theoretically it’s still possible for different prefix tokens to have the same hash value. To avoid any hash collisions **in a multi-tenant setup, we advise using SHA256** as the hash function instead of the default builtin hash.
|
||||
SHA256 is supported since vLLM v0.8.3 and must be enabled with a command line argument. It comes with a performance impact of about 100-200ns per token (~6ms for 50k tokens of context).
|
||||
|
||||
**A hashing example with multi-modality inputs**
|
||||
In this example, we illustrate how prefix caching works with multi-modality inputs (e.g., images). Assume we have a request with the following messages:
|
||||
|
||||
```text
|
||||
messages = [
|
||||
{"role": "user",
|
||||
"content": [
|
||||
{"type": "text",
|
||||
"text": "What's in this image?"
|
||||
},
|
||||
{"type": "image_url",
|
||||
"image_url": {"url": image_url},
|
||||
},
|
||||
]},
|
||||
]
|
||||
```
|
||||
|
||||
It will become the following prompt:
|
||||
|
||||
```text
|
||||
Prompt:
|
||||
<s>[INST]What's in this image?\n[IMG][/INST]
|
||||
|
||||
Tokenized prompt:
|
||||
[1, 3, 7493, 1681, 1294, 1593, 3937, 9551, 10, 4]
|
||||
|
||||
Prompt with placeholders (<P>):
|
||||
[1, 3, 7493, 1681, 1294, 1593, 3937, 9551, <P>, <P>, ..., <P>, 4]
|
||||
```
|
||||
|
||||
As we can see, after tokenization the `[IMG]` token is replaced by a sequence of placeholder tokens, and these placeholders are replaced by image embeddings during prefill. The challenge for prefix caching in this case is that we need to differentiate images from their placeholders. To address this, we encode the image hash generated by the frontend image processor as the extra hash of the affected blocks. For example, the hashes of the blocks in the above prompt would be (assuming block size 16 and 41 placeholder tokens):
|
||||
|
||||
```text
|
||||
Block 0
|
||||
Parent hash: None
|
||||
Token IDs: 1, 3, 7493, 1681, 1294, 1593, 3937, 9551, <p>, ..., <p>
|
||||
Extra hash: <image hash>
|
||||
Block 1
|
||||
Parent hash: Block 0 hash
|
||||
Token IDs: <p>, ..., <p>
|
||||
Extra hash: <image hash>
|
||||
Block 2
|
||||
Parent hash: Block 1 hash
|
||||
Token IDs: <p>, ..., <p>
|
||||
Extra hash: <image hash>
|
||||
Block 3
|
||||
Parent hash: Block 2 hash
|
||||
Token IDs: <p>, ..., <p>, 4
|
||||
Extra hash: <image hash>
|
||||
```
|
||||
|
||||
In the rest of this document, we first introduce the data structure used for prefix caching in vLLM v1, followed by the prefix caching workflow of the major KV cache operations (e.g., allocate, append, free, eviction). Finally, we use an example to illustrate the end-to-end prefix caching workflow.
|
||||
|
||||
**Cache Isolation for Security**
|
||||
To improve privacy in shared environments, vLLM supports isolating prefix cache reuse through optional per-request salting. When a request includes a `cache_salt`, the salt is injected into the hash of the first block, ensuring that only requests with the same salt can reuse cached KV blocks. This prevents timing-based attacks where an adversary could infer cached content by observing latency differences, and it offers this protection without compromising performance.
|
||||
|
||||
```json
|
||||
{
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": "Here is a document with details about the world series: ..."},
|
||||
{"role": "user", "content": "Who won the world series in 2020?"}
|
||||
],
|
||||
"cache_salt": "your-cache-salt"
|
||||
}
|
||||
```
|
||||
|
||||
With this setup, cache sharing is limited to users or requests that explicitly agree on a common salt, enabling cache reuse within a trust group while isolating others.
|
||||
|
||||
> **Note:** Cache isolation is not supported in engine V0.
|
||||
|
||||
## Data Structure
|
||||
|
||||
The prefix caching in vLLM v1 is implemented in the KV cache manager. The basic building block is the “Block” data class (simplified):
|
||||
|
||||
```python
|
||||
class KVCacheBlock:
|
||||
# The block ID (immutable)
|
||||
block_id: int
|
||||
# The block hash (will be assigned when the block is full,
|
||||
# and will be reset when the block is evicted).
|
||||
block_hash: BlockHashType
|
||||
# The number of requests using this block now.
|
||||
ref_cnt: int
|
||||
|
||||
# The pointers to form a doubly linked list for the free queue.
|
||||
prev_free_block: Optional["KVCacheBlock"] = None
|
||||
next_free_block: Optional["KVCacheBlock"] = None
|
||||
```
|
||||
|
||||
There are two design points to highlight:
|
||||
|
||||
1. We allocate all KVCacheBlocks as a block pool when initializing the KV cache manager. This avoids Python object creation overhead and makes it easy to track all blocks at all times.
|
||||
2. We introduce doubly linked list pointers directly in the KVCacheBlock, so that we can construct a free queue directly (see the sketch after this list). This gives us two benefits:
|
||||
    1. We can move an element from the middle of the queue to the tail with O(1) complexity.
|
||||
    2. We avoid introducing another Python queue (e.g., `deque`), which would wrap every element.
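A minimal sketch of such a free queue (an illustration, not vLLM's actual implementation), relying on the `prev_free_block`/`next_free_block` pointers shown in the class above:

```python
class FreeKVCacheBlockQueue:
    """Doubly linked free queue sketch: O(1) append, pop, and mid-queue removal."""

    def __init__(self):
        self.head = None
        self.tail = None

    def append(self, block):
        # Newly freed blocks go to the tail (most recently freed).
        block.prev_free_block = self.tail
        block.next_free_block = None
        if self.tail is None:
            self.head = block
        else:
            self.tail.next_free_block = block
        self.tail = block

    def remove(self, block):
        # O(1): relink the neighbours; no scan over the queue is needed.
        if block.prev_free_block is None:
            self.head = block.next_free_block
        else:
            block.prev_free_block.next_free_block = block.next_free_block
        if block.next_free_block is None:
            self.tail = block.prev_free_block
        else:
            block.next_free_block.prev_free_block = block.prev_free_block
        block.prev_free_block = block.next_free_block = None

    def popleft(self):
        # The head is the least recently freed block, i.e. the eviction candidate.
        block = self.head
        if block is not None:
            self.remove(block)
        return block
```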
|
||||
|
||||
As a result, we will have the following components when the KV cache manager is initialized:
|
||||
|
||||
:::{image} /assets/design/v1/prefix_caching/overview.png
|
||||
:alt: Component Overview
|
||||
:::
|
||||
|
||||
* Block Pool: A list of KVCacheBlock.
|
||||
* Free Block Queue: Only stores pointers to the head and tail blocks for manipulation.
|
||||
* Cache blocks: Mapping from hash key to block IDs.
|
||||
* Request blocks: Mapping from request ID to allocated block IDs.
|
||||
|
||||
## Operations
|
||||
|
||||
### Block Allocation
|
||||
|
||||
**New request:** Workflow for the scheduler to schedule a new request with KV cache block allocation:
|
||||
|
||||
1. The scheduler calls `kv_cache_manager.get_computed_blocks()` to get a sequence of blocks that have already been computed. This is done by hashing the prompt tokens in the request and looking up Cache Blocks.
|
||||
2. The scheduler calls `kv_cache_manager.allocate_slots()`. It does the following steps (a condensed sketch follows this list):
|
||||
    1. Compute the number of new blocks required, and return if there are not enough blocks to allocate.
|
||||
2. “Touch” the computed blocks. It increases the reference count of the computed block by one, and removes the block from the free queue if the block wasn’t used by other requests. This is to avoid these computed blocks being evicted. See the example in the next section for illustration.
|
||||
3. Allocate new blocks by popping the heads of the free queue. If the head block is a cached block, this also “evicts” the block so that no other requests can reuse it anymore from now on.
|
||||
4. If an allocated block is already full of tokens, we immediately add it to the Cache Block, so that the block can be reused by other requests in the same batch.
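A condensed sketch of this new-request path (the method and attribute names below are assumptions for illustration; the real `allocate_slots()` handles more cases):

```python
def schedule_new_request(manager, request, block_size: int = 16):
    # Step 1: prefix-cache lookup for blocks that are already computed.
    computed_blocks = manager.get_computed_blocks(request)

    # Step 2.1: how many extra blocks does the rest of the prompt need?
    remaining = len(request.prompt_token_ids) - len(computed_blocks) * block_size
    num_new = (remaining + block_size - 1) // block_size
    if num_new > manager.num_free_blocks():
        return None  # Not enough blocks; the scheduler will retry later.

    # Step 2.2: "touch" the computed blocks so they cannot be evicted.
    for block in computed_blocks:
        if block.ref_cnt == 0:
            manager.free_queue.remove(block)
        block.ref_cnt += 1

    # Steps 2.3/2.4: pop fresh blocks from the head of the free queue
    # (evicting any cached content they held), fill them with prompt
    # tokens, and immediately cache any block that becomes full so other
    # requests in the same batch can reuse it.
    new_blocks = [manager.pop_and_evict() for _ in range(num_new)]
    for block in manager.fill_with_prompt_tokens(request, new_blocks):
        if block.is_full():
            manager.cache_block(block)
    return computed_blocks + new_blocks
```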
|
||||
|
||||
**Running request:** Workflow for the scheduler to schedule a running request with KV cache block allocation:
|
||||
|
||||
1. The scheduler calls `kv_cache_manager.append_slots()`. It does the following steps:
|
||||
    1. Compute the number of new blocks required, and return if there are not enough blocks to allocate.
|
||||
2. Allocate new blocks by popping the heads of the free queue. If the head block is a cached block, this also “evicts” the block so that no other requests can reuse it anymore from now on.
|
||||
3. Append token IDs to the slots in existing blocks as well as the new blocks. If a block is full, we add it to the Cache Block to cache it.
|
||||
|
||||
**Duplicated blocks**
|
||||
Assuming the block size is 4 and you send a request (Request 1) with prompt ABCDEF and a decoding length of 3:
|
||||
|
||||
```text
|
||||
Prompt: [A, B, C, D, E, F]
|
||||
Output: [G, H, I]
|
||||
|
||||
Time 0:
|
||||
Tokens: [A, B, C, D, E, F, G]
|
||||
Block Table: [0 (ABCD), 1 (EFG)]
|
||||
Cache Blocks: 0
|
||||
Time 1:
|
||||
Tokens: [A, B, C, D, E, F, G, H]
|
||||
Block Table: [0 (ABCD), 1 (EFGH)]
|
||||
Cache Blocks: 0, 1
|
||||
Time 2:
|
||||
Tokens: [A, B, C, D, E, F, G, H, I]
|
||||
Block Table: [0 (ABCD), 1 (EFGH), 2 (I)]
|
||||
Cache Blocks: 0, 1
|
||||
```
|
||||
|
||||
Now block 0 and block 1 are cached, and we send the same request again (Request 2) with greedy sampling, so that it produces exactly the same outputs as Request 1:
|
||||
|
||||
```text
|
||||
Prompt: [A, B, C, D, E, F]
|
||||
Output: [G, H, I]
|
||||
|
||||
Time 0:
|
||||
Tokens: [A, B, C, D, E, F, G]
|
||||
Block Table: [0 (ABCD), 3 (EFG)]
|
||||
Cache Blocks: 0, 1
|
||||
Time 1:
|
||||
Tokens: [A, B, C, D, E, F, G, H]
|
||||
Block Table: [0 (ABCD), 3 (EFGH)]
|
||||
Cache Blocks: 0, 1, 3
|
||||
```
|
||||
|
||||
As can be seen, block 3 is a new full block and is cached. However, it is redundant with block 1, meaning that we cached the same content twice. In v0, upon detecting that block 3 is a duplicate, we free block 3 and let Request 2 use block 1 instead, so its block table becomes `[0, 1]` at Time 1. However, the block table in vLLM v1 is append-only, meaning that changing the block table from `[0, 3]` to `[0, 1]` is not allowed. As a result, we will have duplicated blocks for the hash key E-H. This duplication is eliminated when the request is freed.
|
||||
|
||||
### Free
|
||||
|
||||
When a request is finished, we free all its blocks if no other requests are using them (reference count = 0). In this example, we free request 1 and blocks 2, 3, 4, and 8 associated with it. We can see that the freed blocks are added to the tail of the free queue in *reverse* order. This is because the last block of a request hashes over more prefix tokens and is therefore less likely to be reused by other requests. As a result, it should be evicted first.
|
||||
|
||||
:::{image} /assets/design/v1/prefix_caching/free.png
|
||||
:alt: Free Queue after Free a Request
|
||||
:::
|
||||
|
||||
### Eviction (LRU)
|
||||
|
||||
When the head block (the least recently used block) of the free queue is cached, we have to evict the block to prevent other requests from reusing it. Specifically, eviction involves the following steps (a condensed sketch follows the list):
|
||||
|
||||
1. Pop the block from the head of the free queue. This is the LRU block to be evicted.
|
||||
2. Remove the block ID from the Cache Block.
|
||||
3. Remove the block hash.
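A condensed sketch of these steps (the names below are assumptions for illustration):

```python
def evict_lru_block(manager):
    # 1. Pop the least recently used block from the head of the free queue.
    block = manager.free_queue.popleft()
    if block.block_hash is not None:
        # 2. Remove the mapping from the Cache Blocks table.
        del manager.cached_blocks[block.block_hash]
        # 3. Reset the block hash so the block can be reused for new content.
        block.block_hash = None
    return block
```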
|
||||
|
||||
## Example
|
||||
|
||||
In this example, we assume the block size is 4 (each block can cache 4 tokens), and we have 10 blocks in the KV-cache manager in total.
|
||||
|
||||
**Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 3 of 4 tokens.
|
||||
|
||||
:::{image} /assets/design/v1/prefix_caching/example-time-1.png
|
||||
:alt: Example Time 1
|
||||
:::
|
||||
|
||||
**Time 3: Request 0 makes block 3 full and asks for a new block to keep decoding.** We cache block 3 and allocate block 4.
|
||||
|
||||
:::{image} /assets/design/v1/prefix_caching/example-time-3.png
|
||||
:alt: Example Time 3
|
||||
:::
|
||||
|
||||
**Time 4: Request 1 comes in with 14 prompt tokens, where the first 10 tokens are the same as request 0.** We can see that only the first 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 2 of 4 tokens.
|
||||
|
||||
:::{image} /assets/design/v1/prefix_caching/example-time-4.png
|
||||
:alt: Example Time 4
|
||||
:::
|
||||
|
||||
**Time 5: Request 0 is finished and freed.** Blocks 2, 3 and 4 are added to the free queue in reverse order (but blocks 2 and 3 are still cached). Blocks 0 and 1 are not added to the free queue because they are being used by Request 1.
|
||||
|
||||
:::{image} /assets/design/v1/prefix_caching/example-time-5.png
|
||||
:alt: Example Time 5
|
||||
:::
|
||||
|
||||
**Time 6: Request 1 is finished and freed.**
|
||||
|
||||
:::{image} /assets/design/v1/prefix_caching/example-time-6.png
|
||||
:alt: Example Time 6
|
||||
:::
|
||||
|
||||
**Time 7: Request 2 comes in with 29 prompt tokens, where the first 12 tokens are the same as request 0.** Note that even though the block order in the free queue was `7 - 8 - 9 - 4 - 3 - 2 - 6 - 5 - 1 - 0`, the cache-hit blocks (i.e., 0, 1, 2) are touched and removed from the queue before allocation, so the free queue becomes `7 - 8 - 9 - 4 - 3 - 6 - 5`. As a result, the allocated blocks are 0 (cached), 1 (cached), 2 (cached), 7, 8, 9, 4, 3 (evicted).
|
||||
|
||||
:::{image} /assets/design/v1/prefix_caching/example-time-7.png
|
||||
:alt: Example Time 7
|
||||
:::
|
||||
@@ -1,149 +0,0 @@
|
||||
# vLLM's `torch.compile` integration
|
||||
|
||||
In vLLM's V1 architecture, `torch.compile` is enabled by default and is a critical part of the framework. This document gives a simple walk-through example to help you understand how `torch.compile` is used.
|
||||
|
||||
Throughout the example, we will run a common Llama model using v1, and turn on debug level logging to show all the details. The command to be used is `VLLM_USE_V1=1 VLLM_LOGGING_LEVEL=DEBUG vllm serve meta-llama/Llama-3.2-1B`.
|
||||
|
||||
## Compilation Cache
|
||||
|
||||
In the very verbose logs, we can see:
|
||||
|
||||
```
|
||||
INFO 03-07 03:06:55 [backends.py:409] Using cache directory: ~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0 for vLLM's torch.compile
|
||||
```
|
||||
|
||||
vLLM takes all the available factors into consideration and decides on a directory to store all the compilation artifacts. This means you can directly copy the whole `~/.cache/vllm/torch_compile_cache` directory in your deployment scenario to save a great amount of compilation time, and hence accelerate the startup time of the vLLM instance.
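For example, a deployment pipeline could archive the cache once it has been populated and unpack it on target machines (a sketch assuming the default cache location shown in the log above):

```python
import shutil
from pathlib import Path

cache_dir = Path.home() / ".cache" / "vllm" / "torch_compile_cache"

# On a machine where vLLM has already started once (cache populated),
# pack the cache into a single archive.
archive = shutil.make_archive("torch_compile_cache", "gztar", root_dir=cache_dir)

# On the target machine, after copying the archive over, unpack it into
# the same location before starting vLLM:
# shutil.unpack_archive("torch_compile_cache.tar.gz", extract_dir=cache_dir)
```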
|
||||
|
||||
The factors considered include:
|
||||
|
||||
- All the related configs (see the `compute_hash` functions in the [config.py](gh-file:vllm/config.py))
|
||||
- PyTorch configs (see the `compute_hash` functions in the [compiler_interface.py](gh-file:vllm/compilation/compiler_interface.py))
|
||||
- The model's forward function and the relevant functions called by the forward function (see below)
|
||||
|
||||
With all these factors taken into consideration, usually we can guarantee that the cache is safe to use, and will not cause any unexpected behavior. Therefore, the cache is enabled by default. If you want to debug the compilation process, or if you suspect the cache is causing some issues, you can disable it by setting the environment variable `VLLM_DISABLE_COMPILE_CACHE=1`.
|
||||
|
||||
A unique aspect of vLLM's `torch.compile` integration is that we guarantee all compilation finishes before we serve any requests. No request will trigger new compilation. Otherwise, the engine would be blocked on that request, and the response time would have unexpected spikes.
|
||||
|
||||
## Python Code Compilation
|
||||
|
||||
In the very verbose logs, we can see:
|
||||
|
||||
```
|
||||
DEBUG 03-07 03:06:52 [decorators.py:203] Start compiling function <code object forward at 0x7f08acf40c90, file "xxx/vllm/model_executor/models/llama.py", line 339>
|
||||
|
||||
DEBUG 03-07 03:06:54 [backends.py:370] Traced files (to be considered for compilation cache):
|
||||
DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/_dynamo/polyfills/builtins.py
|
||||
DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/nn/modules/container.py
|
||||
DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/nn/modules/module.py
|
||||
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/attention/layer.py
|
||||
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/distributed/communication_op.py
|
||||
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/distributed/parallel_state.py
|
||||
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/custom_op.py
|
||||
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/activation.py
|
||||
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/layernorm.py
|
||||
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/linear.py
|
||||
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/rotary_embedding.py
|
||||
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/vocab_parallel_embedding.py
|
||||
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/models/llama.py
|
||||
|
||||
DEBUG 03-07 03:07:07 [backends.py:462] Computation graph saved to ~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/computation_graph.py
|
||||
DEBUG 03-07 03:07:07 [wrapper.py:105] Dynamo transformed code saved to ~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/transformed_code.py
|
||||
```
|
||||
|
||||
This is the Python code compilation, i.e. graph capture by Dynamo. Dynamo traces the function at `xxx/vllm/model_executor/models/llama.py:339`, which is the `forward` function of the model we compile. During the forward pass, other functions are also called and inlined by Dynamo, as shown by the logs, including some PyTorch functions from `xxx/torch/nn/modules/module.py` (used by PyTorch `nn.Module`, because module attribute access triggers a function call) and some communication / attention / activation functions from vLLM. All the traced files are considered when we decide which cache directory to use. This way, any code change in the above files triggers a compilation cache miss, and therefore recompilation.
|
||||
|
||||
The result of the Dynamo compilation is a new function stored in `~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/transformed_code.py`. Usually, this function unpacks tensors from the module and then passes them to the traced computation graph. The computation graph is stored in `~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/computation_graph.py`.
|
||||
|
||||
## Computation Graph Processing
|
||||
|
||||
The computation graph has shape annotations for every tensor. The inputs are input ids, position ids, weights and buffers from the model, and the outputs are the final hidden states. Note that lm head projection and sampling operations are not considered in the graph.
|
||||
|
||||
Most of the inputs to the computation graph have static shapes, since they are model weights and buffers and will not change during the lifetime of the model. Only the input ids and position ids have symbolic shapes, i.e. the shape can change from batch to batch. However, they share the same symbolic shape. That is to say, the only changing size in the computation graph is the batch size (the number of tokens processed in the current forward pass).
|
||||
|
||||
The attention operation is complicated: it needs to interact with KV caches, which have complicated shapes. Fortunately, the output of the attention operation shares the same shape as its input query. Therefore, we wrap the whole attention operation into a PyTorch custom op, `torch.ops.vllm.unified_attention_with_output`, so that Dynamo does not try to inspect any of its internal operations. This way, although the attention operation is complicated, we can still capture the model's computation graph as a full graph from Dynamo's perspective.
|
||||
|
||||
The computation graph is further split into pieces by the `splitting_ops` (usually the attention operation). Therefore, in the `~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/computation_graph.py` file, we can see many submodules, each of which is a piece of the graph after splitting:
|
||||
|
||||
- The attention operation itself is a submodule.
|
||||
- The part of the computation graph from one attention operation to the next is a submodule.
|
||||
|
||||
Every submodule can be identified by its index, and will be processed individually.
|
||||
|
||||
## Computation Graph Compilation
|
||||
|
||||
In the very verbose logs, we can also see:
|
||||
|
||||
```
|
||||
DEBUG 03-07 03:52:37 [backends.py:134] store the 0-th graph for shape None from inductor via handle ('fpegyiq3v3wzjzphd45wkflpabggdbjpylgr7tta4hj6uplstsiw', '~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/inductor_cache/iw/ciwzrk3ittdqatuzwonnajywvno3llvjcs2vfdldzwzozn3zi3iy.py')
|
||||
DEBUG 03-07 03:52:39 [backends.py:134] store the 1-th graph for shape None from inductor via handle ('f7fmlodmf3h3by5iiu2c4zarwoxbg4eytwr3ujdd2jphl4pospfd', '~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/inductor_cache/ly/clyfzxldfsj7ehaluis2mca2omqka4r7mgcedlf6xfjh645nw6k2.py')
|
||||
...
|
||||
DEBUG 03-07 03:52:45 [backends.py:134] store the 15-th graph for shape None from inductor via handle ('f7fmlodmf3h3by5iiu2c4zarwoxbg4eytwr3ujdd2jphl4pospfd', '~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/inductor_cache/ly/clyfzxldfsj7ehaluis2mca2omqka4r7mgcedlf6xfjh645nw6k2.py')
|
||||
DEBUG 03-07 03:52:45 [backends.py:134] store the 16-th graph for shape None from inductor via handle ('fvj3ccoi7m34f3dnr4itmu55mmun44l5xymwhrjlwisylsk7q6jy', '~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/inductor_cache/tf/ctfftkglj7b4lcttq5cymx6cew372uoauupqn6ldsvpiucavqcjc.py')
|
||||
```
|
||||
|
||||
This means the first piece of the computation graph (with shape `None`, i.e. the symbolic shape) is compiled by Inductor (with the key `fpegyiq3v3wzjzphd45wkflpabggdbjpylgr7tta4hj6uplstsiw`). The compiled kernel is stored in `~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/inductor_cache/iw/ciwzrk3ittdqatuzwonnajywvno3llvjcs2vfdldzwzozn3zi3iy.py`. You can open the file to see the code Inductor finally runs.
|
||||
|
||||
One more detail: you can see that the 1-th graph and the 15-th graph have the same key, while the 0-th graph and the 16-th graph differ. This is expected: since we split the graph by the attention op, we get 3 unique subgraphs:
|
||||
|
||||
- the first layer before attention
|
||||
- every middle layer, from one attention operation to the next attention operation
|
||||
- the final layer after attention
|
||||
|
||||
If we already have the cache directory (e.g. when running the same code for a second time), we will see the following logs:
|
||||
|
||||
```
|
||||
DEBUG 03-07 04:00:45 [backends.py:86] Directly load the 0-th graph for shape None from inductor via handle ('fpegyiq3v3wzjzphd45wkflpabggdbjpylgr7tta4hj6uplstsiw', '~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/inductor_cache/iw/ciwzrk3ittdqatuzwonnajywvno3llvjcs2vfdldzwzozn3zi3iy.py')
|
||||
```
|
||||
|
||||
This time, Inductor compilation is completely bypassed, and we load the compilation artifact from disk that we produced last time.
|
||||
|
||||
The above example just uses Inductor to compile for a general shape (i.e. a symbolic shape). We can also use Inductor to compile for specific shapes, for example:
|
||||
|
||||
```
|
||||
vllm serve meta-llama/Llama-3.2-1B --compilation_config '{"compile_sizes": [1, 2, 4, 8]}'
|
||||
```
|
||||
|
||||
Then it will also compile specific kernels just for batch sizes `1, 2, 4, 8`. At that point, all of the shapes in the computation graph are static and known, and we turn on auto-tuning to tune for maximum performance. This can be slow the first time you run it, but the next time, we directly bypass the tuning and run the tuned kernel.
|
||||
|
||||
When all the shapes are known, `torch.compile` can compare different configs and often finds better configs to run the kernel. For example, we can see the following log:
|
||||
|
||||
```
|
||||
AUTOTUNE mm(8x2048, 2048x3072)
|
||||
triton_mm_4 0.0130 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
|
||||
triton_mm_8 0.0134 ms 97.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
|
||||
triton_mm_12 0.0148 ms 87.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4
|
||||
mm 0.0160 ms 81.6%
|
||||
triton_mm_16 0.0165 ms 78.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
|
||||
triton_mm_3 0.0199 ms 65.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
|
||||
triton_mm_1 0.0203 ms 64.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2
|
||||
triton_mm_7 0.0203 ms 64.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
|
||||
triton_mm_2 0.0208 ms 62.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
|
||||
triton_mm_11 0.0215 ms 60.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
|
||||
SingleProcess AUTOTUNE benchmarking takes 2.0428 seconds and 7.5727 seconds precompiling
|
||||
```
|
||||
|
||||
This means that, for a matrix multiplication of shape `8x2048x3072`, `torch.compile` tries Triton templates with various configs, and the best one is much faster than the default code (which dispatches to the cuBLAS library).
|
||||
|
||||
Unfortunately, because auto-tuning takes quite a long time (from seconds to minutes, depending on the model size and the batch size), even though it can be cached for later use, we turn it off by default for the sake of user-friendliness. If you want maximum performance, it is recommended to try it by compiling specific shapes.
|
||||
|
||||
## Cudagraph Capture
|
||||
|
||||
vLLM's V1 architecture uses piecewise cudagraphs. The full computation graph is split as mentioned above, and we only capture cudagraphs for the pieces of the graph between attention operations (including the first piece before any attention operation and the last piece after all attention operations). This is based on a common observation: computation between attentions is usually token-wise and easy to handle with cudagraphs, while the attention operation is non-trivial to make cudagraph-compatible. Thus, by running the attention operation in eager mode and the rest of the operations in cudagraphs, we keep the flexibility of the attention operation.
|
||||
|
||||
The piecewise cudagraph also has fine-grained memory management. The purpose is to exclude only the attention kernel from the cudagraph, while keeping all the other modules and the memory allocation operations in the cudagraph. This is why the attention operation in V1 takes the output tensor as one of its inputs.
|
||||
|
||||
The cudagraphs are captured and managed by the compiler backend, and replayed when the batch size has a corresponding cudagraph captured. The caller of the model (the model runner) only needs to make sure it manages the input buffers correctly. All of the intermediate buffers are managed automatically by the compiler backend.
|
||||
|
||||
By default, vLLM will try to determine a set of sizes for which to capture cudagraphs. You can also override it using the config `cudagraph_capture_sizes`:
|
||||
|
||||
```
|
||||
vllm serve meta-llama/Llama-3.2-1B --compilation-config '{"cudagraph_capture_sizes": [1, 2, 4, 8]}'
|
||||
```
|
||||
|
||||
Then it will only capture cudagraphs for the specified sizes, which can be useful for fine-grained control over cudagraph capture.
|
||||
|
||||
### Full Cudagraph capture
|
||||
|
||||
It is possible to include attention as part of the cudagraph if using an attention backend that is cudagraph compatible. This can improve performance in some cases such as decode speed for smaller models. Enable this using `--compilation-config '{"full_cuda_graph": true}'`.
|
||||
|
||||
Currently only FlashAttention 3 is compatible, and only when cascade attention is disabled.
|
||||
@@ -1,28 +0,0 @@
|
||||
(automatic-prefix-caching)=
|
||||
|
||||
# Automatic Prefix Caching
|
||||
|
||||
## Introduction
|
||||
|
||||
Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part.
|
||||
|
||||
:::{note}
|
||||
Technical details on how vLLM implements APC can be found [here](#design-automatic-prefix-caching).
|
||||
:::
|
||||
|
||||
## Enabling APC in vLLM
|
||||
|
||||
Set `enable_prefix_caching=True` in the vLLM engine to enable APC. Here is an example:
|
||||
|
||||
<gh-file:examples/offline_inference/automatic_prefix_caching.py>
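For quick reference, a minimal offline sketch (the model name and prompts here are just illustrative):

```python
from vllm import LLM, SamplingParams

# Enable Automatic Prefix Caching when constructing the engine.
llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_prefix_caching=True)

# Requests sharing a long common prefix (e.g. the same document) will reuse
# the cached KV blocks of that prefix instead of recomputing them.
long_document = "..."  # placeholder for a long shared context
outputs = llm.generate(
    [long_document + "\nQuestion 1: ...", long_document + "\nQuestion 2: ..."],
    SamplingParams(max_tokens=64),
)
```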
|
||||
|
||||
## Example workloads
|
||||
|
||||
We describe two example workloads where APC can provide a huge performance benefit:
|
||||
|
||||
- Long document query, where the user repeatedly queries the same long document (e.g. software manual or annual report) with different queries. In this case, instead of processing the long document again and again, APC allows vLLM to process this long document *only once*, and all future requests can avoid recomputing this long document by reusing its KV cache. This allows vLLM to serve future requests with much higher throughput and much lower latency.
|
||||
- Multi-round conversation, where the user may chat with the application multiple times in the same chatting session. In this case, instead of processing the whole chatting history again and again, APC allows vLLM to reuse the processing results of the chat history across all future rounds of conversation, allowing vLLM to serve future requests with much higher throughput and much lower latency.
|
||||
|
||||
## Limits
|
||||
|
||||
APC in general does not reduce the performance of vLLM. That said, APC only reduces the time spent processing the queries (the prefilling phase) and does not reduce the time spent generating new tokens (the decoding phase). So APC does not bring a performance gain when vLLM spends most of its time generating answers to the queries (e.g. when the length of the answer is long), or when new queries do not share a prefix with any existing query (so the computation cannot be reused).
|
||||
@@ -1,476 +0,0 @@
|
||||
(compatibility-matrix)=
|
||||
|
||||
# Compatibility Matrix
|
||||
|
||||
The tables below show mutually exclusive features and the level of support on some hardware.
|
||||
|
||||
The symbols used have the following meanings:
|
||||
|
||||
- ✅ = Full compatibility
|
||||
- 🟠 = Partial compatibility
|
||||
- ❌ = No compatibility
|
||||
|
||||
:::{note}
|
||||
Check the ❌ or 🟠 with links to see the tracking issue for an unsupported feature/hardware combination.
|
||||
:::
|
||||
|
||||
## Feature x Feature
|
||||
|
||||
:::{raw} html
|
||||
<style>
|
||||
/* Make smaller to try to improve readability */
|
||||
td {
|
||||
font-size: 0.8rem;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
th {
|
||||
text-align: center;
|
||||
font-size: 0.8rem;
|
||||
}
|
||||
</style>
|
||||
:::
|
||||
|
||||
:::{list-table}
|
||||
:header-rows: 1
|
||||
:stub-columns: 1
|
||||
:widths: auto
|
||||
:class: vertical-table-header
|
||||
|
||||
- * Feature
|
||||
* [CP](#chunked-prefill)
|
||||
* [APC](#automatic-prefix-caching)
|
||||
* [LoRA](#lora-adapter)
|
||||
* <abbr title="Prompt Adapter">prmpt adptr</abbr>
|
||||
* [SD](#spec-decode)
|
||||
* CUDA graph
|
||||
* <abbr title="Pooling Models">pooling</abbr>
|
||||
* <abbr title="Encoder-Decoder Models">enc-dec</abbr>
|
||||
* <abbr title="Logprobs">logP</abbr>
|
||||
* <abbr title="Prompt Logprobs">prmpt logP</abbr>
|
||||
* <abbr title="Async Output Processing">async output</abbr>
|
||||
* multi-step
|
||||
* <abbr title="Multimodal Inputs">mm</abbr>
|
||||
* best-of
|
||||
* beam-search
|
||||
* <abbr title="Guided Decoding">guided dec</abbr>
|
||||
- * [CP](#chunked-prefill)
|
||||
* ✅
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
- * [APC](#automatic-prefix-caching)
|
||||
* ✅
|
||||
* ✅
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
- * [LoRA](#lora-adapter)
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
- * <abbr title="Prompt Adapter">prmpt adptr</abbr>
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
- * [SD](#spec-decode)
|
||||
* ✅
|
||||
* ✅
|
||||
* ❌
|
||||
* ✅
|
||||
* ✅
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
- * CUDA graph
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
- * <abbr title="Pooling Models">pooling</abbr>
|
||||
* ❌
|
||||
* ❌
|
||||
* ❌
|
||||
* ❌
|
||||
* ❌
|
||||
* ❌
|
||||
* ✅
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
- * <abbr title="Encoder-Decoder Models">enc-dec</abbr>
|
||||
* ❌
|
||||
* [❌](gh-issue:7366)
|
||||
* ❌
|
||||
* ❌
|
||||
* [❌](gh-issue:7366)
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
- * <abbr title="Logprobs">logP</abbr>
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ❌
|
||||
* ✅
|
||||
* ✅
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
- * <abbr title="Prompt Logprobs">prmpt logP</abbr>
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ❌
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
- * <abbr title="Async Output Processing">async output</abbr>
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ❌
|
||||
* ✅
|
||||
* ❌
|
||||
* ❌
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
- * multi-step
|
||||
* ❌
|
||||
* ✅
|
||||
* ❌
|
||||
* ✅
|
||||
* ❌
|
||||
* ✅
|
||||
* ❌
|
||||
* ❌
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
- * <abbr title="Multimodal Inputs">mm</abbr>
|
||||
* ✅
|
||||
* [🟠](gh-pr:8348)
|
||||
* [🟠](gh-pr:4194)
|
||||
* ❔
|
||||
* ❔
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ❔
|
||||
* ✅
|
||||
*
|
||||
*
|
||||
*
|
||||
- * best-of
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* [❌](gh-issue:6137)
|
||||
* ✅
|
||||
* ❌
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ❔
|
||||
* [❌](gh-issue:7968)
|
||||
* ✅
|
||||
* ✅
|
||||
*
|
||||
*
|
||||
- * beam-search
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* [❌](gh-issue:6137)
|
||||
* ✅
|
||||
* ❌
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ❔
|
||||
* [❌](gh-issue:7968)
|
||||
* ❔
|
||||
* ✅
|
||||
* ✅
|
||||
*
|
||||
- * <abbr title="Guided Decoding">guided dec</abbr>
|
||||
* ✅
|
||||
* ✅
|
||||
* ❔
|
||||
* ❔
|
||||
* [❌](gh-issue:11484)
|
||||
* ✅
|
||||
* ❌
|
||||
* ❔
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* [❌](gh-issue:9893)
|
||||
* ❔
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
:::
|
||||
|
||||
(feature-x-hardware)=
|
||||
|
||||
## Feature x Hardware
|
||||
|
||||
:::{list-table}
|
||||
:header-rows: 1
|
||||
:stub-columns: 1
|
||||
:widths: auto
|
||||
|
||||
- * Feature
|
||||
* Volta
|
||||
* Turing
|
||||
* Ampere
|
||||
* Ada
|
||||
* Hopper
|
||||
* CPU
|
||||
* AMD
|
||||
- * [CP](#chunked-prefill)
|
||||
* [❌](gh-issue:2729)
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
- * [APC](#automatic-prefix-caching)
|
||||
* [❌](gh-issue:3687)
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
- * [LoRA](#lora-adapter)
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
- * <abbr title="Prompt Adapter">prmpt adptr</abbr>
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* [❌](gh-issue:8475)
|
||||
* ✅
|
||||
- * [SD](#spec-decode)
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
- * CUDA graph
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ❌
|
||||
* ✅
|
||||
- * <abbr title="Pooling Models">pooling</abbr>
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ❔
|
||||
- * <abbr title="Encoder-Decoder Models">enc-dec</abbr>
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ❌
|
||||
- * <abbr title="Multimodal Inputs">mm</abbr>
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
- * <abbr title="Logprobs">logP</abbr>
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
- * <abbr title="Prompt Logprobs">prmpt logP</abbr>
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
- * <abbr title="Async Output Processing">async output</abbr>
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ❌
|
||||
* ❌
|
||||
- * multi-step
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* [❌](gh-issue:8477)
|
||||
* ✅
|
||||
- * best-of
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
- * beam-search
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
- * <abbr title="Guided Decoding">guided dec</abbr>
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
* ✅
|
||||
:::
|
||||
@@ -1,68 +0,0 @@
|
||||
(disagg-prefill)=
|
||||
|
||||
# Disaggregated Prefilling (experimental)
|
||||
|
||||
This page introduces the disaggregated prefilling feature in vLLM.
|
||||
|
||||
:::{note}
|
||||
This feature is experimental and subject to change.
|
||||
:::
|
||||
|
||||
## Why disaggregated prefilling?
|
||||
|
||||
Two main reasons:
|
||||
|
||||
- **Tuning time-to-first-token (TTFT) and inter-token latency (ITL) separately**. Disaggregated prefilling puts the prefill and decode phases of LLM inference in different vLLM instances. This gives you the flexibility to assign different parallel strategies (e.g. `tp` and `pp`) to tune TTFT without affecting ITL, or to tune ITL without affecting TTFT.
|
||||
- **Controlling tail ITL**. Without disaggregated prefilling, vLLM may insert prefill jobs during the decoding of a request, which results in higher tail latency. Disaggregated prefilling helps solve this issue and control tail ITL. Chunked prefill with a proper chunk size can also achieve the same goal, but in practice it's hard to figure out the correct chunk size value, so disaggregated prefilling is a much more reliable way to control tail ITL.
|
||||
|
||||
:::{note}
|
||||
Disaggregated prefill DOES NOT improve throughput.
|
||||
:::
|
||||
|
||||
## Usage example
|
||||
|
||||
Please refer to <gh-file:examples/online_serving/disaggregated_prefill.sh> for the example usage of disaggregated prefilling.
|
||||
|
||||
## Benchmarks
|
||||
|
||||
Please refer to <gh-file:benchmarks/disagg_benchmarks> for disaggregated prefilling benchmarks.
|
||||
|
||||
## Development
|
||||
|
||||
We implement disaggregated prefilling by running 2 vLLM instances: one for prefill (the prefill instance) and one for decode (the decode instance). We then use a connector to transfer the prefill KV caches and results from the prefill instance to the decode instance.
|
||||
|
||||
The disaggregated prefilling implementation is under `vllm/distributed/kv_transfer`.
|
||||
|
||||
Key abstractions for disaggregated prefilling (an interface sketch follows the list):
|
||||
|
||||
- **Connector**: A connector allows the **KV consumer** to retrieve the KV caches of a batch of requests from the **KV producer**.
|
||||
- **LookupBuffer**: A LookupBuffer provides two APIs: `insert` KV cache and `drop_select` KV cache. The semantics of `insert` and `drop_select` are similar to SQL: `insert` inserts a KV cache into the buffer, and `drop_select` returns the KV cache that matches the given condition and drops it from the buffer.
|
||||
- **Pipe**: A single-direction FIFO pipe for tensor transmission. It supports `send_tensor` and `recv_tensor`.
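An interface sketch of these abstractions (simplified for illustration; the exact signatures in `vllm/distributed/kv_transfer` may differ):

```python
from abc import ABC, abstractmethod

import torch


class Pipe(ABC):
    """Single-direction FIFO pipe for tensor transmission."""

    @abstractmethod
    def send_tensor(self, tensor: torch.Tensor) -> None: ...

    @abstractmethod
    def recv_tensor(self) -> torch.Tensor: ...


class LookupBuffer(ABC):
    """Buffer with SQL-like insert / drop_select semantics."""

    @abstractmethod
    def insert(self, tokens: torch.Tensor, kv_cache: torch.Tensor) -> None:
        """Non-blocking: publish the KV cache computed for `tokens`."""

    @abstractmethod
    def drop_select(self, tokens: torch.Tensor) -> torch.Tensor:
        """Blocking: return the KV cache matching `tokens` and drop it from the buffer."""


class Connector(ABC):
    """Lets the KV consumer retrieve KV caches produced by the KV producer."""

    @abstractmethod
    def send_kv_caches(self, requests, kv_caches) -> None: ...

    @abstractmethod
    def recv_kv_caches(self, requests):
        """Return the KV caches for a batch of requests."""
```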
|
||||
|
||||
:::{note}
|
||||
`insert` is a non-blocking operation but `drop_select` is a blocking operation.
|
||||
:::
|
||||
|
||||
Here is a figure illustrating how the above 3 abstractions are organized:
|
||||
|
||||
:::{image} /assets/features/disagg_prefill/abstraction.jpg
|
||||
:alt: Disaggregated prefilling abstractions
|
||||
:::
|
||||
|
||||
The workflow of disaggregated prefilling is as follows:
|
||||
|
||||
:::{image} /assets/features/disagg_prefill/overview.jpg
|
||||
:alt: Disaggregated prefilling workflow
|
||||
:::
|
||||
|
||||
The `buffer` corresponds to the `insert` API in LookupBuffer, and `drop_select` corresponds to the `drop_select` API in LookupBuffer.
|
||||
|
||||
## Third-party contributions
|
||||
|
||||
Disaggregated prefilling is highly related to infrastructure, so vLLM relies on third-party connectors for production-level disaggregated prefilling (and the vLLM team will actively review and merge new PRs for third-party connectors).
|
||||
|
||||
We recommend three implementation approaches:
|
||||
|
||||
- **Fully-customized connector**: Implement your own `Connector`, call third-party libraries to send and receive KV caches, and do much more (like editing vLLM's model input to perform customized prefilling). This approach gives you the most control, but at the risk of being incompatible with future vLLM versions.
|
||||
- **Database-like connector**: Implement your own `LookupBuffer` and support the `insert` and `drop_select` APIs just like SQL.
|
||||
- **Distributed P2P connector**: Implement your own `Pipe` and support the `send_tensor` and `recv_tensor` APIs, just like `torch.distributed`.
|
||||
@@ -1,270 +0,0 @@
|
||||
(lora-adapter)=
|
||||
|
||||
# LoRA Adapters
|
||||
|
||||
This document shows you how to use [LoRA adapters](https://arxiv.org/abs/2106.09685) with vLLM on top of a base model.
|
||||
|
||||
LoRA adapters can be used with any vLLM model that implements {class}`~vllm.model_executor.models.interfaces.SupportsLoRA`.
|
||||
|
||||
Adapters can be efficiently served on a per-request basis with minimal overhead. First we download the adapter(s) and save
|
||||
them locally with
|
||||
|
||||
```python
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
sql_lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
|
||||
```
|
||||
|
||||
Then we instantiate the base model and pass in the `enable_lora=True` flag:
|
||||
|
||||
```python
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True)
|
||||
```
|
||||
|
||||
We can now submit the prompts and call `llm.generate` with the `lora_request` parameter. The first parameter
|
||||
of `LoRARequest` is a human-identifiable name, the second parameter is a globally unique ID for the adapter, and
|
||||
the third parameter is the path to the LoRA adapter.
|
||||
|
||||
```python
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
max_tokens=256,
|
||||
stop=["[/assistant]"]
|
||||
)
|
||||
|
||||
prompts = [
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
|
||||
]
|
||||
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest("sql_adapter", 1, sql_lora_path)
|
||||
)
|
||||
```
|
||||
|
||||
Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
|
||||
|
||||
## Serving LoRA Adapters
|
||||
|
||||
LoRA adapted models can also be served with the OpenAI-compatible vLLM server. To do so, we use
|
||||
`--lora-modules {name}={path} {name}={path}` to specify each LoRA module when we kick off the server:
|
||||
|
||||
```bash
|
||||
vllm serve meta-llama/Llama-2-7b-hf \
|
||||
--enable-lora \
|
||||
--lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/
|
||||
```
|
||||
|
||||
:::{note}
|
||||
The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one.
|
||||
:::
|
||||
|
||||
The server entrypoint accepts all other LoRA configuration parameters (`max_loras`, `max_lora_rank`, `max_cpu_loras`,
|
||||
etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along
|
||||
with its base model (if `jq` is not installed, you can follow [this guide](https://jqlang.org/download/) to install it):
|
||||
|
||||
```bash
|
||||
curl localhost:8000/v1/models | jq .
|
||||
{
|
||||
"object": "list",
|
||||
"data": [
|
||||
{
|
||||
"id": "meta-llama/Llama-2-7b-hf",
|
||||
"object": "model",
|
||||
...
|
||||
},
|
||||
{
|
||||
"id": "sql-lora",
|
||||
"object": "model",
|
||||
...
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
Requests can specify the LoRA adapter as if it were any other model via the `model` request parameter. The requests will be
|
||||
processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other
|
||||
LoRA adapter requests if they were provided and `max_loras` is set high enough).
|
||||
|
||||
The following is an example request:
|
||||
|
||||
```bash
|
||||
curl http://localhost:8000/v1/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "sql-lora",
|
||||
"prompt": "San Francisco is a",
|
||||
"max_tokens": 7,
|
||||
"temperature": 0
|
||||
}' | jq
|
||||
```
|
||||
|
||||
## Dynamically serving LoRA Adapters
|
||||
|
||||
In addition to serving LoRA adapters at server startup, the vLLM server supports dynamically configuring LoRA adapters at runtime through dedicated API endpoints and plugins. This feature can be particularly useful when the flexibility to change models on-the-fly is needed.
|
||||
|
||||
Note: Enabling this feature in production environments is risky, as it allows users to modify which model adapters are loaded.
|
||||
|
||||
To enable dynamic LoRA configuration, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING`
|
||||
is set to `True`.
|
||||
|
||||
```bash
|
||||
export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
|
||||
```
|
||||
|
||||
### Using API Endpoints
|
||||
Loading a LoRA Adapter:
|
||||
|
||||
To dynamically load a LoRA adapter, send a POST request to the `/v1/load_lora_adapter` endpoint with the necessary
|
||||
details of the adapter to be loaded. The request payload should include the name and path to the LoRA adapter.
|
||||
|
||||
Example request to load a LoRA adapter:
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:8000/v1/load_lora_adapter \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"lora_name": "sql_adapter",
|
||||
"lora_path": "/path/to/sql-lora-adapter"
|
||||
}'
|
||||
```
|
||||
|
||||
Upon a successful request, `vllm serve` responds with a `200 OK` status code, and `curl` returns the response body: `Success: LoRA adapter 'sql_adapter' added successfully`. If an error occurs, such as if the adapter
|
||||
cannot be found or loaded, an appropriate error message will be returned.
|
||||
|
||||
Unloading a LoRA Adapter:
|
||||
|
||||
To unload a LoRA adapter that has been previously loaded, send a POST request to the `/v1/unload_lora_adapter` endpoint
|
||||
with the name or ID of the adapter to be unloaded.
|
||||
|
||||
Upon a successful request, `vllm serve` responds with a `200 OK` status code, and `curl` returns the response body: `Success: LoRA adapter 'sql_adapter' removed successfully`.
|
||||
|
||||
Example request to unload a LoRA adapter:
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:8000/v1/unload_lora_adapter \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"lora_name": "sql_adapter"
|
||||
}'
|
||||
```
|
||||
|
||||
### Using Plugins
|
||||
Alternatively, you can use the LoRAResolver plugin to dynamically load LoRA adapters. LoRAResolver plugins enable you to load LoRA adapters from both local and remote sources, such as the local file system and S3. On every request, when a new model name appears that hasn't been loaded yet, the LoRAResolver will try to resolve and load the corresponding LoRA adapter.
|
||||
|
||||
You can set up multiple LoRAResolver plugins if you want to load LoRA adapters from different sources. For example, you might have one resolver for local files and another for S3 storage. vLLM will load the first LoRA adapter that it finds.
|
||||
|
||||
You can either install existing plugins or implement your own. By default, vLLM comes with a [resolver plugin to load LoRA adapters from a local directory](https://github.com/vllm-project/vllm/tree/main/vllm/plugins/lora_resolvers).
|
||||
To enable this resolver, set `VLLM_ALLOW_RUNTIME_LORA_UPDATING` to True, set `VLLM_PLUGINS` to include `lora_filesystem_resolver`, and then set `VLLM_LORA_RESOLVER_CACHE_DIR` to a local directory. When vLLM receives a request using a LoRA adapter `foobar`,
|
||||
it will first look in the local directory for a directory `foobar`, and attempt to load the contents of that directory as a LoRA adapter. If successful, the request will complete as normal and
|
||||
that adapter will then be available for normal use on the server.
|
||||
|
||||
Alternatively, follow these example steps to implement your own plugin:
|
||||
1. Implement the LoRAResolver interface.
|
||||
|
||||
Example of a simple S3 LoRAResolver implementation:
|
||||
|
||||
```python
|
||||
import os
|
||||
import s3fs
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.lora.resolver import LoRAResolver
|
||||
|
||||
class S3LoRAResolver(LoRAResolver):
|
||||
def __init__(self):
|
||||
self.s3 = s3fs.S3FileSystem()
|
||||
self.s3_path_format = os.getenv("S3_PATH_TEMPLATE")
|
||||
self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE")
|
||||
|
||||
async def resolve_lora(self, base_model_name, lora_name):
|
||||
s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
|
||||
local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
|
||||
|
||||
# Download the LoRA from S3 to the local path
|
||||
await self.s3._get(
|
||||
s3_path, local_path, recursive=True, maxdepth=1
|
||||
)
|
||||
|
||||
lora_request = LoRARequest(
|
||||
lora_name=lora_name,
|
||||
lora_path=local_path,
|
||||
lora_int_id=abs(hash(lora_name))
|
||||
)
|
||||
return lora_request
|
||||
```
|
||||
|
||||
2. Register LoRAResolver plugin.
|
||||
|
||||
```python
|
||||
from vllm.lora.resolver import LoRAResolverRegistry
|
||||
|
||||
s3_resolver = S3LoRAResolver()
|
||||
LoRAResolverRegistry.register_resolver("s3_resolver", s3_resolver)
|
||||
```
|
||||
|
||||
For more details, refer to the [vLLM's Plugins System](../design/plugin_system.md).
|
||||
|
||||
## New format for `--lora-modules`
|
||||
|
||||
In the previous version, users would provide LoRA modules via the following format, either as a key-value pair or in JSON format. For example:
|
||||
|
||||
```bash
|
||||
--lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/
|
||||
```
|
||||
|
||||
This format only included the `name` and `path` for each LoRA module, but did not provide a way to specify a `base_model_name`.
|
||||
Now, you can specify a `base_model_name` alongside the `name` and `path` using JSON format. For example:
|
||||
|
||||
```bash
|
||||
--lora-modules '{"name": "sql-lora", "path": "/path/to/lora", "base_model_name": "meta-llama/Llama-2-7b"}'
|
||||
```
|
||||
|
||||
To provide the backward compatibility support, you can still use the old key-value format (name=path), but the `base_model_name` will remain unspecified in that case.
|
||||
|
||||
## LoRA model lineage in model card
|
||||
|
||||
The new format of `--lora-modules` is mainly to support the display of parent model information in the model card. Here's an explanation of how your current response supports this:
|
||||
|
||||
- The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter.
|
||||
- The `root` field points to the artifact location of the lora adapter.
|
||||
|
||||
```bash
|
||||
$ curl http://localhost:8000/v1/models
|
||||
|
||||
{
|
||||
"object": "list",
|
||||
"data": [
|
||||
{
|
||||
"id": "meta-llama/Llama-2-7b-hf",
|
||||
"object": "model",
|
||||
"created": 1715644056,
|
||||
"owned_by": "vllm",
|
||||
"root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/",
|
||||
"parent": null,
|
||||
"permission": [
|
||||
{
|
||||
.....
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "sql-lora",
|
||||
"object": "model",
|
||||
"created": 1715644056,
|
||||
"owned_by": "vllm",
|
||||
"root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/",
|
||||
"parent": meta-llama/Llama-2-7b-hf,
|
||||
"permission": [
|
||||
{
|
||||
....
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
@@ -1,541 +0,0 @@
(multimodal-inputs)=

# Multimodal Inputs

This page teaches you how to pass multi-modal inputs to [multi-modal models](#supported-mm-models) in vLLM.

:::{note}
We are actively iterating on multi-modal support. See [this RFC](gh-issue:4194) for upcoming changes,
and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests.
:::

## Offline Inference

To input multi-modal data, follow this schema in {class}`vllm.inputs.PromptType`:

- `prompt`: The prompt should follow the format that is documented on HuggingFace.
- `multi_modal_data`: This is a dictionary that follows the schema defined in {class}`vllm.multimodal.inputs.MultiModalDataDict`.

### Image Inputs

You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:

```python
import PIL.Image

from vllm import LLM

llm = LLM(model="llava-hf/llava-1.5-7b-hf")

# Refer to the HuggingFace repo for the correct format to use
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"

# Load the image using PIL.Image
image = PIL.Image.open(...)

# Single prompt inference
outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {"image": image},
})

for o in outputs:
    generated_text = o.outputs[0].text
    print(generated_text)

# Batch inference
image_1 = PIL.Image.open(...)
image_2 = PIL.Image.open(...)
outputs = llm.generate(
    [
        {
            "prompt": "USER: <image>\nWhat is the content of this image?\nASSISTANT:",
            "multi_modal_data": {"image": image_1},
        },
        {
            "prompt": "USER: <image>\nWhat's the color of this image?\nASSISTANT:",
            "multi_modal_data": {"image": image_2},
        }
    ]
)

for o in outputs:
    generated_text = o.outputs[0].text
    print(generated_text)
```

Full example: <gh-file:examples/offline_inference/vision_language.py>

To substitute multiple images inside the same text prompt, you can pass in a list of images instead:

```python
import PIL.Image

from vllm import LLM

llm = LLM(
    model="microsoft/Phi-3.5-vision-instruct",
    trust_remote_code=True,  # Required to load Phi-3.5-vision
    max_model_len=4096,  # Otherwise, it may not fit in smaller GPUs
    limit_mm_per_prompt={"image": 2},  # The maximum number to accept
)

# Refer to the HuggingFace repo for the correct format to use
prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n"

# Load the images using PIL.Image
image1 = PIL.Image.open(...)
image2 = PIL.Image.open(...)

outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {
        "image": [image1, image2]
    },
})

for o in outputs:
    generated_text = o.outputs[0].text
    print(generated_text)
```

Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>

Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:

```python
from vllm import LLM

# Specify the maximum number of frames per video to be 4. This can be changed.
llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})

# Create the request payload.
video_frames = ...  # Load your video, making sure it only has the number of frames specified earlier.
message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
    ],
}
for i in range(len(video_frames)):
    base64_image = encode_image(video_frames[i])  # encode_image is a user-defined helper that returns the frame as a base64-encoded JPEG string.
    new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
    message["content"].append(new_image)

# Perform inference and log output.
outputs = llm.chat([message])

for o in outputs:
    generated_text = o.outputs[0].text
    print(generated_text)
```

### Video Inputs

You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary instead of using multi-image input.
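For instance, here is a minimal sketch of offline video inference. The placeholder tokens in the prompt are assumed from Qwen2-VL's format, and randomly generated frames stand in for a real decoded video; in practice, load the frames with a library such as OpenCV or decord.

```python
import numpy as np

from vllm import LLM

llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"video": 1})

# The placeholder tokens below are assumed from Qwen2-VL's prompt format;
# refer to the HuggingFace repo of your model for the correct format to use.
prompt = ("<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>"
          "Describe this video.<|im_end|>\n<|im_start|>assistant\n")

# A video is an array of frames with shape (num_frames, height, width, 3).
# Random frames are used here purely as a placeholder.
video = np.random.randint(0, 255, size=(16, 224, 224, 3), dtype=np.uint8)

outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {"video": video},
})

for o in outputs:
    print(o.outputs[0].text)
```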

Full example: <gh-file:examples/offline_inference/vision_language.py>

### Audio Inputs

You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary.
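As an illustration, here is a minimal sketch using Ultravox. The `<|audio|>` placeholder and the use of the model's chat template are assumptions based on Ultravox's conventions; the bundled `winning_call` asset stands in for your own `(array, sampling_rate)` tuple.

```python
from transformers import AutoTokenizer

from vllm import LLM
from vllm.assets.audio import AudioAsset

model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
llm = LLM(model=model_name, max_model_len=4096)

# Build the prompt with the model's chat template; `<|audio|>` marks where
# the audio is inserted (assumed placeholder for Ultravox models).
tokenizer = AutoTokenizer.from_pretrained(model_name)
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "<|audio|>\nWhat is happening in this audio?"}],
    tokenize=False,
    add_generation_prompt=True,
)

# Any (numpy_array, sampling_rate) tuple can be passed here.
audio_and_sample_rate = AudioAsset("winning_call").audio_and_sample_rate

outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {"audio": audio_and_sample_rate},
})

for o in outputs:
    print(o.outputs[0].text)
```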

Full example: <gh-file:examples/offline_inference/audio_language.py>

### Embedding Inputs

To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.

```python
import torch

from vllm import LLM

# Inference with image embeddings as input
llm = LLM(model="llava-hf/llava-1.5-7b-hf")

# Refer to the HuggingFace repo for the correct format to use
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"

# Embeddings for single image
# torch.Tensor of shape (1, image_feature_size, hidden_size of LM)
image_embeds = torch.load(...)

outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {"image": image_embeds},
})

for o in outputs:
    generated_text = o.outputs[0].text
    print(generated_text)
```

For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings:

```python
import torch

from vllm import LLM

# Construct the prompt based on your model
prompt = ...

# Embeddings for multiple images
# torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM)
image_embeds = torch.load(...)

# Qwen2-VL
llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
mm_data = {
    "image": {
        "image_embeds": image_embeds,
        # image_grid_thw is needed to calculate positional encoding.
        "image_grid_thw": torch.load(...),  # torch.Tensor of shape (1, 3)
    }
}

# MiniCPM-V
llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4})
mm_data = {
    "image": {
        "image_embeds": image_embeds,
        # image_sizes is needed to calculate details of the sliced image.
        "image_sizes": [image.size for image in images],  # `images` is the list of input PIL images
    }
}

outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": mm_data,
})

for o in outputs:
    generated_text = o.outputs[0].text
    print(generated_text)
```

## Online Serving

Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat).

:::{important}
A chat template is **required** to use the Chat Completions API.
For HF format models, the default chat template is defined inside `chat_template.json` or `tokenizer_config.json`.

If no default chat template is available, we will first look for a built-in fallback in <gh-file:vllm/transformers_utils/chat_templates/registry.py>.
If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.

For certain models, we provide alternative chat templates inside <gh-dir:examples>.
For example, VLM2Vec uses <gh-file:examples/template_vlm2vec.jinja> which is different from the default one for Phi-3-Vision.
:::

### Image Inputs

Image input is supported according to the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision).
Here is a simple example using Phi-3.5-Vision.

First, launch the OpenAI-compatible server:

```bash
vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
  --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}'
```

Then, you can use the OpenAI client as follows:

```python
from openai import OpenAI

openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# Single-image input inference
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

chat_response = client.chat.completions.create(
    model="microsoft/Phi-3.5-vision-instruct",
    messages=[{
        "role": "user",
        "content": [
            # NOTE: The prompt formatting with the image token `<image>` is not needed
            # since the prompt will be processed automatically by the API server.
            {"type": "text", "text": "What's in this image?"},
            {"type": "image_url", "image_url": {"url": image_url}},
        ],
    }],
)
print("Chat completion output:", chat_response.choices[0].message.content)

# Multi-image input inference
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"

chat_response = client.chat.completions.create(
    model="microsoft/Phi-3.5-vision-instruct",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What are the animals in these images?"},
            {"type": "image_url", "image_url": {"url": image_url_duck}},
            {"type": "image_url", "image_url": {"url": image_url_lion}},
        ],
    }],
)
print("Chat completion output:", chat_response.choices[0].message.content)
```

Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>

:::{tip}
Loading from local file paths is also supported in vLLM: you can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine,
and pass the file path as `url` in the API request, as in the sketch below.
:::
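For instance, assuming the server was launched with `--allowed-local-media-path /data/images`, a request referencing a local file might look like the following sketch (the `file://` URL form and the path are illustrative assumptions):

```python
from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")

chat_response = client.chat.completions.create(
    model="microsoft/Phi-3.5-vision-instruct",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What's in this image?"},
            # Hypothetical file under the allowed local media path.
            {"type": "image_url", "image_url": {"url": "file:///data/images/example.jpg"}},
        ],
    }],
)
print(chat_response.choices[0].message.content)
```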

:::{tip}
There is no need to place image placeholders in the text content of the API request - they are already represented by the image content.
In fact, you can place image placeholders in the middle of the text by interleaving text and image content.
:::

:::{note}
By default, the timeout for fetching images through an HTTP URL is `5` seconds.
You can override this by setting the environment variable:

```console
export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
```

:::

### Video Inputs

Instead of `image_url`, you can pass a video file via `video_url`. Here is a simple example using [LLaVA-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf).

First, launch the OpenAI-compatible server:

```bash
vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model-len 8192
```

Then, you can use the OpenAI client as follows:

```python
from openai import OpenAI

openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

model = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"

# Use video url in the payload
chat_completion_from_url = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this video?"
            },
            {
                "type": "video_url",
                "video_url": {
                    "url": video_url
                },
            },
        ],
    }],
    model=model,
    max_completion_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from video url:", result)
```

Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>

:::{note}
By default, the timeout for fetching videos through an HTTP URL is `30` seconds.
You can override this by setting the environment variable:

```console
export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
```

:::

### Audio Inputs

Audio input is supported according to the [OpenAI Audio API](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in).
Here is a simple example using Ultravox-v0.5-1B.

First, launch the OpenAI-compatible server:

```bash
vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b
```

Then, you can use the OpenAI client as follows:

```python
import base64

import requests
from openai import OpenAI

from vllm.assets.audio import AudioAsset


def encode_base64_content_from_url(content_url: str) -> str:
    """Encode content retrieved from a remote URL to base64 format."""

    with requests.get(content_url) as response:
        response.raise_for_status()
        result = base64.b64encode(response.content).decode('utf-8')

    return result


openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

model = "fixie-ai/ultravox-v0_5-llama-3_2-1b"

# Any format supported by librosa is supported
audio_url = AudioAsset("winning_call").url
audio_base64 = encode_base64_content_from_url(audio_url)

chat_completion_from_base64 = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this audio?"
            },
            {
                "type": "input_audio",
                "input_audio": {
                    "data": audio_base64,
                    "format": "wav"
                },
            },
        ],
    }],
    model=model,
    max_completion_tokens=64,
)

result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from input audio:", result)
```

Alternatively, you can pass `audio_url`, which is the audio counterpart of `image_url` for image input:

```python
chat_completion_from_url = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this audio?"
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": audio_url
                },
            },
        ],
    }],
    model=model,
    max_completion_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from audio url:", result)
```

Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>

:::{note}
By default, the timeout for fetching audio through an HTTP URL is `10` seconds.
You can override this by setting the environment variable:

```console
export VLLM_AUDIO_FETCH_TIMEOUT=<timeout>
```

:::

### Embedding Inputs

To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.

#### Image Embedding Inputs

For image embeddings, you can pass the base64-encoded tensor to the `image_embeds` field.
The following example demonstrates how to pass image embeddings to the OpenAI server:

```python
import base64
import io

import torch
from openai import OpenAI

image_embedding = torch.load(...)
grid_thw = torch.load(...)  # Required by Qwen/Qwen2-VL-2B-Instruct

buffer = io.BytesIO()
torch.save(image_embedding, buffer)
buffer.seek(0)
binary_data = buffer.read()
base64_image_embedding = base64.b64encode(binary_data).decode('utf-8')
# Additional tensors (e.g. grid_thw or image sizes) are encoded the same way
# to obtain base64_image_grid_thw / base64_image_sizes used below.

openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# Basic usage - this is equivalent to the LLaVA example for offline inference
model = "llava-hf/llava-1.5-7b-hf"
embeds = {
    "type": "image_embeds",
    "image_embeds": f"{base64_image_embedding}"
}

# Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
model = "Qwen/Qwen2-VL-2B-Instruct"
embeds = {
    "type": "image_embeds",
    "image_embeds": {
        "image_embeds": f"{base64_image_embedding}",  # Required
        "image_grid_thw": f"{base64_image_grid_thw}"  # Required by Qwen/Qwen2-VL-2B-Instruct
    },
}

model = "openbmb/MiniCPM-V-2_6"
embeds = {
    "type": "image_embeds",
    "image_embeds": {
        "image_embeds": f"{base64_image_embedding}",  # Required
        "image_sizes": f"{base64_image_sizes}"  # Required by openbmb/MiniCPM-V-2_6
    },
}

chat_completion = client.chat.completions.create(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [
            {
                "type": "text",
                "text": "What's in this image?",
            },
            embeds,
        ],
        },
    ],
    model=model,
)
```

:::{note}
Only one message can contain `{"type": "image_embeds"}`.
If used with a model that requires additional parameters, you must also provide a tensor for each of them, e.g. `image_grid_thw`, `image_sizes`, etc.
:::
||||