[Misc] HF Hub LoRA Resolver (#20320)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
This commit is contained in:
@@ -1316,7 +1316,7 @@ steps:
|
||||
- pytest -v -s distributed/test_distributed_oot.py
|
||||
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
||||
- pytest -v -s plugins/lora_resolvers # unit tests for lora resolver plugins
|
||||
|
||||
- label: Pipeline + Context Parallelism Test # 45min
|
||||
timeout_in_minutes: 60
|
||||
|
||||
@@ -10,7 +10,7 @@ receives a request for a LoRA adapter that hasn't been loaded yet, the resolver
|
||||
to locate and load the adapter from their configured storage locations. This enables:
|
||||
|
||||
- **Dynamic LoRA Loading**: Load adapters on-demand without server restarts
|
||||
- **Multiple Storage Backends**: Support for filesystem, S3, and custom backends. The built-in `lora_filesystem_resolver` requires a local storage path, but custom resolvers can be implemented to fetch from any source.
|
||||
- **Multiple Storage Backends**: Support for filesystem, S3, and custom backends. The built-in `lora_filesystem_resolver` requires a local storage path, while the built-in `hf_hub_resolver` will pull LoRA adapters from Huggingface Hub and proceed in an identical manner. In general, custom resolvers can be implemented to fetch from any source.
|
||||
- **Automatic Discovery**: Seamless integration with existing LoRA workflows
|
||||
- **Scalable Deployment**: Centralized adapter management across multiple vLLM instances
|
||||
|
||||
|
||||
@@ -159,10 +159,12 @@ Alternatively, you can use the LoRAResolver plugin to dynamically load LoRA adap
|
||||
|
||||
You can set up multiple LoRAResolver plugins if you want to load LoRA adapters from different sources. For example, you might have one resolver for local files and another for S3 storage. vLLM will load the first LoRA adapter that it finds.
|
||||
|
||||
You can either install existing plugins or implement your own. By default, vLLM comes with a [resolver plugin to load LoRA adapters from a local directory.](https://github.com/vllm-project/vllm/tree/main/vllm/plugins/lora_resolvers)
|
||||
To enable this resolver, set `VLLM_ALLOW_RUNTIME_LORA_UPDATING` to True, set `VLLM_PLUGINS` to include `lora_filesystem_resolver`, and then set `VLLM_LORA_RESOLVER_CACHE_DIR` to a local directory. When vLLM receives a request using a LoRA adapter `foobar`,
|
||||
it will first look in the local directory for a directory `foobar`, and attempt to load the contents of that directory as a LoRA adapter. If successful, the request will complete as normal and
|
||||
that adapter will then be available for normal use on the server.
|
||||
You can either install existing plugins or implement your own. By default, vLLM comes with a [resolver plugin to load LoRA adapters from a local directory, as well as a resolver plugin to load LoRA adapters from repositories on Hugging Face Hub](https://github.com/vllm-project/vllm/tree/main/vllm/plugins/lora_resolvers)
|
||||
To enable either of these resolvers, you must set `VLLM_ALLOW_RUNTIME_LORA_UPDATING` to True.
|
||||
|
||||
- To leverage a local directory, set `VLLM_PLUGINS` to include `lora_filesystem_resolver` and set `VLLM_LORA_RESOLVER_CACHE_DIR` to a local directory. When vLLM receives a request using a LoRA adapter `foobar`,
|
||||
it will first look in the local directory for a directory `foobar`, and attempt to load the contents of that directory as a LoRA adapter. If successful, the request will complete as normal and that adapter will then be available for normal use on the server.
|
||||
- To leverage repositories on Hugging Face Hub, set `VLLM_PLUGINS` to include `lora_hf_hub_resolver` and set `VLLM_LORA_RESOLVER_HF_REPO_LIST` to a comma separated list of repository IDs on Hugging Face Hub. When vLLM receives a request for the LoRA adapter `my/repo/subpath`, it will download the adapter at the `subpath` of `my/repo` if it exists and contains an `adapter_config.json`, then build a request to the cached dir for the adapter, similar to the `lora_filesystem_resolver`. Please note that enabling remote downloads is insecure and not intended for use in production environments.
|
||||
|
||||
Alternatively, follow these example steps to implement your own plugin:
|
||||
|
||||
|
||||
@@ -44,6 +44,7 @@ vllm = "vllm.entrypoints.cli.main:main"
|
||||
|
||||
[project.entry-points."vllm.general_plugins"]
|
||||
lora_filesystem_resolver = "vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver"
|
||||
lora_hf_hub_resolver = "vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver"
|
||||
|
||||
[tool.setuptools_scm]
|
||||
# no extra settings needed, presence enables setuptools-scm
|
||||
|
||||
107
tests/plugins/lora_resolvers/test_hf_hub_resolver.py
Normal file
107
tests/plugins/lora_resolvers/test_hf_hub_resolver.py
Normal file
@@ -0,0 +1,107 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from huggingface_hub.constants import HF_HUB_CACHE
|
||||
|
||||
from vllm.plugins.lora_resolvers.hf_hub_resolver import HfHubResolver
|
||||
|
||||
# Base model paired with the multi-LoRA library repo below.
LORA_LIB_MODEL_NAME = "ibm-granite/granite-3.3-8b-instruct"
# Repo with multiple LoRAs contained in it
LORA_LIB = "ibm-granite/granite-3.3-8b-rag-agent-lib"
# Fully qualified adapter name: <org>/<repo>/<subpath>
LORA_NAME = "ibm-granite/granite-3.3-8b-rag-agent-lib/answerability_prediction_lora"  # noqa: E501
# A real file in the repo that is NOT a LoRA adapter directory.
NON_LORA_SUBPATH = "ibm-granite/granite-3.3-8b-rag-agent-lib/README.md"
# Expected HF cache location for the library repo once downloaded.
LIB_DOWNLOAD_DIR = os.path.join(
    HF_HUB_CACHE, "models--ibm-granite--granite-3.3-8b-rag-agent-lib"
)
INVALID_REPO_NAME = "thisrepodoesnotexist"

# Repo with only one LoRA in the root dir
LORA_REPO_MODEL_NAME = "meta-llama/Llama-2-7b-hf"
LORA_REPO = "yard1/llama-2-7b-sql-lora-test"
# Expected HF cache location for the single-LoRA repo once downloaded.
REPO_DOWNLOAD_DIR = os.path.join(
    HF_HUB_CACHE, "models--yard1--llama-2-7b-sql-lora-test"
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_hf_resolver_with_direct_path():
    """A LoRA name that exactly matches a configured repo resolves at its root."""
    resolver = HfHubResolver([LORA_REPO])
    assert resolver is not None

    request = await resolver.resolve_lora(LORA_REPO_MODEL_NAME, LORA_REPO)
    assert request.lora_name == LORA_REPO
    assert REPO_DOWNLOAD_DIR in request.lora_path
    assert "adapter_config.json" in os.listdir(request.lora_path)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_hf_resolver_with_nested_paths():
    """An adapter living in a subdirectory of the repo is resolvable by subpath."""
    resolver = HfHubResolver([LORA_LIB])
    assert resolver is not None

    request = await resolver.resolve_lora(LORA_LIB_MODEL_NAME, LORA_NAME)
    assert request is not None
    assert request.lora_name == LORA_NAME
    assert LIB_DOWNLOAD_DIR in request.lora_path
    assert "adapter_config.json" in os.listdir(request.lora_path)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_hf_resolver_with_multiple_repos():
    """Resolution still works when more than one repo is configured."""
    resolver = HfHubResolver([LORA_LIB, LORA_REPO])
    assert resolver is not None

    request = await resolver.resolve_lora(LORA_LIB_MODEL_NAME, LORA_NAME)
    assert request is not None
    assert request.lora_name == LORA_NAME
    assert LIB_DOWNLOAD_DIR in request.lora_path
    assert "adapter_config.json" in os.listdir(request.lora_path)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_missing_adapter():
    """A name that matches no configured repo resolves to None."""
    resolver = HfHubResolver([LORA_LIB])
    assert resolver is not None

    result = await resolver.resolve_lora(LORA_LIB_MODEL_NAME, "foobar")
    assert result is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_nonlora_adapter():
    """A repo subpath without an adapter_config.json is rejected."""
    resolver = HfHubResolver([LORA_LIB])
    assert resolver is not None

    result = await resolver.resolve_lora(LORA_LIB_MODEL_NAME, NON_LORA_SUBPATH)
    assert result is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_invalid_repo():
    """Names under a repo that is not in the allow-list resolve to None."""
    resolver = HfHubResolver([LORA_LIB])
    assert resolver is not None

    bad_name = f"{INVALID_REPO_NAME}/foo"
    result = await resolver.resolve_lora(INVALID_REPO_NAME, bad_name)
    assert result is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_trailing_slash():
    """A trailing slash on the LoRA name does not break subpath resolution."""
    resolver = HfHubResolver([LORA_LIB])
    assert resolver is not None

    slashed_name = f"{LORA_NAME}/"
    request = await resolver.resolve_lora(LORA_LIB_MODEL_NAME, slashed_name)
    assert request is not None
    assert request.lora_name == slashed_name
    assert LIB_DOWNLOAD_DIR in request.lora_path
    assert "adapter_config.json" in os.listdir(request.lora_path)
|
||||
@@ -87,6 +87,7 @@ if TYPE_CHECKING:
|
||||
VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5 # seconds
|
||||
VLLM_PLUGINS: list[str] | None = None
|
||||
VLLM_LORA_RESOLVER_CACHE_DIR: str | None = None
|
||||
VLLM_LORA_RESOLVER_HF_REPO_LIST: str | None = None
|
||||
# Deprecated env variables for profiling, kept for backward compatibility
|
||||
# See also vllm/config/profiler.py and `--profiler-config` argument
|
||||
VLLM_TORCH_CUDA_PROFILE: str | None = None
|
||||
@@ -873,6 +874,13 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"VLLM_LORA_RESOLVER_CACHE_DIR": lambda: os.getenv(
|
||||
"VLLM_LORA_RESOLVER_CACHE_DIR", None
|
||||
),
|
||||
# A remote HF repo(s) containing one or more LoRA adapters, which
|
||||
# may be downloaded and leveraged as needed. Only works if plugins
|
||||
# are enabled and VLLM_ALLOW_RUNTIME_LORA_UPDATING is enabled.
|
||||
# Values should be comma separated.
|
||||
"VLLM_LORA_RESOLVER_HF_REPO_LIST": lambda: os.getenv(
|
||||
"VLLM_LORA_RESOLVER_HF_REPO_LIST", None
|
||||
),
|
||||
# Enables torch CUDA profiling if set to 1.
|
||||
# Deprecated, see profiler_config.
|
||||
"VLLM_TORCH_CUDA_PROFILE": lambda: os.getenv("VLLM_TORCH_CUDA_PROFILE"),
|
||||
|
||||
@@ -16,10 +16,20 @@ class FilesystemResolver(LoRAResolver):
|
||||
self, base_model_name: str, lora_name: str
|
||||
) -> LoRARequest | None:
|
||||
lora_path = os.path.join(self.lora_cache_dir, lora_name)
|
||||
if os.path.exists(lora_path):
|
||||
adapter_config_path = os.path.join(
|
||||
self.lora_cache_dir, lora_name, "adapter_config.json"
|
||||
maybe_lora_request = await self._get_lora_req_from_path(
|
||||
lora_name, lora_path, base_model_name
|
||||
)
|
||||
return maybe_lora_request
|
||||
|
||||
async def _get_lora_req_from_path(
|
||||
self, lora_name: str, lora_path: str, base_model_name: str
|
||||
) -> LoRARequest | None:
|
||||
"""Builds a LoraRequest pointing to the lora path if it's a valid
|
||||
LoRA adapter and has a matching base_model_name.
|
||||
"""
|
||||
if os.path.exists(lora_path):
|
||||
adapter_config_path = os.path.join(lora_path, "adapter_config.json")
|
||||
|
||||
if os.path.exists(adapter_config_path):
|
||||
with open(adapter_config_path) as file:
|
||||
adapter_config = json.load(file)
|
||||
|
||||
143
vllm/plugins/lora_resolvers/hf_hub_resolver.py
Normal file
143
vllm/plugins/lora_resolvers/hf_hub_resolver.py
Normal file
@@ -0,0 +1,143 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
from huggingface_hub import HfApi, snapshot_download
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.lora.resolver import LoRAResolverRegistry
|
||||
from vllm.plugins.lora_resolvers.filesystem_resolver import FilesystemResolver
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class HfHubResolver(FilesystemResolver):
    """Resolves LoRA adapters from an allow-list of Hugging Face Hub repos.

    Subclasses FilesystemResolver only to reuse its
    ``_get_lora_req_from_path`` helper for validating the downloaded
    directory and building the LoRARequest; the locate/download step is
    entirely different. NOTE(review): ``super().__init__()`` is deliberately
    not called — this resolver never uses a configured local cache dir and
    relies on the default HF Hub cache instead.
    """

    def __init__(self, repo_list: list[str]):
        # Remote fetching is opt-in; warn loudly since downloading adapters at
        # request time is insecure and not intended for production.
        logger.warning(
            "LoRA is allowing resolution from the following repositories on"
            " HF Hub: %s please note that allowing remote downloads"
            " is not secure, and that this plugin is not intended for use in"
            " production environments.",
            repo_list,
        )

        # Repos that incoming LoRA names are allowed to resolve against.
        self.repo_list: list[str] = repo_list
        # Lazy cache: repo ID -> subpaths ("." for root) that contain an
        # adapter_config.json; populated on first resolution per repo.
        self.adapter_dirs: dict[str, set[str]] = {}

    async def resolve_lora(
        self, base_model_name: str, lora_name: str
    ) -> LoRARequest | None:
        """Resolves potential LoRA requests in a remote repo on HF Hub.
        This is effectively the same behavior as the filesystem resolver, but
        with a snapshot_download on dirs containing an adapter config prior
        to inspecting the cached dir to build a potential LoRA
        request.
        """
        # If a LoRA name begins with the repository name, it's disambiguated
        maybe_repo = await self._resolve_repo(lora_name)

        # If we haven't inspected this repo before, save available adapter dirs
        if maybe_repo is not None and maybe_repo not in self.adapter_dirs:
            self.adapter_dirs[maybe_repo] = await self._get_adapter_dirs(maybe_repo)

        maybe_subpath = await self._resolve_repo_subpath(lora_name, maybe_repo)

        if maybe_repo is None or maybe_subpath is None:
            return None

        # snapshot_download blocks on network I/O, so run it off the event
        # loop; only pull files under the resolved adapter dir (or the whole
        # repo when the adapter sits at the root).
        repo_path = await asyncio.to_thread(
            snapshot_download,
            repo_id=maybe_repo,
            allow_patterns=f"{maybe_subpath}/*" if maybe_subpath != "." else "*",
        )

        lora_path = os.path.join(repo_path, maybe_subpath)
        # Inherited from FilesystemResolver: checks adapter_config.json and
        # the base-model match before building the request.
        maybe_lora_request = await self._get_lora_req_from_path(
            lora_name, lora_path, base_model_name
        )
        return maybe_lora_request

    async def _resolve_repo(self, lora_name: str) -> str | None:
        """Given a fully qualified path to a LoRA with respect to its HF Hub
        repo, match the right repo to potentially download from if one exists.

        Args:
            lora_name: Path to LoRA in HF Hub, e.g., <org>/<repo>/<subpath>,
                match on <org>/<repo> (if it contains an adapter directly) or
                <org>/<repo>/ if it may have one in subdirs.
        """
        for potential_repo in self.repo_list:
            # Require an exact match or a "/" boundary so that repo
            # "org/name" cannot match lora_name "org/name-other/...".
            if lora_name.startswith(potential_repo) and (
                len(lora_name) == len(potential_repo)
                or lora_name[len(potential_repo)] == "/"
            ):
                return potential_repo
        return None

    async def _resolve_repo_subpath(
        self, lora_name: str, maybe_repo: str | None
    ) -> str | None:
        """Given the fully qualified path of the LoRA with respect to the HF
        Repo, get the subpath to download from assuming it's actually got an
        adapter in it.

        Args:
            lora_name: Path to LoRA in HF Hub, e.g., <org>/<repo>/<subpath>
            maybe_repo: Path to the repo to match against if one exists.
        """
        if maybe_repo is None:
            return None
        repo_len = len(maybe_repo)
        if lora_name == maybe_repo or (
            len(lora_name) == repo_len + 1 and lora_name[-1] == "/"
        ):
            # Resolves to the root of the directory
            adapter_dir = "."
        else:
            # It's a subpath; removing trailing slashes if there are any
            adapter_dir = lora_name[repo_len + 1 :].rstrip("/")

        # Only download if the directory actually contains an adapter
        # (callers populate self.adapter_dirs[maybe_repo] before this runs).
        is_adapter = adapter_dir in self.adapter_dirs[maybe_repo]
        return adapter_dir if is_adapter else None

    async def _get_adapter_dirs(self, repo_name: str) -> set[str]:
        """Gets the subpaths within a HF repo that contain an adapter config.

        Args:
            repo_name: Name of the HF hub repo to inspect.
        """
        # list_repo_files performs network I/O; run it in a worker thread.
        repo_files = await asyncio.to_thread(HfApi().list_repo_files, repo_id=repo_name)
        adapter_dirs = {
            os.path.dirname(name)
            for name in repo_files
            if name.endswith("adapter_config.json")
        }
        # dirname of a root-level file is "", so record the root as "." to
        # match the sentinel used by _resolve_repo_subpath.
        if "adapter_config.json" in repo_files:
            adapter_dirs.add(".")
        return adapter_dirs
|
||||
|
||||
|
||||
def register_hf_hub_resolver():
    """Register the HF Hub LoRA Resolver with vLLM.

    The resolver is only registered when VLLM_LORA_RESOLVER_HF_REPO_LIST is
    set AND ``lora_hf_hub_resolver`` is explicitly listed in VLLM_PLUGINS;
    the explicit opt-in guards against unintentionally enabling remote
    downloads. When the repo list is set without the plugin being enabled,
    a warning is logged and nothing is registered.
    """
    hf_repo_list = envs.VLLM_LORA_RESOLVER_HF_REPO_LIST
    is_enabled = (
        envs.VLLM_PLUGINS is not None and "lora_hf_hub_resolver" in envs.VLLM_PLUGINS
    )
    if hf_repo_list:
        if not is_enabled:
            # Fixed the doubled space between "it" and "because" that the
            # original string concatenation produced.
            logger.warning(
                "It appears that VLLM_LORA_RESOLVER_HF_REPO_LIST is set, but "
                "lora_hf_hub_resolver is not enabled in VLLM_PLUGINS; you must"
                " enable this resolver directly in VLLM_PLUGINS to use it"
                " because it allows remote downloads."
            )
        else:
            # Tolerate whitespace around commas and skip empty entries so that
            # e.g. "org/repo-a, org/repo-b" parses as two usable repo IDs
            # (a leading space would otherwise defeat the repo prefix match).
            repos = [repo.strip() for repo in hf_repo_list.split(",") if repo.strip()]
            hf_hub_resolver = HfHubResolver(repos)
            LoRAResolverRegistry.register_resolver("Hf Hub Resolver", hf_hub_resolver)
|
||||
Reference in New Issue
Block a user