[Misc] HF Hub LoRA Resolver (#20320)

Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
This commit is contained in:
Alex Brooks
2026-01-26 06:56:32 -07:00
committed by GitHub
parent 6ca2c91b96
commit 9ac818a551
8 changed files with 280 additions and 9 deletions

View File

@@ -1316,7 +1316,7 @@ steps:
    - pytest -v -s distributed/test_distributed_oot.py
    - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
    - pytest -v -s models/test_oot_registration.py # it needs a clean process
    - pytest -v -s plugins/lora_resolvers # unit tests for lora resolver plugins
- label: Pipeline + Context Parallelism Test # 45min
  timeout_in_minutes: 60

View File

@@ -10,7 +10,7 @@ receives a request for a LoRA adapter that hasn't been loaded yet, the resolver
to locate and load the adapter from their configured storage locations. This enables:

- **Dynamic LoRA Loading**: Load adapters on-demand without server restarts
- **Multiple Storage Backends**: Support for filesystem, S3, and custom backends. The built-in `lora_filesystem_resolver` requires a local storage path, while the built-in `hf_hub_resolver` pulls LoRA adapters from Hugging Face Hub and then proceeds identically. In general, custom resolvers can be implemented to fetch from any source, as the sketch after this list illustrates.
- **Automatic Discovery**: Seamless integration with existing LoRA workflows
- **Scalable Deployment**: Centralized adapter management across multiple vLLM instances
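To make the custom-backend point concrete, here is a minimal sketch of a resolver, assuming the `LoRAResolver` base class and `LoRAResolverRegistry` from `vllm.lora.resolver`. The class name, adapter names, paths, and the hash-based `lora_int_id` below are illustrative assumptions, not part of this commit:

```python
from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry


class StaticDictResolver(LoRAResolver):
    """Toy resolver that maps adapter names to pre-downloaded local paths."""

    def __init__(self, name_to_path: dict[str, str]):
        self.name_to_path = name_to_path

    async def resolve_lora(
        self, base_model_name: str, lora_name: str
    ) -> LoRARequest | None:
        # Return None when this backend does not know the adapter, so vLLM
        # can fall through to the next registered resolver.
        lora_path = self.name_to_path.get(lora_name)
        if lora_path is None:
            return None
        return LoRARequest(
            lora_name=lora_name,
            # Deriving the id from a hash mirrors the built-in resolvers,
            # but is an assumption here.
            lora_int_id=abs(hash(lora_name)),
            lora_path=lora_path,
        )


def register_static_dict_resolver():
    """Function a plugin package could expose via the `vllm.general_plugins`
    entry-point group so vLLM registers the resolver at startup."""
    resolver = StaticDictResolver({"my_adapter": "/adapters/my_adapter"})
    LoRAResolverRegistry.register_resolver("Static Dict Resolver", resolver)
```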

View File

@@ -159,10 +159,12 @@ Alternatively, you can use the LoRAResolver plugin to dynamically load LoRA adap
You can set up multiple LoRAResolver plugins if you want to load LoRA adapters from different sources. For example, you might have one resolver for local files and another for S3 storage. vLLM will load the first LoRA adapter that it finds.

You can either install existing plugins or implement your own. By default, vLLM comes with a [resolver plugin to load LoRA adapters from a local directory, as well as a resolver plugin to load LoRA adapters from repositories on Hugging Face Hub](https://github.com/vllm-project/vllm/tree/main/vllm/plugins/lora_resolvers).

To enable either of these resolvers, you must set `VLLM_ALLOW_RUNTIME_LORA_UPDATING` to True.

- To leverage a local directory, set `VLLM_PLUGINS` to include `lora_filesystem_resolver` and set `VLLM_LORA_RESOLVER_CACHE_DIR` to a local directory. When vLLM receives a request using a LoRA adapter `foobar`, it will first look in the local directory for a directory `foobar` and attempt to load the contents of that directory as a LoRA adapter. If successful, the request will complete as normal and that adapter will then be available for normal use on the server.
- To leverage repositories on Hugging Face Hub, set `VLLM_PLUGINS` to include `lora_hf_hub_resolver` and set `VLLM_LORA_RESOLVER_HF_REPO_LIST` to a comma-separated list of repository IDs on Hugging Face Hub. When vLLM receives a request for the LoRA adapter `my/repo/subpath`, it will download the adapter at the `subpath` of `my/repo` if it exists and contains an `adapter_config.json`, then build a request to the cached directory for the adapter, similar to the `lora_filesystem_resolver`. Please note that enabling remote downloads is insecure and not intended for use in production environments. An example request using this flow is sketched after this list.
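A minimal request sketch, assuming a vLLM OpenAI-compatible server is already running with `VLLM_ALLOW_RUNTIME_LORA_UPDATING=True`, `VLLM_PLUGINS=lora_hf_hub_resolver`, and `VLLM_LORA_RESOLVER_HF_REPO_LIST=my-org/my-lora-library`; all repository and adapter names here are illustrative:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Requesting an unknown model name of the form <org>/<repo>/<subpath> lets the
# hf_hub_resolver download the adapter from Hugging Face Hub on first use.
completion = client.completions.create(
    model="my-org/my-lora-library/my_adapter",
    prompt="The capital of France is",
    max_tokens=16,
)
print(completion.choices[0].text)
```

Once the first request succeeds, the adapter is registered on the server and subsequent requests for the same name reuse it without another download.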
Alternatively, follow these example steps to implement your own plugin:

View File

@@ -44,6 +44,7 @@ vllm = "vllm.entrypoints.cli.main:main"
[project.entry-points."vllm.general_plugins"]
lora_filesystem_resolver = "vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver"
lora_hf_hub_resolver = "vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver"

[tool.setuptools_scm]
# no extra settings needed, presence enables setuptools-scm
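Both resolvers above are exposed through the `vllm.general_plugins` entry-point group, which is how vLLM discovers general plugins in the installed environment. A small sketch, using only the standard library, to list what is discoverable:

```python
from importlib.metadata import entry_points

# List every general plugin registered under the vllm.general_plugins group,
# including the two LoRA resolvers declared above. Calling ep.load() would
# return the register_* function that vLLM invokes at startup.
for ep in entry_points(group="vllm.general_plugins"):
    print(ep.name, "->", ep.value)
```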

View File

@@ -0,0 +1,107 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os

import pytest
from huggingface_hub.constants import HF_HUB_CACHE

from vllm.plugins.lora_resolvers.hf_hub_resolver import HfHubResolver

LORA_LIB_MODEL_NAME = "ibm-granite/granite-3.3-8b-instruct"
# Repo with multiple LoRAs contained in it
LORA_LIB = "ibm-granite/granite-3.3-8b-rag-agent-lib"
LORA_NAME = "ibm-granite/granite-3.3-8b-rag-agent-lib/answerability_prediction_lora"  # noqa: E501
NON_LORA_SUBPATH = "ibm-granite/granite-3.3-8b-rag-agent-lib/README.md"
LIB_DOWNLOAD_DIR = os.path.join(
    HF_HUB_CACHE, "models--ibm-granite--granite-3.3-8b-rag-agent-lib"
)
INVALID_REPO_NAME = "thisrepodoesnotexist"

# Repo with only one LoRA in the root dir
LORA_REPO_MODEL_NAME = "meta-llama/Llama-2-7b-hf"
LORA_REPO = "yard1/llama-2-7b-sql-lora-test"
REPO_DOWNLOAD_DIR = os.path.join(
    HF_HUB_CACHE, "models--yard1--llama-2-7b-sql-lora-test"
)


@pytest.mark.asyncio
async def test_hf_resolver_with_direct_path():
    hf_resolver = HfHubResolver([LORA_REPO])
    assert hf_resolver is not None

    lora_request = await hf_resolver.resolve_lora(LORA_REPO_MODEL_NAME, LORA_REPO)
    assert lora_request.lora_name == LORA_REPO
    assert REPO_DOWNLOAD_DIR in lora_request.lora_path
    assert "adapter_config.json" in os.listdir(lora_request.lora_path)


@pytest.mark.asyncio
async def test_hf_resolver_with_nested_paths():
    hf_resolver = HfHubResolver([LORA_LIB])
    assert hf_resolver is not None

    lora_request = await hf_resolver.resolve_lora(LORA_LIB_MODEL_NAME, LORA_NAME)
    assert lora_request is not None
    assert lora_request.lora_name == LORA_NAME
    assert LIB_DOWNLOAD_DIR in lora_request.lora_path
    assert "adapter_config.json" in os.listdir(lora_request.lora_path)


@pytest.mark.asyncio
async def test_hf_resolver_with_multiple_repos():
    hf_resolver = HfHubResolver([LORA_LIB, LORA_REPO])
    assert hf_resolver is not None

    lora_request = await hf_resolver.resolve_lora(LORA_LIB_MODEL_NAME, LORA_NAME)
    assert lora_request is not None
    assert lora_request.lora_name == LORA_NAME
    assert LIB_DOWNLOAD_DIR in lora_request.lora_path
    assert "adapter_config.json" in os.listdir(lora_request.lora_path)


@pytest.mark.asyncio
async def test_missing_adapter():
    hf_resolver = HfHubResolver([LORA_LIB])
    assert hf_resolver is not None

    missing_lora_request = await hf_resolver.resolve_lora(LORA_LIB_MODEL_NAME, "foobar")
    assert missing_lora_request is None


@pytest.mark.asyncio
async def test_nonlora_adapter():
    hf_resolver = HfHubResolver([LORA_LIB])
    assert hf_resolver is not None

    readme_request = await hf_resolver.resolve_lora(
        LORA_LIB_MODEL_NAME, NON_LORA_SUBPATH
    )
    assert readme_request is None


@pytest.mark.asyncio
async def test_invalid_repo():
    hf_resolver = HfHubResolver([LORA_LIB])
    assert hf_resolver is not None

    invalid_repo_req = await hf_resolver.resolve_lora(
        INVALID_REPO_NAME,
        f"{INVALID_REPO_NAME}/foo",
    )
    assert invalid_repo_req is None


@pytest.mark.asyncio
async def test_trailing_slash():
    hf_resolver = HfHubResolver([LORA_LIB])
    assert hf_resolver is not None

    lora_request = await hf_resolver.resolve_lora(
        LORA_LIB_MODEL_NAME,
        f"{LORA_NAME}/",
    )
    assert lora_request is not None
    assert lora_request.lora_name == f"{LORA_NAME}/"
    assert LIB_DOWNLOAD_DIR in lora_request.lora_path
    assert "adapter_config.json" in os.listdir(lora_request.lora_path)

View File

@@ -87,6 +87,7 @@ if TYPE_CHECKING:
    VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5  # seconds
    VLLM_PLUGINS: list[str] | None = None
    VLLM_LORA_RESOLVER_CACHE_DIR: str | None = None
    VLLM_LORA_RESOLVER_HF_REPO_LIST: str | None = None
    # Deprecated env variables for profiling, kept for backward compatibility
    # See also vllm/config/profiler.py and `--profiler-config` argument
    VLLM_TORCH_CUDA_PROFILE: str | None = None
@@ -873,6 +874,13 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_LORA_RESOLVER_CACHE_DIR": lambda: os.getenv(
        "VLLM_LORA_RESOLVER_CACHE_DIR", None
    ),
    # A remote HF repo(s) containing one or more LoRA adapters, which
    # may be downloaded and leveraged as needed. Only works if plugins
    # are enabled and VLLM_ALLOW_RUNTIME_LORA_UPDATING is enabled.
    # Values should be comma separated.
    "VLLM_LORA_RESOLVER_HF_REPO_LIST": lambda: os.getenv(
        "VLLM_LORA_RESOLVER_HF_REPO_LIST", None
    ),
    # Enables torch CUDA profiling if set to 1.
    # Deprecated, see profiler_config.
    "VLLM_TORCH_CUDA_PROFILE": lambda: os.getenv("VLLM_TORCH_CUDA_PROFILE"),

View File

@@ -16,10 +16,20 @@ class FilesystemResolver(LoRAResolver):
        self, base_model_name: str, lora_name: str
    ) -> LoRARequest | None:
        lora_path = os.path.join(self.lora_cache_dir, lora_name)
        maybe_lora_request = await self._get_lora_req_from_path(
            lora_name, lora_path, base_model_name
        )
        return maybe_lora_request

    async def _get_lora_req_from_path(
        self, lora_name: str, lora_path: str, base_model_name: str
    ) -> LoRARequest | None:
        """Builds a LoraRequest pointing to the lora path if it's a valid
        LoRA adapter and has a matching base_model_name.
        """
        if os.path.exists(lora_path):
            adapter_config_path = os.path.join(lora_path, "adapter_config.json")
            if os.path.exists(adapter_config_path):
                with open(adapter_config_path) as file:
                    adapter_config = json.load(file)
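The hunk is cut off here by the diff view. Based on the helper's docstring and the fields checked in the tests, the remainder likely resembles the sketch below; the `base_model_name_or_path` key and the hash-based `lora_int_id` are assumptions, not text quoted from the commit:

```python
                # Sketch of the truncated remainder, under the assumptions above:
                # only return a request when the adapter targets the served base model.
                if adapter_config.get("base_model_name_or_path") == base_model_name:
                    return LoRARequest(
                        lora_name=lora_name,
                        lora_int_id=abs(hash(lora_name)),
                        lora_path=lora_path,
                    )
        return None
```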

View File

@@ -0,0 +1,143 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import os

from huggingface_hub import HfApi, snapshot_download

import vllm.envs as envs
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolverRegistry
from vllm.plugins.lora_resolvers.filesystem_resolver import FilesystemResolver

logger = init_logger(__name__)


class HfHubResolver(FilesystemResolver):
    def __init__(self, repo_list: list[str]):
        logger.warning(
            "LoRA is allowing resolution from the following repositories on"
            " HF Hub: %s please note that allowing remote downloads"
            " is not secure, and that this plugin is not intended for use in"
            " production environments.",
            repo_list,
        )
        self.repo_list: list[str] = repo_list
        self.adapter_dirs: dict[str, set[str]] = {}

    async def resolve_lora(
        self, base_model_name: str, lora_name: str
    ) -> LoRARequest | None:
        """Resolves potential LoRA requests in a remote repo on HF Hub.

        This is effectively the same behavior as the filesystem resolver, but
        with a snapshot_download on dirs containing an adapter config prior
        to inspecting the cached dir to build a potential LoRA request.
        """
        # If a LoRA name begins with the repository name, it's disambiguated
        maybe_repo = await self._resolve_repo(lora_name)
        # If we haven't inspected this repo before, save available adapter dirs
        if maybe_repo is not None and maybe_repo not in self.adapter_dirs:
            self.adapter_dirs[maybe_repo] = await self._get_adapter_dirs(maybe_repo)
        maybe_subpath = await self._resolve_repo_subpath(lora_name, maybe_repo)
        if maybe_repo is None or maybe_subpath is None:
            return None

        repo_path = await asyncio.to_thread(
            snapshot_download,
            repo_id=maybe_repo,
            allow_patterns=f"{maybe_subpath}/*" if maybe_subpath != "." else "*",
        )
        lora_path = os.path.join(repo_path, maybe_subpath)
        maybe_lora_request = await self._get_lora_req_from_path(
            lora_name, lora_path, base_model_name
        )
        return maybe_lora_request

    async def _resolve_repo(self, lora_name: str) -> str | None:
        """Given a fully qualified path to a LoRA with respect to its HF Hub
        repo, match the right repo to potentially download from if one exists.

        Args:
            lora_name: Path to LoRA in HF Hub, e.g., <org>/<repo>/<subpath>,
                match on <org>/<repo> (if it contains an adapter directly) or
                <org>/<repo>/ if it may have one in subdirs.
        """
        for potential_repo in self.repo_list:
            if lora_name.startswith(potential_repo) and (
                len(lora_name) == len(potential_repo)
                or lora_name[len(potential_repo)] == "/"
            ):
                return potential_repo
        return None

    async def _resolve_repo_subpath(
        self, lora_name: str, maybe_repo: str | None
    ) -> str | None:
        """Given the fully qualified path of the LoRA with respect to the HF
        Repo, get the subpath to download from assuming it's actually got an
        adapter in it.

        Args:
            lora_name: Path to LoRA in HF Hub, e.g., <org>/<repo>/<subpath>
            maybe_repo: Path to the repo to match against if one exists.
        """
        if maybe_repo is None:
            return None
        repo_len = len(maybe_repo)
        if lora_name == maybe_repo or (
            len(lora_name) == repo_len + 1 and lora_name[-1] == "/"
        ):
            # Resolves to the root of the directory
            adapter_dir = "."
        else:
            # It's a subpath; removing trailing slashes if there are any
            adapter_dir = lora_name[repo_len + 1 :].rstrip("/")
        # Only download if the directory actually contains an adapter
        is_adapter = adapter_dir in self.adapter_dirs[maybe_repo]
        return adapter_dir if is_adapter else None

    async def _get_adapter_dirs(self, repo_name: str) -> set[str]:
        """Gets the subpaths within a HF repo that contain an adapter config.

        Args:
            repo_name: Name of the HF hub repo to inspect.
        """
        repo_files = await asyncio.to_thread(HfApi().list_repo_files, repo_id=repo_name)
        adapter_dirs = {
            os.path.dirname(name)
            for name in repo_files
            if name.endswith("adapter_config.json")
        }
        if "adapter_config.json" in repo_files:
            adapter_dirs.add(".")
        return adapter_dirs


def register_hf_hub_resolver():
    """Register the Hf hub LoRA Resolver with vLLM"""
    hf_repo_list = envs.VLLM_LORA_RESOLVER_HF_REPO_LIST
    is_enabled = (
        envs.VLLM_PLUGINS is not None and "lora_hf_hub_resolver" in envs.VLLM_PLUGINS
    )
    if hf_repo_list:
        if not is_enabled:
            logger.warning(
                "It appears that VLLM_LORA_RESOLVER_HF_REPO_LIST is set, but "
                "lora_hf_hub_resolver is not enabled in VLLM_PLUGINS; you must"
                " enable this resolver directly in VLLM_PLUGINS to use it "
                " because it allows remote downloads."
            )
        else:
            hf_hub_resolver = HfHubResolver(hf_repo_list.split(","))
            LoRAResolverRegistry.register_resolver("Hf Hub Resolver", hf_hub_resolver)
    return