[Misc] Move LRUCache into its own file (#26342)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-07 23:08:40 +08:00
parent 6f59beaf0b
commit c0a7b89d8e
6 changed files with 349 additions and 378 deletions
--- a/tests/utils_/test_cache.py
+++ b/tests/utils_/test_cache.py
@@ -0,0 +1,125 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from vllm.utils.cache import CacheInfo, LRUCache
 class TestLRUCache(LRUCache):
    def _on_remove(self, key, value):
        if not hasattr(self, "_remove_counter"):
            self._remove_counter = 0
        self._remove_counter += 1
 def test_lru_cache():
    cache = TestLRUCache(3)
    assert cache.stat() == CacheInfo(hits=0, total=0)
    assert cache.stat(delta=True) == CacheInfo(hits=0, total=0)
    cache.put(1, 1)
    assert len(cache) == 1
    cache.put(1, 1)
    assert len(cache) == 1
    cache.put(2, 2)
    assert len(cache) == 2
    cache.put(3, 3)
    assert len(cache) == 3
    assert set(cache.cache) == {1, 2, 3}
    cache.put(4, 4)
    assert len(cache) == 3
    assert set(cache.cache) == {2, 3, 4}
    assert cache._remove_counter == 1
    assert cache.get(2) == 2
    assert cache.stat() == CacheInfo(hits=1, total=1)
    assert cache.stat(delta=True) == CacheInfo(hits=1, total=1)
    assert cache[2] == 2
    assert cache.stat() == CacheInfo(hits=2, total=2)
    assert cache.stat(delta=True) == CacheInfo(hits=1, total=1)
    cache.put(5, 5)
    assert set(cache.cache) == {2, 4, 5}
    assert cache._remove_counter == 2
    assert cache.pop(5) == 5
    assert len(cache) == 2
    assert set(cache.cache) == {2, 4}
    assert cache._remove_counter == 3
    assert cache.get(-1) is None
    assert cache.stat() == CacheInfo(hits=2, total=3)
    assert cache.stat(delta=True) == CacheInfo(hits=0, total=1)
    cache.pop(10)
    assert len(cache) == 2
    assert set(cache.cache) == {2, 4}
    assert cache._remove_counter == 3
    cache.get(10)
    assert len(cache) == 2
    assert set(cache.cache) == {2, 4}
    assert cache._remove_counter == 3
    cache.put(6, 6)
    assert len(cache) == 3
    assert set(cache.cache) == {2, 4, 6}
    assert 2 in cache
    assert 4 in cache
    assert 6 in cache
    cache.remove_oldest()
    assert len(cache) == 2
    assert set(cache.cache) == {2, 6}
    assert cache._remove_counter == 4
    cache.clear()
    assert len(cache) == 0
    assert cache._remove_counter == 6
    assert cache.stat() == CacheInfo(hits=0, total=0)
    assert cache.stat(delta=True) == CacheInfo(hits=0, total=0)
    cache._remove_counter = 0
    cache[1] = 1
    assert len(cache) == 1
    cache[1] = 1
    assert len(cache) == 1
    cache[2] = 2
    assert len(cache) == 2
    cache[3] = 3
    assert len(cache) == 3
    assert set(cache.cache) == {1, 2, 3}
    cache[4] = 4
    assert len(cache) == 3
    assert set(cache.cache) == {2, 3, 4}
    assert cache._remove_counter == 1
    assert cache[2] == 2
    cache[5] = 5
    assert set(cache.cache) == {2, 4, 5}
    assert cache._remove_counter == 2
    del cache[5]
    assert len(cache) == 2
    assert set(cache.cache) == {2, 4}
    assert cache._remove_counter == 3
    cache.pop(10)
    assert len(cache) == 2
    assert set(cache.cache) == {2, 4}
    assert cache._remove_counter == 3
    cache[6] = 6
    assert len(cache) == 3
    assert set(cache.cache) == {2, 4, 6}
    assert 2 in cache
    assert 4 in cache
    assert 6 in cache
--- a/tests/utils_/test_utils.py
+++ b/tests/utils_/test_utils.py
@@ -23,11 +23,8 @@ from vllm_test_utils.monitor import monitor
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.transformers_utils.detokenizer_utils import convert_ids_list_to_tokens
 # isort: off
 from vllm.utils import (
    CacheInfo,
    FlexibleArgumentParser,
    LRUCache,
    MemorySnapshot,
    PlaceholderModule,
    bind_kv_cache,
@@ -50,7 +47,6 @@ from vllm.utils import (
    unique_filepath,
 )
 # isort: on
 from ..utils import create_new_process_for_each_test, error_on_warning
@@ -557,128 +553,6 @@ def test_bind_kv_cache_pp():
        assert ctx["layers.0.self_attn"].kv_cache[1] is kv_cache[1][0]
 class TestLRUCache(LRUCache):
    def _on_remove(self, key, value):
        if not hasattr(self, "_remove_counter"):
            self._remove_counter = 0
        self._remove_counter += 1
 def test_lru_cache():
    cache = TestLRUCache(3)
    assert cache.stat() == CacheInfo(hits=0, total=0)
    assert cache.stat(delta=True) == CacheInfo(hits=0, total=0)
    cache.put(1, 1)
    assert len(cache) == 1
    cache.put(1, 1)
    assert len(cache) == 1
    cache.put(2, 2)
    assert len(cache) == 2
    cache.put(3, 3)
    assert len(cache) == 3
    assert set(cache.cache) == {1, 2, 3}
    cache.put(4, 4)
    assert len(cache) == 3
    assert set(cache.cache) == {2, 3, 4}
    assert cache._remove_counter == 1
    assert cache.get(2) == 2
    assert cache.stat() == CacheInfo(hits=1, total=1)
    assert cache.stat(delta=True) == CacheInfo(hits=1, total=1)
    assert cache[2] == 2
    assert cache.stat() == CacheInfo(hits=2, total=2)
    assert cache.stat(delta=True) == CacheInfo(hits=1, total=1)
    cache.put(5, 5)
    assert set(cache.cache) == {2, 4, 5}
    assert cache._remove_counter == 2
    assert cache.pop(5) == 5
    assert len(cache) == 2
    assert set(cache.cache) == {2, 4}
    assert cache._remove_counter == 3
    assert cache.get(-1) is None
    assert cache.stat() == CacheInfo(hits=2, total=3)
    assert cache.stat(delta=True) == CacheInfo(hits=0, total=1)
    cache.pop(10)
    assert len(cache) == 2
    assert set(cache.cache) == {2, 4}
    assert cache._remove_counter == 3
    cache.get(10)
    assert len(cache) == 2
    assert set(cache.cache) == {2, 4}
    assert cache._remove_counter == 3
    cache.put(6, 6)
    assert len(cache) == 3
    assert set(cache.cache) == {2, 4, 6}
    assert 2 in cache
    assert 4 in cache
    assert 6 in cache
    cache.remove_oldest()
    assert len(cache) == 2
    assert set(cache.cache) == {2, 6}
    assert cache._remove_counter == 4
    cache.clear()
    assert len(cache) == 0
    assert cache._remove_counter == 6
    assert cache.stat() == CacheInfo(hits=0, total=0)
    assert cache.stat(delta=True) == CacheInfo(hits=0, total=0)
    cache._remove_counter = 0
    cache[1] = 1
    assert len(cache) == 1
    cache[1] = 1
    assert len(cache) == 1
    cache[2] = 2
    assert len(cache) == 2
    cache[3] = 3
    assert len(cache) == 3
    assert set(cache.cache) == {1, 2, 3}
    cache[4] = 4
    assert len(cache) == 3
    assert set(cache.cache) == {2, 3, 4}
    assert cache._remove_counter == 1
    assert cache[2] == 2
    cache[5] = 5
    assert set(cache.cache) == {2, 4, 5}
    assert cache._remove_counter == 2
    del cache[5]
    assert len(cache) == 2
    assert set(cache.cache) == {2, 4}
    assert cache._remove_counter == 3
    cache.pop(10)
    assert len(cache) == 2
    assert set(cache.cache) == {2, 4}
    assert cache._remove_counter == 3
    cache[6] = 6
    assert len(cache) == 3
    assert set(cache.cache) == {2, 4, 6}
    assert 2 in cache
    assert 4 in cache
    assert 6 in cache
@pytest.mark.parametrize(
    ("src_dtype", "tgt_dtype", "expected_result"),
    [
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -32,7 +32,8 @@ from vllm.model_executor.models.interfaces import is_pooling_model
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.utils import PPMissingLayer, WeightsMapper
 from vllm.model_executor.utils import get_packed_modules_mapping
-from vllm.utils import LRUCache, is_pin_memory_available
+from vllm.utils import is_pin_memory_available
 from vllm.utils.cache import LRUCache
 logger = init_logger(__name__)
--- a/vllm/multimodal/cache.py
+++ b/vllm/multimodal/cache.py
@@ -17,7 +17,8 @@ from vllm.distributed.device_communicators.shm_object_storage import (
 )
 from vllm.envs import VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME
 from vllm.logger import init_logger
-from vllm.utils import GiB_bytes, LRUCache, MiB_bytes
+from vllm.utils import GiB_bytes, MiB_bytes
 from vllm.utils.cache import LRUCache
 from vllm.utils.jsontree import json_count_leaves, json_map_leaves, json_reduce_leaves
 from .inputs import (
--- a/vllm/utils/init.py
+++ b/vllm/utils/init.py
@@ -51,7 +51,6 @@ from collections.abc import (
    Hashable,
    Iterable,
    Iterator,
    KeysView,
    Mapping,
    Sequence,
 )
@@ -60,24 +59,19 @@ from concurrent.futures.process import ProcessPoolExecutor
 from dataclasses import dataclass, field
 from functools import cache, lru_cache, partial, wraps
 from pathlib import Path
 from types import MappingProxyType
 from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Generic,
    Literal,
    NamedTuple,
    TextIO,
    TypeVar,
    Union,
    cast,
    overload,
 )
 from urllib.parse import urlparse
 from uuid import uuid4
 import cachetools
 import cbor2
 import cloudpickle
 import numpy as np
@@ -183,13 +177,6 @@ U = TypeVar("U")
 _K = TypeVar("_K", bound=Hashable)
 _V = TypeVar("_V")
 _T = TypeVar("_T")
 class _Sentinel: ...
 ALL_PINNED_SENTINEL = _Sentinel()
 class Device(enum.Enum):
@@ -215,243 +202,6 @@ class Counter:
        self.counter = 0
 class _MappingOrderCacheView(UserDict[_K, _V]):
    def __init__(self, data: Mapping[_K, _V], ordered_keys: Mapping[_K, None]):
        super().__init__(data)
        self.ordered_keys = ordered_keys
    def __iter__(self) -> Iterator[_K]:
        return iter(self.ordered_keys)
    def keys(self) -> KeysView[_K]:
        return KeysView(self.ordered_keys)
 class CacheInfo(NamedTuple):
    hits: int
    total: int
    @property
    def hit_ratio(self) -> float:
        if self.total == 0:
            return 0
        return self.hits / self.total
    def __sub__(self, other: CacheInfo):
        return CacheInfo(
            hits=self.hits - other.hits,
            total=self.total - other.total,
        )
 class LRUCache(cachetools.LRUCache[_K, _V], Generic[_K, _V]):
    def __init__(self, capacity: float, getsizeof: Callable[[_V], float] | None = None):
        super().__init__(capacity, getsizeof)
        self.pinned_items = set[_K]()
        self._hits = 0
        self._total = 0
        self._last_info = CacheInfo(hits=0, total=0)
    def __getitem__(self, key: _K, *, update_info: bool = True) -> _V:
        value = super().__getitem__(key)
        if update_info:
            self._hits += 1
            self._total += 1
        return value
    def __delitem__(self, key: _K) -> None:
        run_on_remove = key in self
        value = self.__getitem__(key, update_info=False)  # type: ignore[call-arg]
        super().__delitem__(key)
        if key in self.pinned_items:
            # Todo: add warning to inform that del pinned item
            self._unpin(key)
        if run_on_remove:
            self._on_remove(key, value)
    @property
    def cache(self) -> Mapping[_K, _V]:
        """Return the internal cache dictionary in order (read-only)."""
        return _MappingOrderCacheView(
            self._Cache__data,  # type: ignore
            self.order,
        )
    @property
    def order(self) -> Mapping[_K, None]:
        """Return the internal order dictionary (read-only)."""
        return MappingProxyType(self._LRUCache__order)  # type: ignore
    @property
    def capacity(self) -> float:
        return self.maxsize
    @property
    def usage(self) -> float:
        if self.maxsize == 0:
            return 0
        return self.currsize / self.maxsize
    def stat(self, *, delta: bool = False) -> CacheInfo:
        """
        Gets the cumulative number of hits and queries against this cache.
        If `delta=True`, instead gets these statistics
        since the last call that also passed `delta=True`.
        """
        info = CacheInfo(hits=self._hits, total=self._total)
        if delta:
            info_delta = info - self._last_info
            self._last_info = info
            info = info_delta
        return info
    def touch(self, key: _K) -> None:
        try:
            self._LRUCache__order.move_to_end(key)  # type: ignore
        except KeyError:
            self._LRUCache__order[key] = None  # type: ignore
    @overload
    def get(self, key: _K, /) -> _V | None: ...
    @overload
    def get(self, key: _K, /, default: Union[_V, _T]) -> Union[_V, _T]: ...
    def get(
        self, key: _K, /, default: Union[_V, _T] | None = None
    ) -> Union[_V, _T] | None:
        value: Union[_V, _T] | None
        if key in self:
            value = self.__getitem__(key, update_info=False)  # type: ignore[call-arg]
            self._hits += 1
        else:
            value = default
        self._total += 1
        return value
    @overload
    def pop(self, key: _K) -> _V: ...
    @overload
    def pop(self, key: _K, default: Union[_V, _T]) -> Union[_V, _T]: ...
    def pop(
        self, key: _K, default: Union[_V, _T] | None = None
    ) -> Union[_V, _T] | None:
        value: Union[_V, _T] | None
        if key not in self:
            return default
        value = self.__getitem__(key, update_info=False)  # type: ignore[call-arg]
        self.__delitem__(key)
        return value
    def put(self, key: _K, value: _V) -> None:
        self.__setitem__(key, value)
    def pin(self, key: _K) -> None:
        """
        Pins a key in the cache preventing it from being
        evicted in the LRU order.
        """
        if key not in self:
            raise ValueError(f"Cannot pin key: {key} not in cache.")
        self.pinned_items.add(key)
    def _unpin(self, key: _K) -> None:
        """
        Unpins a key in the cache allowing it to be
        evicted in the LRU order.
        """
        self.pinned_items.remove(key)
    def _on_remove(self, key: _K, value: _V | None) -> None:
        pass
    def remove_oldest(self, *, remove_pinned: bool = False) -> None:
        if len(self) == 0:
            return
        self.popitem(remove_pinned=remove_pinned)
    def _remove_old_if_needed(self) -> None:
        while self.currsize > self.capacity:
            self.remove_oldest()
    def popitem(self, remove_pinned: bool = False):
        """Remove and return the `(key, value)` pair least recently used."""
        if not remove_pinned:
            # pop the oldest item in the cache that is not pinned
            lru_key = next(
                (key for key in self.order if key not in self.pinned_items),
                ALL_PINNED_SENTINEL,
            )
            if lru_key is ALL_PINNED_SENTINEL:
                raise RuntimeError(
                    "All items are pinned, cannot remove oldest from the cache."
                )
        else:
            lru_key = next(iter(self.order))
        value = self.pop(cast(_K, lru_key))
        return (lru_key, value)
    def clear(self) -> None:
        while len(self) > 0:
            self.remove_oldest(remove_pinned=True)
        self._hits = 0
        self._total = 0
        self._last_info = CacheInfo(hits=0, total=0)
 class PyObjectCache:
    """Used to cache python objects to avoid object allocations
    across scheduler iterations.
    """
    def __init__(self, obj_builder):
        self._obj_builder = obj_builder
        self._index = 0
        self._obj_cache = []
        for _ in range(128):
            self._obj_cache.append(self._obj_builder())
    def _grow_cache(self):
        # Double the size of the cache
        num_objs = len(self._obj_cache)
        for _ in range(num_objs):
            self._obj_cache.append(self._obj_builder())
    def get_object(self):
        """Returns a pre-allocated cached object. If there is not enough
        objects, then the cache size will double.
        """
        if self._index >= len(self._obj_cache):
            self._grow_cache()
            assert self._index < len(self._obj_cache)
        obj = self._obj_cache[self._index]
        self._index += 1
        return obj
    def reset(self):
        """Makes all cached-objects available for the next scheduler iteration."""
        self._index = 0
@cache
 def get_max_shared_memory_bytes(gpu: int = 0) -> int:
    """Returns the maximum shared memory per thread block in bytes."""
--- a/vllm/utils/cache.py
+++ b/vllm/utils/cache.py
@@ -0,0 +1,220 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from __future__ import annotations
 from collections import UserDict
 from collections.abc import Hashable, Iterator, KeysView, Mapping
 from types import MappingProxyType
 from typing import Callable, Generic, NamedTuple, TypeVar, Union, cast, overload
 import cachetools
 _K = TypeVar("_K", bound=Hashable)
 _V = TypeVar("_V")
 _T = TypeVar("_T")
 class _Sentinel: ...
 ALL_PINNED_SENTINEL = _Sentinel()
 class _MappingOrderCacheView(UserDict[_K, _V]):
    def __init__(self, data: Mapping[_K, _V], ordered_keys: Mapping[_K, None]):
        super().__init__(data)
        self.ordered_keys = ordered_keys
    def __iter__(self) -> Iterator[_K]:
        return iter(self.ordered_keys)
    def keys(self) -> KeysView[_K]:
        return KeysView(self.ordered_keys)
 class CacheInfo(NamedTuple):
    hits: int
    total: int
    @property
    def hit_ratio(self) -> float:
        if self.total == 0:
            return 0
        return self.hits / self.total
    def __sub__(self, other: CacheInfo):
        return CacheInfo(
            hits=self.hits - other.hits,
            total=self.total - other.total,
        )
 class LRUCache(cachetools.LRUCache[_K, _V], Generic[_K, _V]):
    def __init__(self, capacity: float, getsizeof: Callable[[_V], float] | None = None):
        super().__init__(capacity, getsizeof)
        self.pinned_items = set[_K]()
        self._hits = 0
        self._total = 0
        self._last_info = CacheInfo(hits=0, total=0)
    def __getitem__(self, key: _K, *, update_info: bool = True) -> _V:
        value = super().__getitem__(key)
        if update_info:
            self._hits += 1
            self._total += 1
        return value
    def __delitem__(self, key: _K) -> None:
        run_on_remove = key in self
        value = self.__getitem__(key, update_info=False)  # type: ignore[call-arg]
        super().__delitem__(key)
        if key in self.pinned_items:
            # Todo: add warning to inform that del pinned item
            self._unpin(key)
        if run_on_remove:
            self._on_remove(key, value)
    @property
    def cache(self) -> Mapping[_K, _V]:
        """Return the internal cache dictionary in order (read-only)."""
        return _MappingOrderCacheView(
            self._Cache__data,  # type: ignore
            self.order,
        )
    @property
    def order(self) -> Mapping[_K, None]:
        """Return the internal order dictionary (read-only)."""
        return MappingProxyType(self._LRUCache__order)  # type: ignore
    @property
    def capacity(self) -> float:
        return self.maxsize
    @property
    def usage(self) -> float:
        if self.maxsize == 0:
            return 0
        return self.currsize / self.maxsize
    def stat(self, *, delta: bool = False) -> CacheInfo:
        """
        Gets the cumulative number of hits and queries against this cache.
        If `delta=True`, instead gets these statistics
        since the last call that also passed `delta=True`.
        """
        info = CacheInfo(hits=self._hits, total=self._total)
        if delta:
            info_delta = info - self._last_info
            self._last_info = info
            info = info_delta
        return info
    def touch(self, key: _K) -> None:
        try:
            self._LRUCache__order.move_to_end(key)  # type: ignore
        except KeyError:
            self._LRUCache__order[key] = None  # type: ignore
    @overload
    def get(self, key: _K, /) -> _V | None: ...
    @overload
    def get(self, key: _K, /, default: Union[_V, _T]) -> Union[_V, _T]: ...
    def get(
        self, key: _K, /, default: Union[_V, _T] | None = None
    ) -> Union[_V, _T] | None:
        value: Union[_V, _T] | None
        if key in self:
            value = self.__getitem__(key, update_info=False)  # type: ignore[call-arg]
            self._hits += 1
        else:
            value = default
        self._total += 1
        return value
    @overload
    def pop(self, key: _K) -> _V: ...
    @overload
    def pop(self, key: _K, default: Union[_V, _T]) -> Union[_V, _T]: ...
    def pop(
        self, key: _K, default: Union[_V, _T] | None = None
    ) -> Union[_V, _T] | None:
        value: Union[_V, _T] | None
        if key not in self:
            return default
        value = self.__getitem__(key, update_info=False)  # type: ignore[call-arg]
        self.__delitem__(key)
        return value
    def put(self, key: _K, value: _V) -> None:
        self.__setitem__(key, value)
    def pin(self, key: _K) -> None:
        """
        Pins a key in the cache preventing it from being
        evicted in the LRU order.
        """
        if key not in self:
            raise ValueError(f"Cannot pin key: {key} not in cache.")
        self.pinned_items.add(key)
    def _unpin(self, key: _K) -> None:
        """
        Unpins a key in the cache allowing it to be
        evicted in the LRU order.
        """
        self.pinned_items.remove(key)
    def _on_remove(self, key: _K, value: _V | None) -> None:
        pass
    def remove_oldest(self, *, remove_pinned: bool = False) -> None:
        if len(self) == 0:
            return
        self.popitem(remove_pinned=remove_pinned)
    def _remove_old_if_needed(self) -> None:
        while self.currsize > self.capacity:
            self.remove_oldest()
    def popitem(self, remove_pinned: bool = False):
        """Remove and return the `(key, value)` pair least recently used."""
        if not remove_pinned:
            # pop the oldest item in the cache that is not pinned
            lru_key = next(
                (key for key in self.order if key not in self.pinned_items),
                ALL_PINNED_SENTINEL,
            )
            if lru_key is ALL_PINNED_SENTINEL:
                raise RuntimeError(
                    "All items are pinned, cannot remove oldest from the cache."
                )
        else:
            lru_key = next(iter(self.order))
        value = self.pop(cast(_K, lru_key))
        return (lru_key, value)
    def clear(self) -> None:
        while len(self) > 0:
            self.remove_oldest(remove_pinned=True)
        self._hits = 0
        self._total = 0
        self._last_info = CacheInfo(hits=0, total=0)