274 lines
13 KiB
Python
274 lines
13 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
|
|
from dataclasses import field
|
|
from typing import ClassVar, Literal
|
|
|
|
from pydantic import Field, SkipValidation, field_validator, model_validator
|
|
|
|
from vllm.config.utils import config
|
|
from vllm.logger import init_logger
|
|
from vllm.utils.torch_utils import is_quantized_kv_cache
|
|
|
|
# Module-level logger; the CacheConfig field validators in this file use it
# to emit the deprecation warning and the fp8 info message.
logger = init_logger(__name__)

# Accepted values for `CacheConfig.cache_dtype` (KV cache storage data type).
CacheDType = Literal[
    "auto",
    "float16",
    "bfloat16",
    "fp8",
    "fp8_e4m3",
    "fp8_e5m2",
    "fp8_inc",
    "fp8_ds_mla",
]
# Accepted values for `CacheConfig.mamba_cache_dtype` and
# `CacheConfig.mamba_ssm_cache_dtype`.
MambaDType = Literal["auto", "float32", "float16"]
# Accepted values for `CacheConfig.mamba_cache_mode`.
MambaCacheMode = Literal["all", "align", "none"]
# Accepted values for `CacheConfig.prefix_caching_hash_algo`.
PrefixCachingHashAlgo = Literal["sha256", "sha256_cbor", "xxhash", "xxhash_cbor"]
# Accepted values for `CacheConfig.kv_offloading_backend`.
KVOffloadingBackend = Literal["native", "lmcache"]
|
|
|
|
|
|
@config
class CacheConfig:
    """Configuration for the KV cache."""

    DEFAULT_BLOCK_SIZE: ClassVar[int] = 16

    block_size: SkipValidation[int] = None  # type: ignore[assignment]
    """Size of a contiguous cache block in number of tokens.
    Accepts None (meaning "use default"). After construction, always int."""
    user_specified_block_size: bool = field(default=False, init=False)
    """Whether block_size was explicitly provided. Derived automatically."""
    user_specified_mamba_block_size: bool = field(default=False, init=False)
    """Whether mamba_block_size was explicitly provided. Derived automatically."""
    gpu_memory_utilization: float = Field(default=0.9, gt=0, le=1)
    """The fraction of GPU memory to be used for the model executor, which can
    range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory
    utilization. If unspecified, will use the default value of 0.9. This is a
    per-instance limit, and only applies to the current vLLM instance. It does
    not matter if you have another vLLM instance running on the same GPU. For
    example, if you have two vLLM instances running on the same GPU, you can
    set the GPU memory utilization to 0.5 for each instance."""
    cache_dtype: CacheDType = "auto"
    """Data type for kv cache storage. If "auto", will use model data type.
    CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports
    fp8 (=fp8_e4m3). Intel Gaudi (HPU) supports fp8 (using fp8_inc).
    Some models (namely DeepSeekV3.2) default to fp8, set to bfloat16 to use
    bfloat16 instead, this is an invalid option for models that do not default
    to fp8.
    """
    is_attention_free: bool = False
    """Whether the model is attention-free. This is primarily set in
    `ModelConfig` and that value should be manually duplicated here."""
    num_gpu_blocks_override: int | None = None
    """Number of GPU blocks to use. This overrides the profiled `num_gpu_blocks`
    if specified. Does nothing if `None`. Used for testing preemption."""
    sliding_window: int | None = None
    """Sliding window size for the KV cache. This is primarily set in
    `ModelConfig` and that value should be manually duplicated here."""
    enable_prefix_caching: bool = True
    """Whether to enable prefix caching."""
    prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256"
    """Set the hash algorithm for prefix caching:

    - "sha256" uses Pickle for object serialization before hashing. This is the current
      default, as SHA256 is the most secure choice to avoid potential hash collisions.
    - "sha256_cbor" provides a reproducible, cross-language compatible hash. It
      serializes objects using canonical CBOR and hashes them with SHA-256.
    - "xxhash" uses Pickle serialization with xxHash (128-bit) for faster,
      non-cryptographic hashing. Requires the optional ``xxhash`` package.
      IMPORTANT: Use of a hashing algorithm that is not considered cryptographically
      secure theoretically increases the risk of hash collisions, which can cause
      undefined behavior or even leak private information in multi-tenant environments.
      Even if collisions are still very unlikely, it is important to consider your
      security risk tolerance against the performance benefits before turning this on.
    - "xxhash_cbor" combines canonical CBOR serialization with xxHash for
      reproducible hashing. Requires the optional ``xxhash`` package."""
    calculate_kv_scales: bool = False
    """Deprecated: This option is deprecated and will be removed in v0.19.
    It enables dynamic calculation of `k_scale` and `v_scale` when
    kv_cache_dtype is fp8. If `False`, the scales will be loaded from the model
    checkpoint if available. Otherwise, the scales will default to 1.0."""
    kv_cache_dtype_skip_layers: list[str] = field(default_factory=list)
    """Layer patterns to skip KV cache quantization. Accepts layer indices
    (e.g., '0', '2', '4') or attention type names (e.g., 'sliding_window')."""
    cpu_kvcache_space_bytes: int | None = None
    """(CPU backend only) CPU key-value cache space."""
    mamba_page_size_padded: int | None = None
    """ Optional override for mamba page size; used by hybrid mamba/attention
    models to ensure exact alignment with attention page size."""
    mamba_block_size: int | None = Field(default=None, gt=0)
    """Size of a contiguous cache block in number of tokens for mamba cache.
    Can be set only when prefix caching is enabled.
    Value must be a multiple of 8 to align with causal_conv1d kernel."""
    mamba_cache_dtype: MambaDType = "auto"
    """The data type to use for the Mamba cache (both the conv as well as the
    ssm state). If set to 'auto', the data type will be inferred from the model
    config."""
    mamba_ssm_cache_dtype: MambaDType = "auto"
    """The data type to use for the Mamba cache (ssm state only, conv state will
    still be controlled by mamba_cache_dtype). If set to 'auto', the data type
    for the ssm state will be determined by mamba_cache_dtype."""
    mamba_cache_mode: MambaCacheMode = "none"
    """The cache strategy for Mamba layers.
    - "none": set when prefix caching is disabled.
    - "all": cache the mamba state of all tokens at position i * block_size. This is
      the default behavior (for models that support it) when prefix caching is
      enabled.
    - "align": only cache the mamba state of the last token of each scheduler step and
      when the token is at position i * block_size.
    """
    enable_mamba_cache_stochastic_rounding: bool = False
    """Enable stochastic rounding when writing SSM state to fp16 cache.
    Uses random bits to unbias the rounding error, which can improve
    numerical stability for long sequences."""
    mamba_cache_philox_rounds: int = 0
    """Number of Philox PRNG rounds for stochastic rounding random number
    generation. 0 uses the Triton default. Higher values improve randomness
    quality at the cost of compute."""

    # Populated once profiling has run; not accepted by the constructor.
    num_gpu_blocks: int | None = field(default=None, init=False)
    """The number of blocks to allocate for GPU memory."""
    num_cpu_blocks: int | None = field(default=None, init=False)
    """The number of blocks to allocate for CPU memory."""

    kv_sharing_fast_prefill: bool = False
    """This feature is work in progress and no prefill optimization takes place
    with this flag enabled currently.

    In some KV sharing setups, e.g. YOCO (https://arxiv.org/abs/2405.05254),
    some layers can skip tokens corresponding to prefill. This flag enables
    attention metadata for eligible layers to be overridden with metadata
    necessary for implementing this optimization in some models (e.g. Gemma3n)
    """

    kv_cache_memory_bytes: int | None = None
    """Size of KV Cache per GPU in bytes. By default, this is set to None
    and vllm can automatically infer the kv cache size based on
    gpu_memory_utilization. However, users may want to manually specify
    the kv cache memory size. kv_cache_memory_bytes allows more fine-grain
    control of how much memory gets used when compared with using
    gpu_memory_utilization. Note that kv_cache_memory_bytes
    (when not-None) ignores gpu_memory_utilization"""

    kv_offloading_size: float | None = None
    """Size of the KV cache offloading buffer in GiB. When TP > 1, this is
    the total buffer size summed across all TP ranks. By default, this is set
    to None, which means no KV offloading is enabled. When set, vLLM will
    enable KV cache offloading to CPU using the kv_offloading_backend."""

    kv_offloading_backend: KVOffloadingBackend = "native"
    """The backend to use for KV cache offloading. Supported backends include
    'native' (vLLM native CPU offloading), 'lmcache'.
    KV offloading is only activated when kv_offloading_size is set."""

    def compute_hash(self) -> str:
        """
        Return a hash of the cache settings relevant to the computation graph.

        The hash is meant to uniquely identify every config value that
        affects the structure of the computation graph from input
        ids/embeddings to the final hidden states; anything before the input
        ids/embeddings or after the final hidden states is out of scope.

        WARNING: whenever a new field is added to this config, decide whether
        it affects the computation graph. The names in the exclusion set
        below are passed to `get_hash_factors` as factors to ignore
        (presumably every other field contributes to the hash -- confirm
        against `vllm.config.utils`), so a field that must NOT influence the
        hash has to be listed there.
        """
        from vllm.config.utils import get_hash_factors, hash_factors

        excluded = {
            # Runtime/derived knobs that don't affect compiled graph shape
            "gpu_memory_utilization",
            "is_attention_free",
            "num_gpu_blocks_override",
            "enable_prefix_caching",
            "prefix_caching_hash_algo",
            "cpu_kvcache_space_bytes",
            "mamba_page_size_padded",
            "user_specified_block_size",
            "user_specified_mamba_block_size",
            "_block_size_resolved",
            # Post-init/derived counters
            "num_gpu_blocks",
            "num_cpu_blocks",
            # WIP feature toggle not impacting compiled graph shape
            "kv_sharing_fast_prefill",
        }
        return hash_factors(get_hash_factors(self, excluded))

    def metrics_info(self):
        """Return the instance attributes as a ``{name: str(value)}`` dict.

        Values are stringified so the mapping can be used for Prometheus
        metrics info.
        """
        info = {}
        for name, value in self.__dict__.items():
            info[name] = str(value)
        return info

    _block_size_resolved: bool = field(default=False, init=False)
    """Guard against pydantic re-running _apply_block_size_default."""

    @model_validator(mode="after")
    def _apply_block_size_default(self) -> "CacheConfig":
        """Resolve `block_size` and record which block sizes were user-set.

        On the first run, a `block_size` of None is replaced by
        `DEFAULT_BLOCK_SIZE`; otherwise `user_specified_block_size` is set.
        `user_specified_mamba_block_size` is set when `mamba_block_size` is
        not None. Later runs return immediately.
        """
        # Pydantic re-runs validators when CacheConfig is nested inside
        # another pydantic model (e.g. VllmConfig), so only the first pass
        # is allowed to touch the fields.
        if not self._block_size_resolved:
            assign = object.__setattr__
            assign(self, "_block_size_resolved", True)
            if self.block_size is not None:
                assign(self, "user_specified_block_size", True)
            else:
                assign(self, "block_size", self.DEFAULT_BLOCK_SIZE)
            if self.mamba_block_size is not None:
                assign(self, "user_specified_mamba_block_size", True)
        return self

    @field_validator("calculate_kv_scales", mode="after")
    @classmethod
    def _warn_deprecated_calculate_kv_scales(cls, calculate_kv_scales: bool) -> bool:
        """Log a deprecation warning when the flag is truthy.

        The value itself is returned unchanged.
        """
        if not calculate_kv_scales:
            return calculate_kv_scales
        deprecation_notice = (
            "The `--calculate-kv-scales` option is deprecated and will "
            "be removed in v0.19. The scales will be loaded from the "
            "model checkpoint if available, otherwise they default to "
            "1.0."
        )
        logger.warning(deprecation_notice)
        return calculate_kv_scales

    @field_validator("cache_dtype", mode="after")
    @classmethod
    def _validate_cache_dtype(cls, cache_dtype: CacheDType) -> CacheDType:
        """Log an info message when `is_quantized_kv_cache` reports True.

        The dtype is returned unchanged in every case.
        """
        if not is_quantized_kv_cache(cache_dtype):
            return cache_dtype
        fp8_notice = (
            "Using fp8 data type to store kv cache. It reduces the GPU "
            "memory footprint and boosts the performance. "
            "Meanwhile, it may cause accuracy drop without a proper "
            "scaling factor."
        )
        logger.info(fp8_notice)
        return cache_dtype

    def __post_init__(self):
        """Validate the Mamba-cache stochastic rounding prerequisites.

        Nothing is checked unless `enable_mamba_cache_stochastic_rounding`
        is set.

        Raises:
            ValueError: if the current platform is not CUDA, if it is not in
                the device capability family 100, or if
                `mamba_ssm_cache_dtype` is not "float16".
        """
        if not self.enable_mamba_cache_stochastic_rounding:
            return

        # Only imported once the feature is actually requested.
        from vllm.platforms import current_platform

        if not current_platform.is_cuda():
            raise ValueError(
                "Stochastic rounding for Mamba cache is only supported "
                "on NVIDIA CUDA platforms. Please do not specify "
                "`--enable-mamba-cache-stochastic-rounding`."
            )

        if not current_platform.is_device_capability_family(100):
            raise ValueError(
                "Stochastic rounding for Mamba cache requires compute "
                "capability 10.0 (data center Blackwell). The `cvt.rs` PTX "
                "instruction is not supported on your GPU. Please do not specify "
                "`--enable-mamba-cache-stochastic-rounding`."
            )

        if self.mamba_ssm_cache_dtype != "float16":
            raise ValueError(
                "Stochastic rounding for Mamba cache requires "
                "the SSM cache to be float16. Please set it explicitly, "
                "by specifying `--mamba-ssm-cache-dtype float16`, or disable "
                "stochastic rounding by not specifying "
                "`--enable-mamba-cache-stochastic-rounding`."
            )
|