Correct position of docstring of class attributes (#31209)
Signed-off-by: Weida Hong <wdhongtw@google.com>
This commit is contained in:
@@ -186,6 +186,7 @@ class DPMetadata:
|
|||||||
class ForwardContext:
|
class ForwardContext:
|
||||||
# copy from vllm_config.compilation_config.static_forward_context
|
# copy from vllm_config.compilation_config.static_forward_context
|
||||||
no_compile_layers: dict[str, Any]
|
no_compile_layers: dict[str, Any]
|
||||||
|
attn_metadata: dict[str, AttentionMetadata] | list[dict[str, AttentionMetadata]]
|
||||||
"""
|
"""
|
||||||
Type Dict[str, AttentionMetadata] for v1, map from layer_name of each
|
Type Dict[str, AttentionMetadata] for v1, map from layer_name of each
|
||||||
attention layer to its attention metadata
|
attention layer to its attention metadata
|
||||||
@@ -193,7 +194,6 @@ class ForwardContext:
|
|||||||
for each microbatch.
|
for each microbatch.
|
||||||
Set dynamically for each forward pass
|
Set dynamically for each forward pass
|
||||||
"""
|
"""
|
||||||
attn_metadata: dict[str, AttentionMetadata] | list[dict[str, AttentionMetadata]]
|
|
||||||
# TODO: remove after making all virtual_engines share the same kv cache
|
# TODO: remove after making all virtual_engines share the same kv cache
|
||||||
virtual_engine: int # set dynamically for each forward pass
|
virtual_engine: int # set dynamically for each forward pass
|
||||||
# set dynamically for each forward pass
|
# set dynamically for each forward pass
|
||||||
|
|||||||
@@ -80,17 +80,20 @@ class AttentionSpec(KVCacheSpec):
|
|||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class FullAttentionSpec(AttentionSpec):
|
class FullAttentionSpec(AttentionSpec):
|
||||||
sliding_window: int | None = None
|
|
||||||
attention_chunk_size: int | None = None
|
|
||||||
"""
|
"""
|
||||||
When hybrid allocator is disabled and the model contains both full
|
When hybrid allocator is disabled and the model contains both full
|
||||||
attention layers and sliding window attention layers, sliding
|
attention layers and sliding window attention layers, sliding
|
||||||
window attention are regarded as full attention in KV cache manager
|
window attention are regarded as full attention in KV cache manager
|
||||||
(blocks are allocated for all tokens), while computed as sliding window
|
(blocks are allocated for all tokens), while computed as sliding window
|
||||||
attention in model runner.
|
attention in model runner.
|
||||||
In this case, we use FullAttentionSpec and record the sliding window size.
|
In this case, we use FullAttentionSpec and record the sliding window size.
|
||||||
|
"""
|
||||||
|
|
||||||
|
sliding_window: int | None = None
|
||||||
|
"""
|
||||||
Default to None for not using sliding window attention.
|
Default to None for not using sliding window attention.
|
||||||
"""
|
"""
|
||||||
|
attention_chunk_size: int | None = None
|
||||||
|
|
||||||
def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
|
def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
|
||||||
max_model_len = vllm_config.model_config.max_model_len
|
max_model_len = vllm_config.model_config.max_model_len
|
||||||
@@ -390,10 +393,11 @@ class KVCacheConfig:
|
|||||||
The KV cache configuration of a model.
|
The KV cache configuration of a model.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
"""The number of KV cache blocks"""
|
|
||||||
num_blocks: int
|
num_blocks: int
|
||||||
"""How should model runner initialize the KV cache tensors for each layer"""
|
"""The number of KV cache blocks"""
|
||||||
kv_cache_tensors: list[KVCacheTensor]
|
kv_cache_tensors: list[KVCacheTensor]
|
||||||
|
"""How should model runner initialize the KV cache tensors for each layer"""
|
||||||
|
kv_cache_groups: list[KVCacheGroupSpec]
|
||||||
"""
|
"""
|
||||||
The kv cache groups of the model.
|
The kv cache groups of the model.
|
||||||
For models with only one type of attention, there is only one group that
|
For models with only one type of attention, there is only one group that
|
||||||
@@ -401,4 +405,3 @@ class KVCacheConfig:
|
|||||||
For models with multiple types of attention, there will be multiple groups,
|
For models with multiple types of attention, there will be multiple groups,
|
||||||
see `_get_kv_cache_config_uniform_page_size` for more details.
|
see `_get_kv_cache_config_uniform_page_size` for more details.
|
||||||
"""
|
"""
|
||||||
kv_cache_groups: list[KVCacheGroupSpec]
|
|
||||||
|
|||||||
Reference in New Issue
Block a user