Correct position of docstring of class attributes (#31209)

Signed-off-by: Weida Hong <wdhongtw@google.com>
This commit is contained in:
Weida Hong
2025-12-23 18:08:58 +08:00
committed by GitHub
parent f32cfd7d97
commit 73cfb7a722
2 changed files with 13 additions and 10 deletions

View File

@@ -186,6 +186,7 @@ class DPMetadata:
 class ForwardContext:
     # copy from vllm_config.compilation_config.static_forward_context
     no_compile_layers: dict[str, Any]
+    attn_metadata: dict[str, AttentionMetadata] | list[dict[str, AttentionMetadata]]
     """
     Type Dict[str, AttentionMetadata] for v1, map from layer_name of each
     attention layer to its attention metadata
@@ -193,7 +194,6 @@ class ForwardContext:
     for each microbatch.
     Set dynamically for each forward pass
     """
-    attn_metadata: dict[str, AttentionMetadata] | list[dict[str, AttentionMetadata]]
     # TODO: remove after making all virtual_engines share the same kv cache
     virtual_engine: int  # set dynamically for each forward pass
     # set dynamically for each forward pass

View File

@@ -80,8 +80,6 @@ class AttentionSpec(KVCacheSpec):
 @dataclass(frozen=True)
 class FullAttentionSpec(AttentionSpec):
-    sliding_window: int | None = None
-    attention_chunk_size: int | None = None
     """
     When hybrid allocator is disabled and the model contains both full
     attention layers and sliding window attention layers, sliding
@@ -89,8 +87,13 @@ class FullAttentionSpec(AttentionSpec):
     (blocks are allocated for all tokens), while computed as sliding window
     attention in model runner.
     In this case, we use FullAttentionSpec and record the sliding window size.
+    """
+    sliding_window: int | None = None
+    """
     Default to None for not using sliding window attention.
     """
+    attention_chunk_size: int | None = None

     def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
         max_model_len = vllm_config.model_config.max_model_len
@@ -390,10 +393,11 @@ class KVCacheConfig:
     The KV cache configuration of a model.
     """

-    """The number of KV cache blocks"""
     num_blocks: int
-    """How should model runner initialize the KV cache tensors for each layer"""
+    """The number of KV cache blocks"""
     kv_cache_tensors: list[KVCacheTensor]
+    """How should model runner initialize the KV cache tensors for each layer"""
+    kv_cache_groups: list[KVCacheGroupSpec]
     """
     The kv cache groups of the model.
     For models with only one type of attention, there is only one group that
@@ -401,4 +405,3 @@ class KVCacheConfig:
     For models with multiple types of attention, there will be multiple groups,
     see `_get_kv_cache_config_uniform_page_size` for more details.
     """
-    kv_cache_groups: list[KVCacheGroupSpec]