[Bugfix] Fix Basic Models Test (#34818)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
This commit is contained in:
Matthew Bonanni
2026-02-19 17:49:07 -05:00
committed by GitHub
parent 4fb8beefaa
commit 662205d34e
14 changed files with 175 additions and 221 deletions

View File

@@ -13,6 +13,7 @@ import torch.nn as nn
from PIL import Image
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
from vllm.config.cache import CacheConfig
from vllm.config.multimodal import (
AudioDummyOptions,
BaseDummyOptions,
@@ -131,7 +132,9 @@ def initialize_dummy_model(
):
temp_file = tempfile.mkstemp()[1]
current_device = torch.get_default_device()
vllm_config = VllmConfig(model_config=model_config)
vllm_config = VllmConfig(
model_config=model_config, cache_config=CacheConfig(block_size=16)
)
with set_current_vllm_config(vllm_config=vllm_config):
init_distributed_environment(
world_size=1,

View File

@@ -457,6 +457,9 @@ def dummy_hf_overrides(
# Kimi uses `num_expert_group` instead of `n_group`.
if n_group is None:
n_group = getattr(text_config, "num_expert_group", None)
# InternS1Pro uses `router_n_groups` instead of `n_group`.
if n_group is None:
n_group = getattr(text_config, "router_n_groups", None)
num_experts = n_group * 2 if n_group is not None else 2
# we use three layers for Gemma-3n to check
@@ -486,12 +489,14 @@ def dummy_hf_overrides(
# Only set MoE related config when the model has MoE layers.
# Otherwise all models detected as MoE by _get_transformers_backend_cls.
if model_arch_config.num_experts > 0:
orig_topk = getattr(text_config, "num_experts_per_tok", 2)
topk = min(orig_topk, 2)
update_dict.update(
{
"num_experts": num_experts,
"num_experts_per_tok": 2,
"num_experts_per_tok": topk,
# Kimi uses `num_experts_per_token`.
"num_experts_per_token": 2,
"num_experts_per_token": topk,
"num_local_experts": num_experts,
# Otherwise there will not be any expert layers
"first_k_dense_replace": 0,

View File

@@ -78,7 +78,7 @@ def _create_proposer(
device = current_platform.device_type
vllm_config = VllmConfig(
model_config=model_config,
cache_config=CacheConfig(),
cache_config=CacheConfig(block_size=16),
speculative_config=speculative_config,
device_config=DeviceConfig(device=device),
parallel_config=ParallelConfig(),

View File

@@ -41,8 +41,8 @@ class CacheConfig:
block_size: SkipValidation[int] = None # type: ignore[assignment]
"""Size of a contiguous cache block in number of tokens.
This is None until `Platform.check_and_update_config()` sets it based on
the current platform. Always an int by the time the engine starts."""
This is None until the platform sets it. Always an int by the time
the engine starts."""
gpu_memory_utilization: float = Field(default=0.9, gt=0, le=1)
"""The fraction of GPU memory to be used for the model executor, which can
range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory

View File

@@ -915,32 +915,6 @@ class VllmConfig:
)
current_platform.check_and_update_config(self)
# If DCP, ensure the block size is right.
if self.parallel_config.decode_context_parallel_size > 1:
if self.parallel_config.dcp_kv_cache_interleave_size > 1 and (
self.parallel_config.cp_kv_cache_interleave_size
!= self.parallel_config.dcp_kv_cache_interleave_size
):
self.parallel_config.cp_kv_cache_interleave_size = (
self.parallel_config.dcp_kv_cache_interleave_size
)
logger.warning_once(
"cp_kv_cache_interleave_size is overridden by dcp_kv_cache"
"_interleave_size. And dcp-kv-cache-interleave-size will be "
"deprecated when PCP is fully supported."
)
assert (
self.parallel_config.cp_kv_cache_interleave_size
<= self.cache_config.block_size
and self.cache_config.block_size
% self.parallel_config.cp_kv_cache_interleave_size
== 0
), (
f"Block_size({self.cache_config.block_size}) should be greater "
"than or equal to and divisible by cp_kv_cache_interleave_size "
f"({self.parallel_config.cp_kv_cache_interleave_size})."
)
# Do this after all the updates to compilation_config.mode
effective_dp_size = (
self.parallel_config.data_parallel_size
@@ -1108,26 +1082,6 @@ class VllmConfig:
# Default to enable HMA if not explicitly disabled by user or logic above.
self.scheduler_config.disable_hybrid_kv_cache_manager = False
if self.cache_config.mamba_cache_mode == "align":
assert (
self.cache_config.block_size
<= self.scheduler_config.max_num_batched_tokens
), (
"In Mamba cache align mode, block_size "
f"({self.cache_config.block_size}) must be <= "
"max_num_batched_tokens "
f"({self.scheduler_config.max_num_batched_tokens})."
)
if self.scheduler_config.long_prefill_token_threshold > 0:
assert (
self.scheduler_config.long_prefill_token_threshold
>= self.cache_config.block_size
)
assert not self.scheduler_config.disable_chunked_mm_input, (
"Chunked MM input is required because we need the flexibility to "
"schedule a multiple of block_size tokens even if they are in the "
"middle of a mm input"
)
if self.compilation_config.debug_dump_path:
self.compilation_config.debug_dump_path = (
self.compilation_config.debug_dump_path.absolute().expanduser()
@@ -1488,6 +1442,57 @@ class VllmConfig:
f"compilation_config={self.compilation_config!r}"
)
def validate_block_size(self) -> None:
    """Validate block_size against DCP and mamba constraints.

    Called after Platform.update_block_size_for_backend() has
    finalised block_size, so that the checks see the real value
    rather than the initial None sentinel.
    """
    block_size = self.cache_config.block_size
    # block_size starts as a None sentinel and is only filled in later by
    # the platform; this method must run after that has happened.
    assert block_size is not None, (
        "validate_block_size called before block_size was set"
    )
    # DCP interleave-size compatibility
    if self.parallel_config.decode_context_parallel_size > 1:
        # When both interleave sizes are set and disagree, the DCP value
        # wins (the CP value is overwritten and a deprecation-style
        # warning is emitted once).
        if self.parallel_config.dcp_kv_cache_interleave_size > 1 and (
            self.parallel_config.cp_kv_cache_interleave_size
            != self.parallel_config.dcp_kv_cache_interleave_size
        ):
            self.parallel_config.cp_kv_cache_interleave_size = (
                self.parallel_config.dcp_kv_cache_interleave_size
            )
            logger.warning_once(
                "cp_kv_cache_interleave_size is overridden by dcp_kv_cache"
                "_interleave_size. And dcp-kv-cache-interleave-size will be "
                "deprecated when PCP is fully supported."
            )
        # A KV-cache block must hold a whole number of interleave chunks:
        # interleave size <= block size and block size divisible by it.
        assert (
            self.parallel_config.cp_kv_cache_interleave_size <= block_size
            and block_size % self.parallel_config.cp_kv_cache_interleave_size == 0
        ), (
            f"Block_size({block_size}) should be greater "
            "than or equal to and divisible by cp_kv_cache_interleave_size "
            f"({self.parallel_config.cp_kv_cache_interleave_size})."
        )
    # Mamba cache align-mode constraints
    if self.cache_config.mamba_cache_mode == "align":
        # Align mode schedules in multiples of block_size, so a block may
        # not exceed the per-step token budget.
        assert block_size <= self.scheduler_config.max_num_batched_tokens, (
            "In Mamba cache align mode, block_size "
            f"({block_size}) must be <= "
            "max_num_batched_tokens "
            f"({self.scheduler_config.max_num_batched_tokens})."
        )
        if self.scheduler_config.long_prefill_token_threshold > 0:
            # NOTE(review): assumes the long-prefill chunk must cover at
            # least one full block in align mode — confirm intent.
            assert self.scheduler_config.long_prefill_token_threshold >= block_size
        assert not self.scheduler_config.disable_chunked_mm_input, (
            "Chunked MM input is required because we need the flexibility "
            "to schedule a multiple of block_size tokens even if they are "
            "in the middle of a mm input"
        )
@model_validator(mode="after")
def validate_mamba_block_size(self) -> "VllmConfig":
if self.model_config is None:

View File

@@ -30,9 +30,8 @@ from vllm.v1.kv_cache_interface import (
def create_chunked_local_attention_backend(
underlying_attn_backend: AttentionBackend,
attention_chunk_size: int,
block_size: int,
) -> type[AttentionBackend]:
prefix = f"ChunkedLocalAttention_{attention_chunk_size}_{block_size}_"
prefix = f"ChunkedLocalAttention_{attention_chunk_size}_"
underlying_builder = underlying_attn_backend.get_builder_cls()
assert issubclass(underlying_builder, AttentionMetadataBuilder)
@@ -55,7 +54,9 @@ def create_chunked_local_attention_backend(
fast_build: bool = False,
):
cm, make_virtual_batches_block_table = make_local_attention_virtual_batches(
attention_chunk_size, common_attn_metadata, block_size
attention_chunk_size,
common_attn_metadata,
self.kv_cache_spec.block_size,
)
metadata = super().build(common_prefix_len, cm, fast_build)
metadata.make_virtual_batches_block_table = make_virtual_batches_block_table
@@ -97,13 +98,13 @@ class ChunkedLocalAttention(Attention):
block_size = cache_config.block_size
else:
kv_cache_dtype = "auto"
block_size = 16
block_size = None
underlying_attn_backend = get_attn_backend(
head_size, dtype, kv_cache_dtype, block_size
)
attn_backend = create_chunked_local_attention_backend(
underlying_attn_backend, attention_chunk_size, block_size
underlying_attn_backend, attention_chunk_size
)
super().__init__(

View File

@@ -407,17 +407,24 @@ class MLAAttention(nn.Module, AttentionLayerBase):
)
# Attributes for forward_impl method
self.chunked_prefill_workspace_size = (
MLACommonMetadataBuilder.determine_chunked_prefill_workspace_size(
get_current_vllm_config()
)
)
self._vllm_config = get_current_vllm_config()
self._chunked_prefill_workspace_size: int | None = None
self._decode_concat_quant_fp8_op = _DecodeConcatQuantFP8(
static=True,
group_shape=GroupShape.PER_TENSOR,
compile_native=True,
)
@property
def chunked_prefill_workspace_size(self) -> int:
    """Chunked-prefill workspace size, computed lazily and cached.

    The first access asks MLACommonMetadataBuilder for the size using the
    config captured at construction time; later accesses return the
    cached value (the cache slot is `None` until first use).
    """
    cached = self._chunked_prefill_workspace_size
    if cached is None:
        cached = (
            MLACommonMetadataBuilder.determine_chunked_prefill_workspace_size(
                self._vllm_config
            )
        )
        self._chunked_prefill_workspace_size = cached
    return cached
def forward(
self,
q: torch.Tensor,

View File

@@ -169,21 +169,6 @@ class CudaPlatformBase(Platform):
if parallel_config.worker_cls == "auto":
parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
cache_config = vllm_config.cache_config
user_specified_block_size = cache_config.block_size is not None
if not user_specified_block_size:
cache_config.block_size = 16
# Ensure block_size is compatible with the attention backend.
# Note: model_config may be None during testing.
# Skip hybrid (attention+mamba) models — their block_size is
# managed by HybridAttentionMambaModelConfig
if model_config is not None and not model_config.is_hybrid:
cls._update_block_size_for_backend(
vllm_config,
user_specified_block_size,
)
scheduler_config = vllm_config.scheduler_config
# Note: model_config may be None during testing
if (
@@ -199,148 +184,47 @@ class CudaPlatformBase(Platform):
scheduler_config.disable_chunked_mm_input = True
@classmethod
def _update_block_size_for_backend(
cls,
vllm_config: "VllmConfig",
user_specified_block_size: bool,
) -> None:
"""Ensure block_size is compatible with the attention backend.
If the user specified --block-size, the selector validates/filters
backends by that block size (raising on incompatibility). Otherwise,
the backend is selected unconstrained and block_size is set to the
backend's preferred value.
"""
from vllm.config.vllm import set_current_vllm_config
from vllm.v1.attention.selector import AttentionSelectorConfig
def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
cache_config = vllm_config.cache_config
if cache_config.block_size is not None:
# User specified --block-size; keep it.
return
model_config = vllm_config.model_config
cache_config = vllm_config.cache_config
device_capability = cls.get_device_capability()
if device_capability is None:
# model_config may be None during testing.
# Skip hybrid models — their block_size is managed by
# HybridAttentionMambaModelConfig.
if model_config is None or model_config.is_hybrid:
cache_config.block_size = 16
return
use_mla = model_config.use_mla
attn_selector_config = AttentionSelectorConfig(
head_size=model_config.get_head_size(),
dtype=model_config.dtype, # type: ignore[arg-type]
kv_cache_dtype=cache_config.cache_dtype,
block_size=cache_config.block_size if user_specified_block_size else None,
use_mla=use_mla,
has_sink=False,
use_sparse=use_mla and hasattr(model_config.hf_config, "index_topk"),
use_mm_prefix=model_config.is_mm_prefix_lm,
from vllm.config.vllm import (
get_layers_from_vllm_config,
set_current_vllm_config,
)
from vllm.model_executor.layers.attention_layer_base import (
AttentionLayerBase,
)
user_specified_backend = vllm_config.attention_config.backend
num_heads = model_config.get_num_attention_heads(
vllm_config.parallel_config,
attn_layers = get_layers_from_vllm_config(
vllm_config,
AttentionLayerBase,
)
if not attn_layers:
cache_config.block_size = 16
return
first_layer = next(iter(attn_layers.values()))
backend_cls = first_layer.get_attn_backend()
with set_current_vllm_config(vllm_config):
chosen_backend = cls.select_attention_backend(
selected_backend=user_specified_backend,
attn_selector_config=attn_selector_config,
device_capability=device_capability,
# Don't raise here — we produce better errors below.
raise_on_invalid=False,
num_heads=num_heads,
)
# If the user's --block-size forced a non-optimal backend,
# warn them. Only relevant when the user didn't also specify
# --attention-backend (in which case the choice is explicit).
if (
chosen_backend is not None
and user_specified_block_size
and user_specified_backend is None
):
optimal = cls.select_attention_backend(
selected_backend=None,
attn_selector_config=attn_selector_config._replace(
block_size=None,
),
device_capability=device_capability,
raise_on_invalid=False,
num_heads=num_heads,
)
if optimal is not None and optimal != chosen_backend:
logger.warning(
"--block-size %d is not supported by the preferred "
"%s backend. Using %s instead, which may result "
"in reduced performance. Consider removing "
"--block-size to auto-select the optimal "
"block size.",
cache_config.block_size,
optimal.name,
chosen_backend.name,
)
if chosen_backend is not None:
if user_specified_block_size:
# User's block_size is compatible with the chosen
# backend.
return
# User didn't specify --block-size, so auto-select the
# preferred block size for the chosen backend.
try:
backend_class = chosen_backend.get_class()
except ImportError:
return # Will fail later with a better error
preferred = backend_class.get_preferred_block_size(
cache_config.block_size,
)
if cache_config.block_size != preferred:
preferred = backend_cls.get_preferred_block_size(16)
if preferred != 16:
logger.info(
"Setting kv cache block size to %d for %s backend.",
preferred,
chosen_backend.name,
backend_cls.get_name(),
)
cache_config.block_size = preferred
return
# No valid backend found. If the user didn't constrain the
# selection, defer the error to get_attn_backend_cls where
# the full config (including per-layer settings) is
# available.
if not user_specified_block_size:
return
if user_specified_backend is not None:
# User specified --block-size and --attention-backend
# and they are incompatible.
try:
backend_class = user_specified_backend.get_class()
supported = backend_class.get_supported_kernel_block_sizes()
except ImportError:
supported = None
raise ValueError(
f"User-specified --block-size "
f"{cache_config.block_size} is incompatible with "
f"the specified --attention-backend "
f"{user_specified_backend.name} (supported kernel "
f"block sizes: {supported}). Either remove "
f"--block-size to auto-select, or choose a "
f"compatible value."
)
else:
# User specified --block-size but no backend supports
# it.
_, invalid_reasons = cls.get_valid_backends(
device_capability=device_capability,
attn_selector_config=attn_selector_config,
num_heads=num_heads,
)
reasons_str = ", ".join(
f"{b.name}: [{', '.join(r)}]" for b, r in invalid_reasons.items()
)
raise ValueError(
f"No valid attention backend found for "
f"--block-size {cache_config.block_size}. "
f"Reasons: {{{reasons_str}}}. Either remove "
f"--block-size to auto-select, or choose a "
f"compatible value."
)
@classmethod
def get_current_memory_usage(
@@ -358,10 +242,10 @@ class CudaPlatformBase(Platform):
num_heads: int | None = None,
) -> tuple[
list[tuple["AttentionBackendEnum", int]],
dict["AttentionBackendEnum", list[str]],
dict["AttentionBackendEnum", tuple[int, list[str]]],
]:
valid_backends_priorities = []
invalid_reasons = {}
invalid_reasons: dict[AttentionBackendEnum, tuple[int, list[str]]] = {}
backend_priorities = _get_backend_priorities(
attn_selector_config.use_mla,
@@ -378,7 +262,7 @@ class CudaPlatformBase(Platform):
except ImportError:
invalid_reasons_i = ["ImportError"]
if invalid_reasons_i:
invalid_reasons[backend] = invalid_reasons_i
invalid_reasons[backend] = (priority, invalid_reasons_i)
else:
valid_backends_priorities.append((backend, priority))
@@ -439,7 +323,7 @@ class CudaPlatformBase(Platform):
"{"
+ ", ".join(
f"{backend.name}: [{', '.join(reasons)}]"
for backend, reasons in invalid_reasons.items()
for backend, (_, reasons) in invalid_reasons.items()
)
+ "}"
)
@@ -452,7 +336,30 @@ class CudaPlatformBase(Platform):
# Select the one with the highest priority (lowest index).
sorted_backends = sorted(valid_backends_priorities, key=lambda x: x[1])
return sorted_backends[0][0]
chosen_backend, chosen_priority = sorted_backends[0]
# If the user specified --block-size (but not --attention-backend),
# check whether that constraint precluded any higher-priority backends.
if attn_selector_config.block_size is not None:
excluded = [
backend
for backend, (priority, reasons) in invalid_reasons.items()
if priority < chosen_priority
and reasons == ["block_size not supported"]
]
if excluded:
names = ", ".join(b.name for b in excluded)
logger.warning(
"--block-size %d excluded higher-priority backend(s) "
"%s. Using %s instead, which may result in reduced "
"performance. Consider removing --block-size to "
"auto-select the optimal block size.",
attn_selector_config.block_size,
names,
chosen_backend.name,
)
return chosen_backend
@classmethod
def get_attn_backend_cls(
@@ -487,7 +394,7 @@ class CudaPlatformBase(Platform):
"{"
+ ", ".join(
f"{backend.name}: [{', '.join(reasons)}]"
for backend, reasons in invalid_reasons.items()
for backend, (_, reasons) in invalid_reasons.items()
)
+ "}"
)
@@ -499,7 +406,7 @@ class CudaPlatformBase(Platform):
logger.info_once(
"Using %s attention backend out of potential backends: %s",
chosen_backend.name,
tuple(b[0].name for b in valid_backends_priorities),
tuple(backend.name for backend, _ in valid_backends_priorities),
scope="local",
)

View File

@@ -406,6 +406,13 @@ class Platform:
"""
pass
@classmethod
def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
    """Ensure block_size is compatible with the attention backend.

    Default implementation is a no-op; platform subclasses are expected
    to override it with backend-specific block-size handling.
    """
@classmethod
def verify_model_arch(cls, model_arch: str) -> None:
"""

View File

@@ -114,7 +114,14 @@ class EngineCore:
num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
vllm_config
)
if kv_cache_config.kv_cache_groups:
vllm_config.cache_config.block_size = min(
g.kv_cache_spec.block_size for g in kv_cache_config.kv_cache_groups
)
elif vllm_config.cache_config.block_size is None:
# Attention-free models (encoder-only, SSM) — use default.
vllm_config.cache_config.block_size = 16
vllm_config.validate_block_size()
vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks
vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks
self.collective_rpc("initialize_cache", args=(num_gpu_blocks, num_cpu_blocks))

View File

@@ -41,6 +41,7 @@ from vllm.distributed.parallel_state import (
)
from vllm.envs import enable_envs_cache
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.tracing import instrument, maybe_init_worker_tracer
from vllm.utils.network_utils import (
get_distributed_init_method,
@@ -579,6 +580,9 @@ class WorkerProc:
self._init_message_queues(input_shm_handle, vllm_config)
self.worker.load_model()
# Set block size based on the attention backends
current_platform.update_block_size_for_backend(vllm_config)
# Enable environment variable cache (e.g. assume no more
# environment variable overrides after this point)
enable_envs_cache()

View File

@@ -385,6 +385,11 @@ class RayDistributedExecutor(Executor):
self.collective_rpc("init_device")
self.collective_rpc("load_model")
def _update_block_size(worker):
current_platform.update_block_size_for_backend(worker.vllm_config)
self.collective_rpc(_update_block_size)
for pp_rank in range(self.parallel_config.pipeline_parallel_size):
self.pp_tp_workers.append([])
for tp_rank in range(self.parallel_config.tensor_parallel_size):

View File

@@ -12,6 +12,7 @@ import torch.distributed as dist
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils.network_utils import get_distributed_init_method, get_ip, get_open_port
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
@@ -46,6 +47,7 @@ class UniProcExecutor(Executor):
self.driver_worker.init_worker(all_kwargs=[kwargs])
self.driver_worker.init_device()
self.driver_worker.load_model()
current_platform.update_block_size_for_backend(self.vllm_config)
def _distributed_args(self) -> tuple[str, int, int]:
"""Return (distributed_init_method, rank, local_rank)."""

View File

@@ -513,6 +513,7 @@ class GPUModelRunner(
custom_logitsprocs: Sequence[str | type[LogitsProcessor]] = (
tuple(logits_processors) if logits_processors is not None else ()
)
placeholder_block_size = self.cache_config.block_size or 16
self.input_batch = InputBatch(
max_num_reqs=self.max_num_reqs,
# We need to use the encoder length for encoder-decoder
@@ -522,8 +523,8 @@ class GPUModelRunner(
device=self.device,
pin_memory=self.pin_memory,
vocab_size=self.model_config.get_vocab_size(),
block_sizes=[self.cache_config.block_size],
kernel_block_sizes=[self.cache_config.block_size],
block_sizes=[placeholder_block_size],
kernel_block_sizes=[placeholder_block_size],
is_spec_decode=bool(self.vllm_config.speculative_config),
logitsprocs=build_logitsprocs(
self.vllm_config,