[Bugfix] Fix Basic Models Test (#34818)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-02-19 17:49:07 -05:00
parent 4fb8beefaa
commit 662205d34e
14 changed files with 175 additions and 221 deletions
--- a/tests/models/multimodal/processing/test_tensor_schema.py
+++ b/tests/models/multimodal/processing/test_tensor_schema.py
@@ -13,6 +13,7 @@ import torch.nn as nn
 from PIL import Image

 from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
+from vllm.config.cache import CacheConfig
 from vllm.config.multimodal import (
    AudioDummyOptions,
    BaseDummyOptions,
@@ -131,7 +132,9 @@ def initialize_dummy_model(
 ):
    temp_file = tempfile.mkstemp()[1]
    current_device = torch.get_default_device()
-    vllm_config = VllmConfig(model_config=model_config)
+    vllm_config = VllmConfig(
+        model_config=model_config, cache_config=CacheConfig(block_size=16)
+    )
    with set_current_vllm_config(vllm_config=vllm_config):
        init_distributed_environment(
            world_size=1,
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -457,6 +457,9 @@ def dummy_hf_overrides(
    # Kimi uses `num_expert_group` instead of `n_group`.
    if n_group is None:
        n_group = getattr(text_config, "num_expert_group", None)
+    # InternS1Pro uses `router_n_groups` instead of `n_group`.
+    if n_group is None:
+        n_group = getattr(text_config, "router_n_groups", None)
    num_experts = n_group * 2 if n_group is not None else 2

    # we use three layers for Gemma-3n to check
@@ -486,12 +489,14 @@ def dummy_hf_overrides(
    # Only set MoE related config when the model has MoE layers.
    # Otherwise all models detected as MoE by _get_transformers_backend_cls.
    if model_arch_config.num_experts > 0:
+        orig_topk = getattr(text_config, "num_experts_per_tok", 2)
+        topk = min(orig_topk, 2)
        update_dict.update(
            {
                "num_experts": num_experts,
-                "num_experts_per_tok": 2,
+                "num_experts_per_tok": topk,
                # Kimi uses `num_experts_per_token`.
-                "num_experts_per_token": 2,
+                "num_experts_per_token": topk,
                "num_local_experts": num_experts,
                # Otherwise there will not be any expert layers
                "first_k_dense_replace": 0,
--- a/tests/v1/spec_decode/test_eagle.py
+++ b/tests/v1/spec_decode/test_eagle.py
@@ -78,7 +78,7 @@ def _create_proposer(
    device = current_platform.device_type
    vllm_config = VllmConfig(
        model_config=model_config,
-        cache_config=CacheConfig(),
+        cache_config=CacheConfig(block_size=16),
        speculative_config=speculative_config,
        device_config=DeviceConfig(device=device),
        parallel_config=ParallelConfig(),
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -41,8 +41,8 @@ class CacheConfig:
    block_size: SkipValidation[int] = None  # type: ignore[assignment]
    """Size of a contiguous cache block in number of tokens.

-    This is None until `Platform.check_and_update_config()` sets it based on
-    the current platform. Always an int by the time the engine starts."""
+    This is None until the platform sets it. Always an int by the time
+    the engine starts."""
    gpu_memory_utilization: float = Field(default=0.9, gt=0, le=1)
    """The fraction of GPU memory to be used for the model executor, which can
    range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -915,32 +915,6 @@ class VllmConfig:
            )
        current_platform.check_and_update_config(self)

-        # If DCP, ensure the block size is right.
-        if self.parallel_config.decode_context_parallel_size > 1:
-            if self.parallel_config.dcp_kv_cache_interleave_size > 1 and (
-                self.parallel_config.cp_kv_cache_interleave_size
-                != self.parallel_config.dcp_kv_cache_interleave_size
-            ):
-                self.parallel_config.cp_kv_cache_interleave_size = (
-                    self.parallel_config.dcp_kv_cache_interleave_size
-                )
-                logger.warning_once(
-                    "cp_kv_cache_interleave_size is overridden by dcp_kv_cache"
-                    "_interleave_size. And dcp-kv-cache-interleave-size will be "
-                    "deprecated when PCP is fully supported."
-                )
-            assert (
-                self.parallel_config.cp_kv_cache_interleave_size
-                <= self.cache_config.block_size
-                and self.cache_config.block_size
-                % self.parallel_config.cp_kv_cache_interleave_size
-                == 0
-            ), (
-                f"Block_size({self.cache_config.block_size}) should be greater "
-                "than or equal to and divisible by cp_kv_cache_interleave_size "
-                f"({self.parallel_config.cp_kv_cache_interleave_size})."
-            )
-
        # Do this after all the updates to compilation_config.mode
        effective_dp_size = (
            self.parallel_config.data_parallel_size
@@ -1108,26 +1082,6 @@ class VllmConfig:
            # Default to enable HMA if not explicitly disabled by user or logic above.
            self.scheduler_config.disable_hybrid_kv_cache_manager = False

-        if self.cache_config.mamba_cache_mode == "align":
-            assert (
-                self.cache_config.block_size
-                <= self.scheduler_config.max_num_batched_tokens
-            ), (
-                "In Mamba cache align mode, block_size "
-                f"({self.cache_config.block_size}) must be <= "
-                "max_num_batched_tokens "
-                f"({self.scheduler_config.max_num_batched_tokens})."
-            )
-            if self.scheduler_config.long_prefill_token_threshold > 0:
-                assert (
-                    self.scheduler_config.long_prefill_token_threshold
-                    >= self.cache_config.block_size
-                )
-            assert not self.scheduler_config.disable_chunked_mm_input, (
-                "Chunked MM input is required because we need the flexibility to "
-                "schedule a multiple of block_size tokens even if they are in the "
-                "middle of a mm input"
-            )
        if self.compilation_config.debug_dump_path:
            self.compilation_config.debug_dump_path = (
                self.compilation_config.debug_dump_path.absolute().expanduser()
@@ -1488,6 +1442,57 @@ class VllmConfig:
            f"compilation_config={self.compilation_config!r}"
        )

+    def validate_block_size(self) -> None:
+        """Validate block_size against DCP and mamba constraints.
+
+        Called after Platform.update_block_size_for_backend() has
+        finalised block_size, so that the checks see the real value
+        rather than the initial None sentinel.
+        """
+        block_size = self.cache_config.block_size
+        assert block_size is not None, (
+            "validate_block_size called before block_size was set"
+        )
+
+        # DCP interleave-size compatibility
+        if self.parallel_config.decode_context_parallel_size > 1:
+            if self.parallel_config.dcp_kv_cache_interleave_size > 1 and (
+                self.parallel_config.cp_kv_cache_interleave_size
+                != self.parallel_config.dcp_kv_cache_interleave_size
+            ):
+                self.parallel_config.cp_kv_cache_interleave_size = (
+                    self.parallel_config.dcp_kv_cache_interleave_size
+                )
+                logger.warning_once(
+                    "cp_kv_cache_interleave_size is overridden by dcp_kv_cache"
+                    "_interleave_size. And dcp-kv-cache-interleave-size will be "
+                    "deprecated when PCP is fully supported."
+                )
+            assert (
+                self.parallel_config.cp_kv_cache_interleave_size <= block_size
+                and block_size % self.parallel_config.cp_kv_cache_interleave_size == 0
+            ), (
+                f"Block_size({block_size}) should be greater "
+                "than or equal to and divisible by cp_kv_cache_interleave_size "
+                f"({self.parallel_config.cp_kv_cache_interleave_size})."
+            )
+
+        # Mamba cache align-mode constraints
+        if self.cache_config.mamba_cache_mode == "align":
+            assert block_size <= self.scheduler_config.max_num_batched_tokens, (
+                "In Mamba cache align mode, block_size "
+                f"({block_size}) must be <= "
+                "max_num_batched_tokens "
+                f"({self.scheduler_config.max_num_batched_tokens})."
+            )
+            if self.scheduler_config.long_prefill_token_threshold > 0:
+                assert self.scheduler_config.long_prefill_token_threshold >= block_size
+            assert not self.scheduler_config.disable_chunked_mm_input, (
+                "Chunked MM input is required because we need the flexibility "
+                "to schedule a multiple of block_size tokens even if they are "
+                "in the middle of a mm input"
+            )
+
    @model_validator(mode="after")
    def validate_mamba_block_size(self) -> "VllmConfig":
        if self.model_config is None:
--- a/vllm/model_executor/layers/attention/chunked_local_attention.py
+++ b/vllm/model_executor/layers/attention/chunked_local_attention.py
@@ -30,9 +30,8 @@ from vllm.v1.kv_cache_interface import (
 def create_chunked_local_attention_backend(
    underlying_attn_backend: AttentionBackend,
    attention_chunk_size: int,
-    block_size: int,
 ) -> type[AttentionBackend]:
-    prefix = f"ChunkedLocalAttention_{attention_chunk_size}_{block_size}_"
+    prefix = f"ChunkedLocalAttention_{attention_chunk_size}_"

    underlying_builder = underlying_attn_backend.get_builder_cls()
    assert issubclass(underlying_builder, AttentionMetadataBuilder)
@@ -55,7 +54,9 @@ def create_chunked_local_attention_backend(
            fast_build: bool = False,
        ):
            cm, make_virtual_batches_block_table = make_local_attention_virtual_batches(
-                attention_chunk_size, common_attn_metadata, block_size
+                attention_chunk_size,
+                common_attn_metadata,
+                self.kv_cache_spec.block_size,
            )
            metadata = super().build(common_prefix_len, cm, fast_build)
            metadata.make_virtual_batches_block_table = make_virtual_batches_block_table
@@ -97,13 +98,13 @@ class ChunkedLocalAttention(Attention):
            block_size = cache_config.block_size
        else:
            kv_cache_dtype = "auto"
-            block_size = 16
+            block_size = None

        underlying_attn_backend = get_attn_backend(
            head_size, dtype, kv_cache_dtype, block_size
        )
        attn_backend = create_chunked_local_attention_backend(
-            underlying_attn_backend, attention_chunk_size, block_size
+            underlying_attn_backend, attention_chunk_size
        )

        super().__init__(
--- a/vllm/model_executor/layers/attention/mla_attention.py
+++ b/vllm/model_executor/layers/attention/mla_attention.py
@@ -407,17 +407,24 @@ class MLAAttention(nn.Module, AttentionLayerBase):
        )

        # Attributes for forward_impl method
-        self.chunked_prefill_workspace_size = (
-            MLACommonMetadataBuilder.determine_chunked_prefill_workspace_size(
-                get_current_vllm_config()
-            )
-        )
+        self._vllm_config = get_current_vllm_config()
+        self._chunked_prefill_workspace_size: int | None = None
        self._decode_concat_quant_fp8_op = _DecodeConcatQuantFP8(
            static=True,
            group_shape=GroupShape.PER_TENSOR,
            compile_native=True,
        )

+    @property
+    def chunked_prefill_workspace_size(self) -> int:
+        if self._chunked_prefill_workspace_size is None:
+            self._chunked_prefill_workspace_size = (
+                MLACommonMetadataBuilder.determine_chunked_prefill_workspace_size(
+                    self._vllm_config
+                )
+            )
+        return self._chunked_prefill_workspace_size
+
    def forward(
        self,
        q: torch.Tensor,
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -169,21 +169,6 @@ class CudaPlatformBase(Platform):
        if parallel_config.worker_cls == "auto":
            parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"

-        cache_config = vllm_config.cache_config
-        user_specified_block_size = cache_config.block_size is not None
-        if not user_specified_block_size:
-            cache_config.block_size = 16
-
-        # Ensure block_size is compatible with the attention backend.
-        # Note: model_config may be None during testing.
-        # Skip hybrid (attention+mamba) models — their block_size is
-        # managed by HybridAttentionMambaModelConfig
-        if model_config is not None and not model_config.is_hybrid:
-            cls._update_block_size_for_backend(
-                vllm_config,
-                user_specified_block_size,
-            )
-
        scheduler_config = vllm_config.scheduler_config
        # Note: model_config may be None during testing
        if (
@@ -199,148 +184,47 @@ class CudaPlatformBase(Platform):
            scheduler_config.disable_chunked_mm_input = True

    @classmethod
-    def _update_block_size_for_backend(
-        cls,
-        vllm_config: "VllmConfig",
-        user_specified_block_size: bool,
-    ) -> None:
-        """Ensure block_size is compatible with the attention backend.
-
-        If the user specified --block-size, the selector validates/filters
-        backends by that block size (raising on incompatibility). Otherwise,
-        the backend is selected unconstrained and block_size is set to the
-        backend's preferred value.
-        """
-        from vllm.config.vllm import set_current_vllm_config
-        from vllm.v1.attention.selector import AttentionSelectorConfig
-
-        model_config = vllm_config.model_config
+    def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
        cache_config = vllm_config.cache_config
-
-        device_capability = cls.get_device_capability()
-        if device_capability is None:
+        if cache_config.block_size is not None:
+            # User specified --block-size; keep it.
            return

-        use_mla = model_config.use_mla
-        attn_selector_config = AttentionSelectorConfig(
-            head_size=model_config.get_head_size(),
-            dtype=model_config.dtype,  # type: ignore[arg-type]
-            kv_cache_dtype=cache_config.cache_dtype,
-            block_size=cache_config.block_size if user_specified_block_size else None,
-            use_mla=use_mla,
-            has_sink=False,
-            use_sparse=use_mla and hasattr(model_config.hf_config, "index_topk"),
-            use_mm_prefix=model_config.is_mm_prefix_lm,
+        model_config = vllm_config.model_config
+        # model_config may be None during testing.
+        # Skip hybrid models — their block_size is managed by
+        # HybridAttentionMambaModelConfig.
+        if model_config is None or model_config.is_hybrid:
+            cache_config.block_size = 16
+            return
+
+        from vllm.config.vllm import (
+            get_layers_from_vllm_config,
+            set_current_vllm_config,
+        )
+        from vllm.model_executor.layers.attention_layer_base import (
+            AttentionLayerBase,
        )

-        user_specified_backend = vllm_config.attention_config.backend
-        num_heads = model_config.get_num_attention_heads(
-            vllm_config.parallel_config,
+        attn_layers = get_layers_from_vllm_config(
+            vllm_config,
+            AttentionLayerBase,
        )
+        if not attn_layers:
+            cache_config.block_size = 16
+            return
+
+        first_layer = next(iter(attn_layers.values()))
+        backend_cls = first_layer.get_attn_backend()
        with set_current_vllm_config(vllm_config):
-            chosen_backend = cls.select_attention_backend(
-                selected_backend=user_specified_backend,
-                attn_selector_config=attn_selector_config,
-                device_capability=device_capability,
-                # Don't raise here — we produce better errors below.
-                raise_on_invalid=False,
-                num_heads=num_heads,
+            preferred = backend_cls.get_preferred_block_size(16)
+        if preferred != 16:
+            logger.info(
+                "Setting kv cache block size to %d for %s backend.",
+                preferred,
+                backend_cls.get_name(),
            )
-
-            # If the user's --block-size forced a non-optimal backend,
-            # warn them. Only relevant when the user didn't also specify
-            # --attention-backend (in which case the choice is explicit).
-            if (
-                chosen_backend is not None
-                and user_specified_block_size
-                and user_specified_backend is None
-            ):
-                optimal = cls.select_attention_backend(
-                    selected_backend=None,
-                    attn_selector_config=attn_selector_config._replace(
-                        block_size=None,
-                    ),
-                    device_capability=device_capability,
-                    raise_on_invalid=False,
-                    num_heads=num_heads,
-                )
-                if optimal is not None and optimal != chosen_backend:
-                    logger.warning(
-                        "--block-size %d is not supported by the preferred "
-                        "%s backend. Using %s instead, which may result "
-                        "in reduced performance. Consider removing "
-                        "--block-size to auto-select the optimal "
-                        "block size.",
-                        cache_config.block_size,
-                        optimal.name,
-                        chosen_backend.name,
-                    )
-
-            if chosen_backend is not None:
-                if user_specified_block_size:
-                    # User's block_size is compatible with the chosen
-                    # backend.
-                    return
-                # User didn't specify --block-size, so auto-select the
-                # preferred block size for the chosen backend.
-                try:
-                    backend_class = chosen_backend.get_class()
-                except ImportError:
-                    return  # Will fail later with a better error
-                preferred = backend_class.get_preferred_block_size(
-                    cache_config.block_size,
-                )
-                if cache_config.block_size != preferred:
-                    logger.info(
-                        "Setting kv cache block size to %d for %s backend.",
-                        preferred,
-                        chosen_backend.name,
-                    )
-                    cache_config.block_size = preferred
-                return
-
-            # No valid backend found. If the user didn't constrain the
-            # selection, defer the error to get_attn_backend_cls where
-            # the full config (including per-layer settings) is
-            # available.
-            if not user_specified_block_size:
-                return
-
-            if user_specified_backend is not None:
-                # User specified --block-size and --attention-backend
-                # and they are incompatible.
-                try:
-                    backend_class = user_specified_backend.get_class()
-                    supported = backend_class.get_supported_kernel_block_sizes()
-                except ImportError:
-                    supported = None
-                raise ValueError(
-                    f"User-specified --block-size "
-                    f"{cache_config.block_size} is incompatible with "
-                    f"the specified --attention-backend "
-                    f"{user_specified_backend.name} (supported kernel "
-                    f"block sizes: {supported}). Either remove "
-                    f"--block-size to auto-select, or choose a "
-                    f"compatible value."
-                )
-            else:
-                # User specified --block-size but no backend supports
-                # it.
-                _, invalid_reasons = cls.get_valid_backends(
-                    device_capability=device_capability,
-                    attn_selector_config=attn_selector_config,
-                    num_heads=num_heads,
-                )
-                reasons_str = ", ".join(
-                    f"{b.name}: [{', '.join(r)}]" for b, r in invalid_reasons.items()
-                )
-                raise ValueError(
-                    f"No valid attention backend found for "
-                    f"--block-size {cache_config.block_size}. "
-                    f"Reasons: {{{reasons_str}}}. Either remove "
-                    f"--block-size to auto-select, or choose a "
-                    f"compatible value."
-                )
+        cache_config.block_size = preferred

    @classmethod
    def get_current_memory_usage(
@@ -358,10 +242,10 @@ class CudaPlatformBase(Platform):
        num_heads: int | None = None,
    ) -> tuple[
        list[tuple["AttentionBackendEnum", int]],
-        dict["AttentionBackendEnum", list[str]],
+        dict["AttentionBackendEnum", tuple[int, list[str]]],
    ]:
        valid_backends_priorities = []
-        invalid_reasons = {}
+        invalid_reasons: dict[AttentionBackendEnum, tuple[int, list[str]]] = {}

        backend_priorities = _get_backend_priorities(
            attn_selector_config.use_mla,
@@ -378,7 +262,7 @@ class CudaPlatformBase(Platform):
            except ImportError:
                invalid_reasons_i = ["ImportError"]
            if invalid_reasons_i:
-                invalid_reasons[backend] = invalid_reasons_i
+                invalid_reasons[backend] = (priority, invalid_reasons_i)
            else:
                valid_backends_priorities.append((backend, priority))

@@ -439,7 +323,7 @@ class CudaPlatformBase(Platform):
                    "{"
                    + ", ".join(
                        f"{backend.name}: [{', '.join(reasons)}]"
-                        for backend, reasons in invalid_reasons.items()
+                        for backend, (_, reasons) in invalid_reasons.items()
                    )
                    + "}"
                )
@@ -452,7 +336,30 @@ class CudaPlatformBase(Platform):

        # Select the one with the highest priority (lowest index).
        sorted_backends = sorted(valid_backends_priorities, key=lambda x: x[1])
-        return sorted_backends[0][0]
+        chosen_backend, chosen_priority = sorted_backends[0]
+
+        # If the user specified --block-size (but not --attention-backend),
+        # check whether that constraint precluded any higher-priority backends.
+        if attn_selector_config.block_size is not None:
+            excluded = [
+                backend
+                for backend, (priority, reasons) in invalid_reasons.items()
+                if priority < chosen_priority
+                and reasons == ["block_size not supported"]
+            ]
+            if excluded:
+                names = ", ".join(b.name for b in excluded)
+                logger.warning(
+                    "--block-size %d excluded higher-priority backend(s) "
+                    "%s. Using %s instead, which may result in reduced "
+                    "performance. Consider removing --block-size to "
+                    "auto-select the optimal block size.",
+                    attn_selector_config.block_size,
+                    names,
+                    chosen_backend.name,
+                )
+
+        return chosen_backend

    @classmethod
    def get_attn_backend_cls(
@@ -487,7 +394,7 @@ class CudaPlatformBase(Platform):
                "{"
                + ", ".join(
                    f"{backend.name}: [{', '.join(reasons)}]"
-                    for backend, reasons in invalid_reasons.items()
+                    for backend, (_, reasons) in invalid_reasons.items()
                )
                + "}"
            )
@@ -499,7 +406,7 @@ class CudaPlatformBase(Platform):
            logger.info_once(
                "Using %s attention backend out of potential backends: %s",
                chosen_backend.name,
-                tuple(b[0].name for b in valid_backends_priorities),
+                tuple(backend.name for backend, _ in valid_backends_priorities),
                scope="local",
            )

--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -406,6 +406,13 @@ class Platform:
        """
        pass

+    @classmethod
+    def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
+        """
+        Ensure block_size is compatible with the attention backend.
+        """
+        pass
+
    @classmethod
    def verify_model_arch(cls, model_arch: str) -> None:
        """
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -114,7 +114,14 @@ class EngineCore:
        num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
            vllm_config
        )
-
+        if kv_cache_config.kv_cache_groups:
+            vllm_config.cache_config.block_size = min(
+                g.kv_cache_spec.block_size for g in kv_cache_config.kv_cache_groups
+            )
+        elif vllm_config.cache_config.block_size is None:
+            # Attention-free models (encoder-only, SSM) — use default.
+            vllm_config.cache_config.block_size = 16
+        vllm_config.validate_block_size()
        vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks
        vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks
        self.collective_rpc("initialize_cache", args=(num_gpu_blocks, num_cpu_blocks))
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -41,6 +41,7 @@ from vllm.distributed.parallel_state import (
 )
 from vllm.envs import enable_envs_cache
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 from vllm.tracing import instrument, maybe_init_worker_tracer
 from vllm.utils.network_utils import (
    get_distributed_init_method,
@@ -579,6 +580,9 @@ class WorkerProc:
        self._init_message_queues(input_shm_handle, vllm_config)
        self.worker.load_model()

+        # Set block size based on the attention backends
+        current_platform.update_block_size_for_backend(vllm_config)
+
        # Enable environment variable cache (e.g. assume no more
        # environment variable overrides after this point)
        enable_envs_cache()
--- a/vllm/v1/executor/ray_executor.py
+++ b/vllm/v1/executor/ray_executor.py
@@ -385,6 +385,11 @@ class RayDistributedExecutor(Executor):
        self.collective_rpc("init_device")
        self.collective_rpc("load_model")

+        def _update_block_size(worker):
+            current_platform.update_block_size_for_backend(worker.vllm_config)
+
+        self.collective_rpc(_update_block_size)
+
        for pp_rank in range(self.parallel_config.pipeline_parallel_size):
            self.pp_tp_workers.append([])
            for tp_rank in range(self.parallel_config.tensor_parallel_size):
--- a/vllm/v1/executor/uniproc_executor.py
+++ b/vllm/v1/executor/uniproc_executor.py
@@ -12,6 +12,7 @@ import torch.distributed as dist

 import vllm.envs as envs
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 from vllm.utils.network_utils import get_distributed_init_method, get_ip, get_open_port
 from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
 from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
@@ -46,6 +47,7 @@ class UniProcExecutor(Executor):
        self.driver_worker.init_worker(all_kwargs=[kwargs])
        self.driver_worker.init_device()
        self.driver_worker.load_model()
+        current_platform.update_block_size_for_backend(self.vllm_config)

    def _distributed_args(self) -> tuple[str, int, int]:
        """Return (distributed_init_method, rank, local_rank)."""
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -513,6 +513,7 @@ class GPUModelRunner(
        custom_logitsprocs: Sequence[str | type[LogitsProcessor]] = (
            tuple(logits_processors) if logits_processors is not None else ()
        )
+        placeholder_block_size = self.cache_config.block_size or 16
        self.input_batch = InputBatch(
            max_num_reqs=self.max_num_reqs,
            # We need to use the encoder length for encoder-decoer
@@ -522,8 +523,8 @@ class GPUModelRunner(
            device=self.device,
            pin_memory=self.pin_memory,
            vocab_size=self.model_config.get_vocab_size(),
-            block_sizes=[self.cache_config.block_size],
-            kernel_block_sizes=[self.cache_config.block_size],
+            block_sizes=[placeholder_block_size],
+            kernel_block_sizes=[placeholder_block_size],
            is_spec_decode=bool(self.vllm_config.speculative_config),
            logitsprocs=build_logitsprocs(
                self.vllm_config,