diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py index 53434b0b4..43fe49037 100644 --- a/tests/compile/test_config.py +++ b/tests/compile/test_config.py @@ -577,48 +577,6 @@ def test_compile_sizes_padding_validation(): dispatcher.initialize_cudagraph_keys(CUDAGraphMode.NONE) # Should not raise -@pytest.mark.parametrize( - "capture_sizes, max_size, num_blocks, expected_sizes, expected_max", - [ - # Normal capping: sizes filtered to <= num_blocks - ( - [1, 2, 4, 8, 16, 32, 64, 128, 256, 512], - 512, - 200, - [1, 2, 4, 8, 16, 32, 64, 128], - 128, - ), - # No capping needed: num_blocks >= max - ([1, 2, 4, 8, 16], 16, 1000, [1, 2, 4, 8, 16], 16), - # Exact boundary: num_blocks == max (no capping) - ([1, 2, 4, 8, 16, 32], 32, 32, [1, 2, 4, 8, 16, 32], 32), - # All sizes capped: num_blocks < smallest size - ([8, 16, 32], 32, 4, [], 0), - # num_blocks <= 0: early return, no change - ([1, 2, 4], 4, 0, [1, 2, 4], 4), - ], -) -def test_adjust_cudagraph_sizes_for_mamba_cache( - capture_sizes, max_size, num_blocks, expected_sizes, expected_max -): - """Test that cudagraph capture sizes are correctly capped to fit - available Mamba cache blocks. - - See: https://github.com/vllm-project/vllm/issues/34094 - """ - config = CompilationConfig( - cudagraph_capture_sizes=capture_sizes, - max_cudagraph_capture_size=max_size, - cudagraph_mode=CUDAGraphMode.NONE, - ) - config.adjust_cudagraph_sizes_for_mamba_cache(num_blocks) - assert config.cudagraph_capture_sizes == expected_sizes - assert config.max_cudagraph_capture_size == expected_max - # Invariant: last element == max_cudagraph_capture_size - if expected_sizes: - assert config.cudagraph_capture_sizes[-1] == config.max_cudagraph_capture_size - - def test_inductor_asserts_default_disabled(monkeypatch): """Test that inductor runtime asserts are disabled by default (INFO logging level) on torch < 2.12.""" diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 93c5435e8..02fac6dfc 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -1191,9 +1191,9 @@ def test_is_uniform_decode() -> None: current_platform.is_rocm(), reason="Attention backend FLASHINFER is not supported on ROCm.", ) -def test_cudagraph_sizes_capped_for_mamba_cache(): - """Test that cudagraph capture sizes are capped to num_blocks for - hybrid models with Mamba layers. +def test_mamba_cache_raises_when_max_num_seqs_exceeds_blocks(): + """Test that a ValueError is raised when max_num_seqs exceeds the + available Mamba cache blocks for hybrid models with FULL cudagraphs. See: https://github.com/vllm-project/vllm/issues/34094 """ @@ -1284,23 +1284,8 @@ def test_cudagraph_sizes_capped_for_mamba_cache(): )[0] num_blocks = kv_cache_config.num_blocks - # Set max_cudagraph_capture_size to a value larger than num_blocks - # to trigger the Mamba capping logic. - large_max = num_blocks + 100 - compilation_config = vllm_config.compilation_config - compilation_config.max_cudagraph_capture_size = large_max - compilation_config.cudagraph_capture_sizes = [ - s for s in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512] if s <= large_max - ] + # Force max_num_seqs to exceed num_blocks so the check triggers. + runner.max_num_reqs = num_blocks + 100 - runner.initialize_kv_cache(kv_cache_config) - - # After initialization, cudagraph sizes should be capped - assert compilation_config.max_cudagraph_capture_size <= num_blocks - assert all(s <= num_blocks for s in compilation_config.cudagraph_capture_sizes) - # Invariant: last element == max - if compilation_config.cudagraph_capture_sizes: - assert ( - compilation_config.cudagraph_capture_sizes[-1] - == compilation_config.max_cudagraph_capture_size - ) + with pytest.raises(ValueError, match="max_num_seqs"): + runner.initialize_kv_cache(kv_cache_config) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 5b6648908..1d09e2b7d 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -1279,58 +1279,6 @@ class CompilationConfig: self.max_cudagraph_capture_size = rounded_sizes[-1] self.cudagraph_capture_sizes = rounded_sizes - def adjust_cudagraph_sizes_for_mamba_cache( - self, num_mamba_cache_blocks: int - ) -> None: - """Cap cudagraph capture sizes to available Mamba cache blocks. - - For hybrid Mamba/attention models, the Mamba conv_state and - ssm_state tensors have their first dimension equal to num_blocks - (from KVCacheConfig). During CUDA graph capture the decode batch - size equals num_tokens, so capture sizes exceeding num_blocks - would cause out-of-bounds access in Mamba kernels. - - See: https://github.com/vllm-project/vllm/issues/34094 - """ - if not self.cudagraph_capture_sizes or num_mamba_cache_blocks <= 0: - return - - assert self.max_cudagraph_capture_size is not None - - if num_mamba_cache_blocks >= self.max_cudagraph_capture_size: - return - - capped_sizes = [ - s for s in self.cudagraph_capture_sizes if s <= num_mamba_cache_blocks - ] - - if len(capped_sizes) == 0: - logger.warning( - "No valid cudagraph capture sizes remain after capping " - "to Mamba cache blocks (%d). The smallest capture size " - "was %d. Disabling cudagraph capture. Consider reducing " - "max_num_seqs or increasing available GPU memory.", - num_mamba_cache_blocks, - self.cudagraph_capture_sizes[0], - ) - self.cudagraph_capture_sizes = [] - self.max_cudagraph_capture_size = 0 - return - - logger.warning( - "Capping cudagraph capture sizes from max %d to %d to fit " - "Mamba cache blocks (%d blocks available). This limits the " - "maximum batch size that can use CUDA graphs. To increase " - "this limit, reduce max_num_seqs or increase available GPU " - "memory.", - self.max_cudagraph_capture_size, - capped_sizes[-1], - num_mamba_cache_blocks, - ) - - self.max_cudagraph_capture_size = capped_sizes[-1] - self.cudagraph_capture_sizes = capped_sizes - def get_compile_ranges(self) -> list[Range]: """Get the compile ranges for the compilation config.""" if self.compile_ranges_endpoints is None: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8a43f43d0..8cfa61baa 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -5800,7 +5800,7 @@ class GPUModelRunner( ) self.cache_config.num_gpu_blocks_override = saved_override - self.initialize_kv_cache(minimal_config) + self.initialize_kv_cache(minimal_config, is_profiling=True) self.cache_config.num_gpu_blocks = minimal_config.num_blocks logger.debug("Initialized minimal KV cache for CUDA graph profiling") @@ -6121,7 +6121,11 @@ class GPUModelRunner( torch.accelerator.synchronize() self.maybe_remove_all_loras(self.lora_config) - def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None: + def initialize_attn_backend( + self, + kv_cache_config: KVCacheConfig, + is_profiling: bool = False, + ) -> None: """ Initialize the attention backends and attention metadata builders. """ @@ -6193,7 +6197,9 @@ class GPUModelRunner( # Resolve cudagraph_mode before actually initialize metadata_builders self._check_and_update_cudagraph_mode( - attention_backend_list, kv_cache_config.kv_cache_groups + attention_backend_list, + kv_cache_config.kv_cache_groups, + is_profiling=is_profiling, ) # Check if attention backend supports PCP&DCP and related features. @@ -6237,6 +6243,7 @@ class GPUModelRunner( self, attention_backends: list[set[type[AttentionBackend]]], kv_cache_groups: list[KVCacheGroupSpec], + is_profiling: bool = False, ) -> None: """ Resolve the cudagraph_mode when there are multiple attention @@ -6377,21 +6384,29 @@ class GPUModelRunner( self.uniform_decode_query_len, self.parallel_config.tensor_parallel_size ) - # If the model has Mamba layers and cudagraph mode includes FULL - # decode, cap cudagraph capture sizes to the number of available - # Mamba cache blocks. Each decode request needs one conv_state - # cache line, so capture batch sizes cannot exceed num_blocks. - # Only FULL decode graphs are affected because PIECEWISE captures - # run GDN/Mamba ops eagerly (prefill path, no causal_conv1d_update). + # For Mamba models with FULL decode cudagraphs, each decode + # sequence needs one Mamba cache block. The decode cudagraph + # dispatcher already caps batch sizes at max_num_seqs, so we just + # need to verify that enough blocks exist. Raising here instead + # of silently capping cudagraph_capture_sizes avoids unintended + # restrictions on PIECEWISE (prefill) cudagraphs. # See: https://github.com/vllm-project/vllm/issues/34094 - if cudagraph_mode.has_full_cudagraphs(): + if cudagraph_mode.has_full_cudagraphs() and not is_profiling: has_mamba = any( isinstance(g.kv_cache_spec, MambaSpec) for g in kv_cache_groups ) if has_mamba and self.kv_cache_config is not None: - self.compilation_config.adjust_cudagraph_sizes_for_mamba_cache( - self.kv_cache_config.num_blocks - ) + num_blocks = self.kv_cache_config.num_blocks + if self.max_num_reqs > num_blocks: + raise ValueError( + f"max_num_seqs ({self.max_num_reqs}) exceeds " + f"available Mamba cache blocks ({num_blocks}). " + f"Each decode sequence requires one Mamba cache " + f"block, so CUDA graph capture cannot proceed. " + f"Please lower max_num_seqs to at most " + f"{num_blocks} or increase " + f"gpu_memory_utilization." + ) # Trigger cudagraph dispatching keys initialization after # resolved cudagraph mode. @@ -6752,7 +6767,11 @@ class GPUModelRunner( else: break - def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: + def initialize_kv_cache( + self, + kv_cache_config: KVCacheConfig, + is_profiling: bool = False, + ) -> None: """ Initialize KV cache based on `kv_cache_config`. Args: @@ -6764,7 +6783,7 @@ class GPUModelRunner( self._mamba_copy_bufs = None self.may_add_encoder_only_layers_to_kv_cache_config() self.maybe_add_kv_sharing_layers_to_kv_cache_groups(kv_cache_config) - self.initialize_attn_backend(kv_cache_config) + self.initialize_attn_backend(kv_cache_config, is_profiling=is_profiling) # The kernel block size for all KV cache groups. For example, if # kv_cache_manager uses block_size 256 for a given group, but the attention # backends for that group only supports block_size 64, we will return