[Bugfix] Fix use_cascade_attention handling for ALiBi-based models on vllm/v1 (#15211)

Signed-off-by: h-sugi <h.sugi@ieee.org>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Author: Hiroaki Sugiyama
Date: 2025-03-27 23:29:29 +09:00
Committed by: GitHub
Parent: ac5bc615b0
Commit: 8958217ad5
2 changed files with 18 additions and 3 deletions

@@ -25,7 +25,7 @@ from vllm.multimodal.utils import group_mm_inputs_by_modality
 from vllm.sampling_params import SamplingType
 from vllm.sequence import IntermediateTensors
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
-                        LayerBlockType, LazyLoader, cdiv,
+                        LayerBlockType, LazyLoader, cdiv, check_use_alibi,
                         is_pin_memory_available)
 from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
 from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
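
The check_use_alibi helper imported above lives in the commit's other changed file (vllm/utils). A minimal sketch of what such a helper plausibly looks like, assuming the flag can be read off the Hugging Face config; the exact attributes probed are illustrative, not the committed implementation:

from vllm.config import ModelConfig


def check_use_alibi(model_config: ModelConfig) -> bool:
    """Best-effort detection of ALiBi positional bias (sketch only)."""
    hf_config = model_config.hf_config
    # MPT-style configs nest the flag under attn_config, which may be a
    # dict (trust_remote_code) or a config object (transformers' MptConfig).
    attn_config = getattr(hf_config, "attn_config", None)
    if isinstance(attn_config, dict):
        if attn_config.get("alibi", False):
            return True
    elif attn_config is not None and getattr(attn_config, "alibi", False):
        return True
    # Falcon-style configs expose a top-level "alibi" flag.
    return bool(getattr(hf_config, "alibi", False))

Because the model config is fixed for the lifetime of the runner, the result can be computed once and cached, which is exactly what the next hunk does.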
@@ -223,6 +223,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             device="cpu",
             pin_memory=self.pin_memory)
+        # Only relevant for models using ALiBi (e.g., MPT)
+        self.use_alibi = check_use_alibi(model_config)
+
         self.inputs_embeds = torch.zeros(
             (self.max_num_tokens, self.hidden_size),
             dtype=self.dtype,
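
For context on the comment above: ALiBi (Attention with Linear Biases, Press et al.) replaces positional embeddings with a per-head linear penalty on attention scores that grows with query-key distance, which is why MPT-style models need special handling here. A tiny, self-contained illustration of the bias matrix (a hypothetical helper, not vLLM code):

import torch


def alibi_bias(seq_len: int, slope: float) -> torch.Tensor:
    """bias[i, j] = -slope * (i - j): the further key j lies behind
    query i, the larger the penalty added to the attention score."""
    q_pos = torch.arange(seq_len).unsqueeze(1)  # (seq_len, 1)
    k_pos = torch.arange(seq_len).unsqueeze(0)  # (1, seq_len)
    # Future positions (j > i) are handled by the causal mask in practice;
    # the clamp just keeps this toy example tidy.
    return -slope * (q_pos - k_pos).clamp(min=0).float()


# Row 3 for slope=0.5: [-1.5, -1.0, -0.5, 0.0]
print(alibi_bias(4, 0.5))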
@@ -689,7 +692,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             query_lens=num_scheduled_tokens,
             num_query_heads=self.num_query_heads,
             num_kv_heads=self.num_kv_heads,
-            use_alibi=False,  # FIXME
+            use_alibi=self.use_alibi,
             use_sliding_window=self.window_size is not None,
             num_sms=self.num_sms,
         )
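
The flag threaded through here feeds the cascade-attention heuristic in the v1 FlashAttention backend; with use_alibi hard-coded to False, that heuristic could enable cascade attention for an ALiBi model even though the shared-prefix kernels do not apply the distance-dependent ALiBi bias. A simplified sketch of the gate's shape (the threshold and the elided occupancy heuristics are assumptions, not the backend's exact code):

import numpy as np


def use_cascade_attention(
    common_prefix_len: int,
    query_lens: np.ndarray,
    num_query_heads: int,
    num_kv_heads: int,
    use_alibi: bool,
    use_sliding_window: bool,
    num_sms: int,
) -> bool:
    # A short shared prefix is not worth the extra kernel launches.
    if common_prefix_len < 256:  # illustrative threshold
        return False
    # Cascade attention computes attention over the shared prefix and the
    # per-request suffixes separately, then merges the results; that
    # decomposition is not implemented for ALiBi biases or sliding-window
    # masks, so those models must take the regular path.
    if use_alibi or use_sliding_window:
        return False
    # ... further GPU-occupancy heuristics elided ...
    return True

With the fix, self.use_alibi makes the guard effective, so ALiBi models such as MPT or BLOOM simply skip cascade attention instead of risking incorrect output.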