[Bugfix] Whisper fix number of allocated CrossAttn blocks per-request (#30772)
Signed-off-by: NickLucche <nlucches@redhat.com>
This commit is contained in:
@@ -187,6 +187,12 @@ class Scheduler(SchedulerInterface):
|
|||||||
if self.is_encoder_decoder
|
if self.is_encoder_decoder
|
||||||
else EncoderCacheManager(cache_size=encoder_cache_size)
|
else EncoderCacheManager(cache_size=encoder_cache_size)
|
||||||
)
|
)
|
||||||
|
# For encoder-decoder models, allocate the maximum number of tokens for Cross
|
||||||
|
# Attn blocks, as for Whisper its input is always padded to the maximum length.
|
||||||
|
# TODO (NickLucche): Generalize to models with variable-length encoder inputs.
|
||||||
|
self._num_encoder_max_input_tokens = (
|
||||||
|
MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(vllm_config.model_config)
|
||||||
|
)
|
||||||
|
|
||||||
speculative_config = vllm_config.speculative_config
|
speculative_config = vllm_config.speculative_config
|
||||||
self.use_eagle = False
|
self.use_eagle = False
|
||||||
@@ -568,17 +574,11 @@ class Scheduler(SchedulerInterface):
|
|||||||
0 if request.num_computed_tokens == 0 else self.num_lookahead_tokens
|
0 if request.num_computed_tokens == 0 else self.num_lookahead_tokens
|
||||||
)
|
)
|
||||||
|
|
||||||
# Determine if we need to allocate cross-attention blocks.
|
|
||||||
if self.is_encoder_decoder and request.has_encoder_inputs:
|
|
||||||
# TODO(russellb): For Whisper, we know that the input is
|
|
||||||
# always padded to the maximum length. If we support other
|
|
||||||
# encoder-decoder models, this will need to be updated if we
|
|
||||||
# want to only allocate what is needed.
|
|
||||||
num_encoder_tokens = (
|
num_encoder_tokens = (
|
||||||
self.scheduler_config.max_num_encoder_input_tokens
|
self._num_encoder_max_input_tokens
|
||||||
|
if self.is_encoder_decoder and request.has_encoder_inputs
|
||||||
|
else 0
|
||||||
)
|
)
|
||||||
else:
|
|
||||||
num_encoder_tokens = 0
|
|
||||||
|
|
||||||
new_blocks = self.kv_cache_manager.allocate_slots(
|
new_blocks = self.kv_cache_manager.allocate_slots(
|
||||||
request,
|
request,
|
||||||
|
|||||||
Reference in New Issue
Block a user