[v1] Add Whisper model support (encoder-decoder) (#21088)
Signed-off-by: Russell Bryant <rbryant@redhat.com> Co-authored-by: NickLucche <nlucches@redhat.com>
This commit is contained in:
@@ -61,12 +61,16 @@ from vllm.v1.attention.backends.utils import (
|
||||
create_fast_prefill_custom_backend,
|
||||
reorder_batch_to_split_decodes_and_prefills)
|
||||
from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.v1.kv_cache_interface import (AttentionSpec,
|
||||
ChunkedLocalAttentionSpec,
|
||||
CrossAttentionSpec,
|
||||
EncoderOnlyAttentionSpec,
|
||||
FullAttentionSpec, KVCacheConfig,
|
||||
KVCacheGroupSpec, KVCacheSpec,
|
||||
MambaSpec, SlidingWindowSpec)
|
||||
# yapf: enable
|
||||
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
|
||||
DraftTokenIds, LogprobsLists, LogprobsTensors,
|
||||
ModelRunnerOutput, SamplerOutput)
|
||||
@@ -208,6 +212,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs(
|
||||
model_config)
|
||||
|
||||
if self.model_config.is_encoder_decoder:
|
||||
# Maximum length of the encoder input, only for encoder-decoder
|
||||
# models.
|
||||
self.max_encoder_len = self.mm_registry.\
|
||||
get_encdec_max_encoder_len(model_config)
|
||||
else:
|
||||
self.max_encoder_len = 0
|
||||
|
||||
# Sampler
|
||||
self.sampler = Sampler(logprobs_mode=self.model_config.logprobs_mode)
|
||||
|
||||
@@ -265,7 +277,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
# the block_sizes in the kv cache config.
|
||||
self.input_batch = InputBatch(
|
||||
max_num_reqs=self.max_num_reqs,
|
||||
max_model_len=self.max_model_len,
|
||||
# We need to use the encoder length for encoder-decoder models
|
||||
# because of KV cache for cross-attention.
|
||||
max_model_len=max(self.max_model_len, self.max_encoder_len),
|
||||
max_num_batched_tokens=self.max_num_tokens,
|
||||
device=self.device,
|
||||
pin_memory=self.pin_memory,
|
||||
@@ -798,6 +812,24 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
src=self.input_batch.prev_sampled_token_ids[
|
||||
prev_common_req_indices_tensor, 0])
|
||||
|
||||
def _get_encoder_seq_lens(
    self,
    scheduler_output: "SchedulerOutput",
    kv_cache_spec: KVCacheSpec,
    num_reqs: int,
) -> Optional[np.ndarray]:
    """Build per-request encoder sequence lengths for a KV cache group.

    Args:
        scheduler_output: Scheduler output; only its
            ``scheduled_encoder_inputs`` is read here.
        kv_cache_spec: KV cache spec of the group being processed.
        num_reqs: Length of the returned array.

    Returns:
        ``None`` when ``kv_cache_spec`` is not a ``CrossAttentionSpec``.
        Otherwise an ``int32`` array of length ``num_reqs`` that is zero
        everywhere except at the batch indices of requests listed in
        ``scheduled_encoder_inputs``, which are set to
        ``self.max_encoder_len``.
    """
    if isinstance(kv_cache_spec, CrossAttentionSpec):
        row_of = self.input_batch.req_id_to_index
        lens = np.zeros(num_reqs, dtype=np.int32)
        # Every request with scheduled encoder input is assigned the same
        # maximum encoder length, not a per-request value.
        for request_id in scheduler_output.scheduled_encoder_inputs:
            lens[row_of[request_id]] = self.max_encoder_len
        return lens

    # Other spec types do not get encoder sequence lengths.
    return None
|
||||
|
||||
def _prepare_inputs(
|
||||
self,
|
||||
scheduler_output: "SchedulerOutput",
|
||||
@@ -937,6 +969,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
# in the same group share the same metadata.
|
||||
for kv_cache_group_id, kv_cache_group_spec in enumerate(
|
||||
self.kv_cache_config.kv_cache_groups):
|
||||
encoder_seq_lens = self._get_encoder_seq_lens(
|
||||
scheduler_output, kv_cache_group_spec.kv_cache_spec, num_reqs)
|
||||
|
||||
if isinstance(kv_cache_group_spec.kv_cache_spec,
|
||||
EncoderOnlyAttentionSpec):
|
||||
@@ -981,6 +1015,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
logits_indices_padded=logits_indices_padded,
|
||||
num_logits_indices=logits_indices.size(0),
|
||||
causal=True,
|
||||
encoder_seq_lens=encoder_seq_lens,
|
||||
)
|
||||
|
||||
if self.speculative_config and \
|
||||
@@ -1253,10 +1288,24 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
self.kv_sharing_fast_prefill_logits_indices[:num_logits_padded])
|
||||
return logits_indices_padded
|
||||
|
||||
def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
|
||||
def _batch_mm_kwargs_from_scheduler(
|
||||
self,
|
||||
scheduler_output: "SchedulerOutput",
|
||||
) -> tuple[list[MultiModalKwargsItem], list[tuple[str, PlaceholderRange]]]:
|
||||
"""Batch multimodal kwargs from scheduled encoder inputs.
|
||||
|
||||
Args:
|
||||
scheduler_output: The scheduler output containing scheduled encoder
|
||||
inputs.
|
||||
|
||||
Returns:
|
||||
A tuple of (mm_kwargs, mm_hashes_pos) where:
|
||||
- mm_kwargs: List of multimodal kwargs items to be batched
|
||||
- mm_hashes_pos: List of (mm_hash, position_info) tuples
|
||||
"""
|
||||
scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs
|
||||
if not scheduled_encoder_inputs:
|
||||
return
|
||||
return [], []
|
||||
# Batch the multi-modal inputs.
|
||||
mm_kwargs = list[MultiModalKwargsItem]()
|
||||
# list of tuple (mm_hash, position_info)
|
||||
@@ -1270,6 +1319,16 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
mm_hashes_pos.append(
|
||||
(mm_hash, req_state.mm_positions[mm_input_id]))
|
||||
|
||||
return mm_kwargs, mm_hashes_pos
|
||||
|
||||
def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
|
||||
# Batch the multi-modal inputs using the helper method.
|
||||
mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler(
|
||||
scheduler_output)
|
||||
|
||||
if not mm_kwargs:
|
||||
return
|
||||
|
||||
# Batch mm inputs as much as we can: if a request in the batch has
|
||||
# multiple modalities or a different modality than the previous one,
|
||||
# we process it separately to preserve item order.
|
||||
@@ -1360,6 +1419,35 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
mm_embeds.append(mm_embeds_item)
|
||||
return mm_embeds
|
||||
|
||||
def _extract_encoder_inputs(
    self,
    scheduler_output: "SchedulerOutput",
) -> dict[str, torch.Tensor]:
    """Collect encoder inputs to pass to an encoder-decoder model.

    Args:
        scheduler_output: Scheduler output forwarded to
            ``_batch_mm_kwargs_from_scheduler``.

    Returns:
        A dict merged from every group yielded by
        ``group_mm_kwargs_by_modality``; empty when no multimodal kwargs
        were batched. The caller can splat it into the model's keyword
        arguments (e.g. ``input_features=...``).
    """
    # Only the kwargs items are needed here; the second element of the
    # helper's result is ignored.
    mm_kwargs, _ = self._batch_mm_kwargs_from_scheduler(scheduler_output)

    encoder_features = {}
    if mm_kwargs:
        grouped = group_mm_kwargs_by_modality(
            mm_kwargs,
            device=self.device,
            pin_memory=self.pin_memory,
        )
        # NOTE(review): groups are merged with dict.update, so a key that
        # appears in more than one group keeps only the last group's value.
        for _, _, features in grouped:
            encoder_features.update(features)

    return encoder_features
|
||||
|
||||
def get_model(self) -> nn.Module:
|
||||
# get raw model out of the cudagraph wrapper.
|
||||
if isinstance(self.model, CUDAGraphWrapper):
|
||||
@@ -1631,7 +1719,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
|
||||
# _prepare_inputs may reorder the batch, so we must gather multi
|
||||
# modal outputs after that to ensure the correct order
|
||||
if self.supports_mm_inputs and get_pp_group().is_first_rank:
|
||||
if (self.supports_mm_inputs and get_pp_group().is_first_rank
|
||||
and not self.model_config.is_encoder_decoder):
|
||||
# Run the multimodal encoder if any.
|
||||
self._execute_mm_encoder(scheduler_output)
|
||||
mm_embeds = self._gather_mm_embeddings(scheduler_output)
|
||||
@@ -1673,6 +1762,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
intermediate_tensors = self.sync_and_slice_intermediate_tensors(
|
||||
num_input_tokens, intermediate_tensors, True)
|
||||
|
||||
if (self.model_config.is_encoder_decoder
|
||||
and scheduler_output.scheduled_encoder_inputs):
|
||||
encoder_inputs = self._extract_encoder_inputs(scheduler_output)
|
||||
model_kwargs.update(encoder_inputs)
|
||||
|
||||
return (
|
||||
num_scheduled_tokens,
|
||||
num_input_tokens,
|
||||
@@ -2591,17 +2685,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
|
||||
with self.maybe_dummy_run_with_lora(self.lora_config,
|
||||
num_scheduled_tokens, remove_lora):
|
||||
if self.supports_mm_inputs:
|
||||
model_kwargs = self._init_model_kwargs(num_tokens)
|
||||
if (self.supports_mm_inputs
|
||||
and not self.model_config.is_encoder_decoder):
|
||||
input_ids = None
|
||||
inputs_embeds = self.inputs_embeds.gpu[:num_tokens]
|
||||
model_kwargs = {
|
||||
**self._init_model_kwargs(num_tokens),
|
||||
**model_kwargs,
|
||||
**self._dummy_mm_kwargs(num_reqs),
|
||||
}
|
||||
else:
|
||||
input_ids = self.input_ids.gpu[:num_tokens]
|
||||
inputs_embeds = None
|
||||
model_kwargs = self._init_model_kwargs(num_tokens)
|
||||
|
||||
if self.uses_mrope:
|
||||
positions = self.mrope_positions.gpu[:, :num_tokens]
|
||||
@@ -2823,7 +2918,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
mm_budget = self.mm_budget
|
||||
assert mm_budget is not None
|
||||
|
||||
# TODO: handle encoder-decoder models once we support them.
|
||||
if (encoder_budget := mm_budget.get_encoder_budget()) > 0:
|
||||
# NOTE: Currently model is profiled with a single non-text
|
||||
# modality with the max possible input tokens even when
|
||||
@@ -3170,7 +3264,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
"for more details.")
|
||||
self.input_batch = InputBatch(
|
||||
max_num_reqs=self.max_num_reqs,
|
||||
max_model_len=self.max_model_len,
|
||||
max_model_len=max(self.max_model_len, self.max_encoder_len),
|
||||
max_num_batched_tokens=self.max_num_tokens,
|
||||
device=self.device,
|
||||
pin_memory=self.pin_memory,
|
||||
@@ -3443,7 +3537,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention)
|
||||
for layer_name, attn_module in attn_layers.items():
|
||||
if attn_module.attn_type == AttentionType.ENCODER_ONLY:
|
||||
attn_spec = EncoderOnlyAttentionSpec(
|
||||
attn_spec: AttentionSpec = EncoderOnlyAttentionSpec(
|
||||
block_size=block_size,
|
||||
num_kv_heads=attn_module.num_kv_heads,
|
||||
head_size=attn_module.head_size,
|
||||
@@ -3485,7 +3579,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
self.shared_kv_cache_layers[layer_name] = kv_tgt_layer
|
||||
continue
|
||||
|
||||
# TODO: Support other attention modules, e.g., cross-attention
|
||||
# TODO(lucas): move the attention specs into the model layers like
|
||||
# the attention backends
|
||||
if attn_module.attn_type == AttentionType.DECODER:
|
||||
@@ -3513,12 +3606,17 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
head_size=attn_module.head_size,
|
||||
dtype=self.kv_cache_dtype,
|
||||
use_mla=use_mla)
|
||||
elif attn_module.attn_type == AttentionType.ENCODER_DECODER:
|
||||
kv_cache_spec[layer_name] = CrossAttentionSpec(
|
||||
block_size=block_size,
|
||||
num_kv_heads=attn_module.num_kv_heads,
|
||||
head_size=attn_module.head_size,
|
||||
dtype=self.kv_cache_dtype,
|
||||
use_mla=use_mla)
|
||||
elif attn_module.attn_type in (AttentionType.ENCODER,
|
||||
AttentionType.ENCODER_ONLY):
|
||||
# encoder-only attention does not need KV cache.
|
||||
continue
|
||||
elif attn_module.attn_type == AttentionType.ENCODER_DECODER:
|
||||
raise NotImplementedError
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unknown attention type: {attn_module.attn_type}")
|
||||
|
||||
Reference in New Issue
Block a user