[model] support qwen2audio embedding input (#23625)
Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
@@ -47,7 +47,7 @@ from vllm.model_executor.models.qwen2_5_vl import (
|
||||
Qwen2_5_VLProcessingInfo, Qwen2_5_VLVideoEmbeddingInputs,
|
||||
Qwen2_5_VLVideoInputs, Qwen2_5_VLVideoPixelInputs)
|
||||
from vllm.model_executor.models.qwen2_audio import (
|
||||
Qwen2AudioInputs, Qwen2AudioProcessingInfo,
|
||||
Qwen2AudioFeatureInputs, Qwen2AudioProcessingInfo,
|
||||
_get_feat_extract_output_lengths)
|
||||
from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalDataParser
|
||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
@@ -534,7 +534,7 @@ class Qwen2_5OmniConditionalGenerationMixin:
|
||||
return torch.concat(mm_input, dim=dim)
|
||||
|
||||
def _parse_and_validate_audio_input(
|
||||
self, **kwargs: object) -> Optional[Qwen2AudioInputs]:
|
||||
self, **kwargs: object) -> Optional[Qwen2AudioFeatureInputs]:
|
||||
input_audio_features = kwargs.pop('input_audio_features', None)
|
||||
audio_feature_lengths = kwargs.pop('audio_feature_lengths', None)
|
||||
feature_attention_mask = kwargs.pop('feature_attention_mask', None)
|
||||
@@ -548,9 +548,10 @@ class Qwen2_5OmniConditionalGenerationMixin:
|
||||
if not isinstance(input_audio_features, (torch.Tensor, list)):
|
||||
raise ValueError("Incorrect type of audio input features. "
|
||||
f"Got type: {type(input_audio_features)}")
|
||||
return Qwen2AudioInputs(input_features=input_audio_features,
|
||||
audio_feature_lengths=audio_feature_lengths,
|
||||
feature_attention_mask=feature_attention_mask)
|
||||
return Qwen2AudioFeatureInputs(
|
||||
input_features=input_audio_features,
|
||||
audio_feature_lengths=audio_feature_lengths,
|
||||
feature_attention_mask=feature_attention_mask)
|
||||
|
||||
def _parse_and_validate_image_input(
|
||||
self,
|
||||
@@ -630,7 +631,7 @@ class Qwen2_5OmniConditionalGenerationMixin:
|
||||
|
||||
def _process_audio_input(
|
||||
self,
|
||||
audio_input: Qwen2AudioInputs,
|
||||
audio_input: Qwen2AudioFeatureInputs,
|
||||
audio_hashes: list[str] = None,
|
||||
cached_audio_features: torch.Tensor = None,
|
||||
) -> torch.Tensor:
|
||||
|
||||
Reference in New Issue
Block a user