[Fix] Introduce audio channels spec (#31595)
Signed-off-by: Jeremy Teboul <jeremyte@meta.com>
This commit is contained in:
@@ -226,6 +226,10 @@ class Qwen2_5OmniThinkerProcessingInfo(
|
||||
assert isinstance(feature_extractor, WhisperFeatureExtractor)
|
||||
return feature_extractor
|
||||
|
||||
def get_target_channels(self) -> int:
|
||||
"""Return target audio channels for Qwen2.5 Omni models (mono)."""
|
||||
return 1
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
|
||||
return {"audio": None, "image": None, "video": None}
|
||||
|
||||
@@ -310,6 +314,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
|
||||
return Qwen2_5OmniThinkerMultiModalDataParser(
|
||||
spatial_merge_size=self.info.get_hf_config().vision_config.spatial_merge_size,
|
||||
target_sr=feature_extractor.sampling_rate,
|
||||
target_channels=self.info.get_target_channels(),
|
||||
)
|
||||
|
||||
def _call_hf_processor(
|
||||
|
||||
@@ -140,6 +140,10 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo):
|
||||
assert isinstance(feature_extractor, WhisperFeatureExtractor)
|
||||
return feature_extractor
|
||||
|
||||
def get_target_channels(self) -> int:
|
||||
"""Return target audio channels for Qwen2 Audio models (mono)."""
|
||||
return 1
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
|
||||
return {"audio": None}
|
||||
|
||||
@@ -201,7 +205,10 @@ class Qwen2AudioMultiModalDataParser(MultiModalDataParser):
|
||||
class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor[Qwen2AudioProcessingInfo]):
|
||||
def _get_data_parser(self) -> MultiModalDataParser:
|
||||
feature_extractor = self.info.get_feature_extractor()
|
||||
return Qwen2AudioMultiModalDataParser(target_sr=feature_extractor.sampling_rate)
|
||||
return Qwen2AudioMultiModalDataParser(
|
||||
target_sr=feature_extractor.sampling_rate,
|
||||
target_channels=self.info.get_target_channels(),
|
||||
)
|
||||
|
||||
def _call_hf_processor(
|
||||
self,
|
||||
|
||||
@@ -133,6 +133,10 @@ class UltravoxProcessingInfo(BaseProcessingInfo):
|
||||
assert isinstance(feature_extractor, WhisperFeatureExtractor)
|
||||
return feature_extractor
|
||||
|
||||
def get_target_channels(self) -> int:
|
||||
"""Return target audio channels for Ultravox models (mono)."""
|
||||
return 1
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
|
||||
return {"audio": None}
|
||||
|
||||
@@ -169,7 +173,10 @@ class UltravoxDummyInputsBuilder(BaseDummyInputsBuilder[UltravoxProcessingInfo])
|
||||
class UltravoxMultiModalProcessor(BaseMultiModalProcessor[UltravoxProcessingInfo]):
|
||||
def _get_data_parser(self) -> MultiModalDataParser:
|
||||
feature_extractor = self.info.get_feature_extractor()
|
||||
return MultiModalDataParser(target_sr=feature_extractor.sampling_rate)
|
||||
return MultiModalDataParser(
|
||||
target_sr=feature_extractor.sampling_rate,
|
||||
target_channels=self.info.get_target_channels(),
|
||||
)
|
||||
|
||||
def _call_hf_processor(
|
||||
self,
|
||||
|
||||
@@ -690,6 +690,10 @@ class WhisperProcessingInfo(BaseProcessingInfo):
|
||||
assert isinstance(feature_extractor, WhisperFeatureExtractor)
|
||||
return feature_extractor
|
||||
|
||||
def get_target_channels(self) -> int:
|
||||
"""Return target audio channels for Whisper models (mono)."""
|
||||
return 1
|
||||
|
||||
def get_num_audio_tokens(self) -> int:
|
||||
return self.get_hf_config().max_source_positions
|
||||
|
||||
@@ -724,7 +728,10 @@ class WhisperDummyInputsBuilder(BaseDummyInputsBuilder[WhisperProcessingInfo]):
|
||||
class WhisperMultiModalProcessor(EncDecMultiModalProcessor[WhisperProcessingInfo]):
|
||||
def _get_data_parser(self) -> MultiModalDataParser:
|
||||
feature_extractor = self.info.get_feature_extractor()
|
||||
return MultiModalDataParser(target_sr=feature_extractor.sampling_rate)
|
||||
return MultiModalDataParser(
|
||||
target_sr=feature_extractor.sampling_rate,
|
||||
target_channels=self.info.get_target_channels(),
|
||||
)
|
||||
|
||||
@property
|
||||
def pad_dummy_encoder_prompt(self) -> bool:
|
||||
|
||||
Reference in New Issue
Block a user