diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index a405d8eb4..28978693c 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -64,6 +64,7 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape from .minicpmv import ( _MAX_FRAMES_PER_VIDEO, MiniCPMV2_6, + MiniCPMV4_5, MiniCPMVDummyInputsBuilder, MiniCPMVMultiModalDataParser, MiniCPMVMultiModalProcessor, @@ -197,7 +198,9 @@ class MiniCPMOProcessingInfo(MiniCPMVProcessingInfo): ) def get_default_audio_pool_step(self) -> int: - return 2 + hf_config = self.get_hf_config() + # MiniCPM-o 4.5 uses pool_step=5, older versions use 2 + return getattr(hf_config, "audio_pool_step", 2) def get_default_audio_sampling_rate(self) -> int: return 16000 @@ -521,12 +524,9 @@ class MiniCPMWhisperEncoder(WhisperEncoder): ) -@MULTIMODAL_REGISTRY.register_processor( - MiniCPMOMultiModalProcessor, - info=MiniCPMOProcessingInfo, - dummy_inputs=MiniCPMODummyInputsBuilder, -) -class MiniCPMO(MiniCPMV2_6): +class MiniCPMOBaseModel: + """Base mixin class for MiniCPM-O models with audio support.""" + packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -767,3 +767,82 @@ class MiniCPMO(MiniCPMV2_6): multimodal_embeddings += tuple(audio_embeddings) return multimodal_embeddings + + +class MiniCPMO2_6(MiniCPMOBaseModel, MiniCPMV2_6): + """MiniCPM-O 2.6 model with Qwen2 backbone.""" + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + + with self._mark_tower_model(vllm_config, "audio"): + self.apm = self.init_audio_module( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "apm") + ) + + +class MiniCPMO4_5(MiniCPMOBaseModel, MiniCPMV4_5): + """MiniCPM-O 4.5 model with Qwen3 backbone.""" + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + + with self._mark_tower_model(vllm_config, "audio"): + self.apm = self.init_audio_module( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "apm") + ) + + +_MINICPMO_SUPPORT_VERSION = { + (2, 6): MiniCPMO2_6, + (4, 5): MiniCPMO4_5, +} + + +@MULTIMODAL_REGISTRY.register_processor( + MiniCPMOMultiModalProcessor, + info=MiniCPMOProcessingInfo, + dummy_inputs=MiniCPMODummyInputsBuilder, +) +class MiniCPMO(MiniCPMOBaseModel, MiniCPMV2_6): + """ + MiniCPM-O model with audio support. + Different versions use different LLM backbones: + - Version 2.6: Uses Qwen2 + - Version 4.5: Uses Qwen3 + """ + + def __new__(cls, *, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_config + + # Determine version from config + if hasattr(config, "version"): + try: + version_str = str(config.version) + version_parts = version_str.split(".") + version = tuple(int(x) for x in version_parts[:2]) + except (ValueError, TypeError) as e: + raise ValueError( + f"Invalid model version format in config: {config.version}. " + "Expected a dot-separated version string like '4.5'." + ) from e + else: + # Default to 2.6 for backward compatibility + version = (2, 6) + + # Dispatch class based on version + instance_cls = _MINICPMO_SUPPORT_VERSION.get(version) + if instance_cls is None: + supported_versions = ", ".join( + [f"{v[0]}.{v[1]}" for v in sorted(_MINICPMO_SUPPORT_VERSION.keys())] + ) + raise ValueError( + f"Currently, MiniCPMO only supports versions " + f"{supported_versions}. Got version: {version}" + ) + + return instance_cls(vllm_config=vllm_config, prefix=prefix) + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + # This __init__ won't be called due to __new__ returning a different class + pass