diff --git a/tests/models/registry.py b/tests/models/registry.py index 4a105dedd..40c4d0d31 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -1020,18 +1020,15 @@ _MULTIMODAL_EXAMPLE_MODELS = { min_transformers_version="4.57", ), "Qwen3ASRForConditionalGeneration": _HfExamplesInfo( - "Qwen/Qwen3-ASR-1.7B", + "Qwen/Qwen3-ASR-0.6B", max_model_len=4096, min_transformers_version="4.57", - is_available_online=False, ), "Qwen3ASRRealtimeGeneration": _HfExamplesInfo( - "Qwen/Qwen3-ASR-1.7B", + "Qwen/Qwen3-ASR-0.6B", max_model_len=4096, min_transformers_version="4.57", - enforce_eager=True, hf_overrides={"architectures": ["Qwen3ASRRealtimeGeneration"]}, - is_available_online=False, ), "RForConditionalGeneration": _HfExamplesInfo("YannQi/R-4B", trust_remote_code=True), "SkyworkR1VChatModel": _HfExamplesInfo( diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index e0395a5b1..469d7fb71 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -48,7 +48,6 @@ from vllm.transformers_utils.configs.deepseek_vl2 import ( MlpProjectorConfig, VisionEncoderConfig, ) -from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.torch_utils import set_default_torch_dtype @@ -160,7 +159,7 @@ class DeepseekVL2ProcessingInfo(BaseProcessingInfo): return self.ctx.get_hf_config(DeepseekVLV2Config) def get_hf_processor(self, **kwargs: object): - return self.ctx.get_hf_processor(DeepseekVLV2Processor, **kwargs) + return self.ctx.get_hf_processor(**kwargs) def get_supported_mm_limits(self) -> Mapping[str, int | None]: return {"image": None} diff --git a/vllm/model_executor/models/fireredasr2.py b/vllm/model_executor/models/fireredasr2.py index 981c65472..5d6c68454 100644 --- a/vllm/model_executor/models/fireredasr2.py +++ b/vllm/model_executor/models/fireredasr2.py @@ -41,7 +41,7 @@ from vllm.multimodal.processing import ( PromptUpdateDetails, ) from vllm.transformers_utils.processor import cached_processor_from_config -from vllm.transformers_utils.processors.fireredasr2_processor import ( +from vllm.transformers_utils.processors.fireredasr2 import ( FireRedASR2FeatureExtractor, ) from vllm.utils.tensor_schema import TensorSchema, TensorShape diff --git a/vllm/model_executor/models/funasr.py b/vllm/model_executor/models/funasr.py index fd4e2c06d..ed8009011 100644 --- a/vllm/model_executor/models/funasr.py +++ b/vllm/model_executor/models/funasr.py @@ -50,7 +50,7 @@ from vllm.multimodal.processing import ( PromptUpdate, ) from vllm.transformers_utils.processor import cached_processor_from_config -from vllm.transformers_utils.processors.funasr_processor import FunASRFeatureExtractor +from vllm.transformers_utils.processors.funasr import FunASRFeatureExtractor from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import ( diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index 9190c82f5..1319e2943 100644 --- a/vllm/transformers_utils/processor.py +++ b/vllm/transformers_utils/processor.py @@ -20,7 +20,9 @@ from transformers.video_processing_utils import BaseVideoProcessor from typing_extensions import TypeVar from vllm.logger import init_logger +from vllm.transformers_utils import processors from vllm.transformers_utils.gguf_utils import is_gguf +from vllm.transformers_utils.repo_utils import get_hf_file_to_dict from vllm.transformers_utils.utils import convert_model_repo_to_path from vllm.utils.func_utils import get_allowed_kwarg_only_overrides @@ -139,6 +141,22 @@ def _merge_mm_kwargs( return allowed_kwargs +def get_processor_cls_name_from_config( + processor_name: str, + revision: str | None = "main", +) -> str | None: + config_file = [ + "processor_config.json", + "preprocessor_config.json", + "tokenizer_config.json", + ] + for file in config_file: + config = get_hf_file_to_dict(file, processor_name, revision=revision) + if config and "processor_class" in config: + return config["processor_class"] + return None + + def get_processor( processor_name: str, *args: Any, @@ -152,8 +170,20 @@ def get_processor( revision = "main" try: processor_name = convert_model_repo_to_path(processor_name) + registered_cls_name = get_processor_cls_name_from_config( + processor_name, revision=revision + ) + registered_processor_cls = ( + getattr(processors, registered_cls_name, None) + if registered_cls_name + else None + ) + registered_processor_cls = cast(type[_P] | None, registered_processor_cls) + # Use registered processor class when it's available + # and explicit processor_cls is not set. if isinstance(processor_cls, tuple) or processor_cls == ProcessorMixin: - processor = AutoProcessor.from_pretrained( + _processor_cls = registered_processor_cls or AutoProcessor + processor = _processor_cls.from_pretrained( processor_name, *args, revision=revision, diff --git a/vllm/transformers_utils/processors/__init__.py b/vllm/transformers_utils/processors/__init__.py index 0660a62ea..ff2263f3e 100644 --- a/vllm/transformers_utils/processors/__init__.py +++ b/vllm/transformers_utils/processors/__init__.py @@ -8,16 +8,20 @@ reasons: - There is a need to override the existing processor to support vLLM. """ -from vllm.transformers_utils.processors.bagel import BagelProcessor -from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor -from vllm.transformers_utils.processors.fireredasr2_processor import ( - FireRedASR2Processor, -) -from vllm.transformers_utils.processors.funasr_processor import FunASRProcessor -from vllm.transformers_utils.processors.hunyuan_vl import HunYuanVLProcessor -from vllm.transformers_utils.processors.hunyuan_vl_image import HunYuanVLImageProcessor -from vllm.transformers_utils.processors.ovis import OvisProcessor -from vllm.transformers_utils.processors.ovis2_5 import Ovis2_5Processor +import importlib + +_CLASS_TO_MODULE: dict[str, str] = { + "BagelProcessor": "vllm.transformers_utils.processors.bagel", + "DeepseekVLV2Processor": "vllm.transformers_utils.processors.deepseek_vl2", + "FireRedASR2Processor": "vllm.transformers_utils.processors.fireredasr2", + "FunASRProcessor": "vllm.transformers_utils.processors.funasr", + "HunYuanVLProcessor": "vllm.transformers_utils.processors.hunyuan_vl", + "HunYuanVLImageProcessor": "vllm.transformers_utils.processors.hunyuan_vl_image", + "OvisProcessor": "vllm.transformers_utils.processors.ovis", + "Ovis2_5Processor": "vllm.transformers_utils.processors.ovis2_5", + "Qwen3ASRProcessor": "vllm.transformers_utils.processors.qwen3_asr", +} + __all__ = [ "BagelProcessor", @@ -28,4 +32,18 @@ __all__ = [ "HunYuanVLImageProcessor", "OvisProcessor", "Ovis2_5Processor", + "Qwen3ASRProcessor", ] + + +def __getattr__(name: str): + if name in _CLASS_TO_MODULE: + module_name = _CLASS_TO_MODULE[name] + module = importlib.import_module(module_name) + return getattr(module, name) + + raise AttributeError(f"module 'processors' has no attribute '{name}'") + + +def __dir__(): + return sorted(list(__all__)) diff --git a/vllm/transformers_utils/processors/bagel.py b/vllm/transformers_utils/processors/bagel.py index 09b2e31b3..3226d7b0c 100644 --- a/vllm/transformers_utils/processors/bagel.py +++ b/vllm/transformers_utils/processors/bagel.py @@ -3,7 +3,6 @@ # Copyright 2025 Bytedance Ltd. and/or its affiliates. """BAGEL processor for image and text inputs.""" -from transformers import AutoProcessor from transformers.feature_extraction_utils import BatchFeature from transformers.image_utils import ImageInput from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack @@ -79,6 +78,3 @@ class BagelProcessor(ProcessorMixin): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) - - -AutoProcessor.register("BagelProcessor", BagelProcessor) diff --git a/vllm/transformers_utils/processors/deepseek_ocr.py b/vllm/transformers_utils/processors/deepseek_ocr.py index 77e494836..68a2b1aaa 100644 --- a/vllm/transformers_utils/processors/deepseek_ocr.py +++ b/vllm/transformers_utils/processors/deepseek_ocr.py @@ -8,7 +8,7 @@ from typing import Literal import torch import torchvision.transforms as T from PIL import Image, ImageOps -from transformers import AutoProcessor, BatchFeature, LlamaTokenizerFast +from transformers import BatchFeature, LlamaTokenizerFast from transformers.processing_utils import ProcessorMixin # TODO(Isotr0py): change modes for variants @@ -453,6 +453,3 @@ class DeepseekOCRProcessor(ProcessorMixin): num_image_tokens, image_shapes, ) - - -AutoProcessor.register("DeepseekOCRProcessor", DeepseekOCRProcessor) diff --git a/vllm/transformers_utils/processors/deepseek_vl2.py b/vllm/transformers_utils/processors/deepseek_vl2.py index 5ef258b9b..5a3c986c1 100644 --- a/vllm/transformers_utils/processors/deepseek_vl2.py +++ b/vllm/transformers_utils/processors/deepseek_vl2.py @@ -29,7 +29,7 @@ from typing import Any import torch import torchvision.transforms as T from PIL import Image, ImageOps -from transformers import AutoProcessor, BatchFeature, LlamaTokenizerFast +from transformers import BatchFeature, LlamaTokenizerFast from transformers.processing_utils import ProcessorMixin @@ -401,6 +401,3 @@ class DeepseekVLV2Processor(ProcessorMixin): images_spatial_crop, num_image_tokens, ) - - -AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor) diff --git a/vllm/transformers_utils/processors/fireredasr2_processor.py b/vllm/transformers_utils/processors/fireredasr2.py similarity index 99% rename from vllm/transformers_utils/processors/fireredasr2_processor.py rename to vllm/transformers_utils/processors/fireredasr2.py index 98c99ec39..4bde53015 100644 --- a/vllm/transformers_utils/processors/fireredasr2_processor.py +++ b/vllm/transformers_utils/processors/fireredasr2.py @@ -8,7 +8,6 @@ import torch import torch.nn.functional as F from transformers import ( AutoFeatureExtractor, - AutoProcessor, BatchFeature, ) from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor @@ -345,4 +344,3 @@ class FireRedASR2Processor(ProcessorMixin): AutoFeatureExtractor.register( "FireRedASR2FeatureExtractor", FireRedASR2FeatureExtractor ) -AutoProcessor.register("FireRedASR2Processor", FireRedASR2Processor) diff --git a/vllm/transformers_utils/processors/funasr_processor.py b/vllm/transformers_utils/processors/funasr.py similarity index 99% rename from vllm/transformers_utils/processors/funasr_processor.py rename to vllm/transformers_utils/processors/funasr.py index bb6fe69ac..1ce653c2e 100644 --- a/vllm/transformers_utils/processors/funasr_processor.py +++ b/vllm/transformers_utils/processors/funasr.py @@ -9,7 +9,6 @@ import torchaudio.compliance.kaldi as kaldi from torch.nn.utils.rnn import pad_sequence from transformers import ( AutoFeatureExtractor, - AutoProcessor, BatchFeature, ) from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor @@ -503,4 +502,3 @@ class FunASRProcessor(ProcessorMixin): AutoFeatureExtractor.register("FunASRFeatureExtractor", FunASRFeatureExtractor) -AutoProcessor.register("FunASRProcessor", FunASRProcessor) diff --git a/vllm/transformers_utils/processors/hunyuan_vl.py b/vllm/transformers_utils/processors/hunyuan_vl.py index 924c679e7..2d0e4db97 100644 --- a/vllm/transformers_utils/processors/hunyuan_vl.py +++ b/vllm/transformers_utils/processors/hunyuan_vl.py @@ -5,7 +5,6 @@ import numpy as np import torch -from transformers import AutoProcessor from transformers.feature_extraction_utils import BatchFeature from transformers.image_utils import ImageInput from transformers.processing_utils import ProcessorMixin @@ -225,6 +224,3 @@ def split_image_into_patch_blocks( patches = img.reshape(-1, 3, patch_size, patch_size) return patches - - -AutoProcessor.register("HunYuanVLProcessor", HunYuanVLProcessor) diff --git a/vllm/transformers_utils/processors/ovis.py b/vllm/transformers_utils/processors/ovis.py index bd5de9591..da80f24e7 100644 --- a/vllm/transformers_utils/processors/ovis.py +++ b/vllm/transformers_utils/processors/ovis.py @@ -26,7 +26,7 @@ from functools import cached_property import PIL import torch -from transformers import AutoProcessor, BatchFeature +from transformers import BatchFeature from transformers.image_utils import ImageInput from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from transformers.tokenization_utils_base import PreTokenizedInput, TextInput @@ -453,6 +453,3 @@ class OvisProcessor(ProcessorMixin): dict.fromkeys(tokenizer_input_names + image_processor_input_names) ) return names_from_processor + ["second_per_grid_ts"] - - -AutoProcessor.register("OvisProcessor", OvisProcessor) diff --git a/vllm/transformers_utils/processors/ovis2_5.py b/vllm/transformers_utils/processors/ovis2_5.py index 6b6fdcace..46ffd6a1e 100644 --- a/vllm/transformers_utils/processors/ovis2_5.py +++ b/vllm/transformers_utils/processors/ovis2_5.py @@ -6,7 +6,7 @@ from functools import cached_property import numpy as np import PIL import torch -from transformers import AutoProcessor, BatchFeature +from transformers import BatchFeature from transformers.image_utils import ImageInput from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from transformers.tokenization_utils_base import PreTokenizedInput, TextInput @@ -476,6 +476,3 @@ class Ovis2_5Processor(ProcessorMixin): visual_placeholders, torch.tensor([[grid_t, grid_h, grid_w]]), ) - - -AutoProcessor.register("Ovis2_5Processor", Ovis2_5Processor) diff --git a/vllm/transformers_utils/processors/qwen3_asr.py b/vllm/transformers_utils/processors/qwen3_asr.py index 677326e25..55d385379 100644 --- a/vllm/transformers_utils/processors/qwen3_asr.py +++ b/vllm/transformers_utils/processors/qwen3_asr.py @@ -227,6 +227,3 @@ class Qwen3ASRProcessor(ProcessorMixin): + ["feature_attention_mask"] ) ) - - -AutoProcessor.register("Qwen3ASRProcessor", Qwen3ASRProcessor)