[Bugfix] Fix InternS1 video processing after Transformers v4.56 (#25644)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-25 22:46:04 +08:00
parent 532a6cfccb
commit 03858e6d1c
4 changed files with 68 additions and 3 deletions
--- a/vllm/transformers_utils/processor.py
+++ b/vllm/transformers_utils/processor.py
@@ -5,10 +5,11 @@ from functools import lru_cache
 from typing import TYPE_CHECKING, Any, Optional, Union, cast

 from transformers import (AutoFeatureExtractor, AutoImageProcessor,
-                          AutoProcessor)
+                          AutoProcessor, AutoVideoProcessor)
 from transformers.feature_extraction_utils import FeatureExtractionMixin
 from transformers.image_processing_utils import BaseImageProcessor
 from transformers.processing_utils import ProcessorMixin
+from transformers.video_processing_utils import BaseVideoProcessor
 from typing_extensions import TypeVar

 from vllm.utils import get_allowed_kwarg_only_overrides
@@ -17,6 +18,7 @@ if TYPE_CHECKING:
    from vllm.config import ModelConfig

 _P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
+_V = TypeVar("_V", bound=BaseVideoProcessor, default=BaseVideoProcessor)


 class HashableDict(dict):
@@ -243,3 +245,55 @@ def cached_image_processor_from_config(
        trust_remote_code=model_config.trust_remote_code,
        **_merge_mm_kwargs(model_config, AutoImageProcessor, **kwargs),
    )
+
+
+def get_video_processor(
+    processor_name: str,
+    *args: Any,
+    revision: Optional[str] = None,
+    trust_remote_code: bool = False,
+    processor_cls_overrides: Optional[type[_V]] = None,
+    **kwargs: Any,
+):
+    """Load a HuggingFace video processor via `from_pretrained`.
+
+    A `ValueError` from loading becomes a `RuntimeError` that suggests
+    `trust_remote_code`, unless that option is already enabled.
+    """
+    try:
+        loader = processor_cls_overrides or AutoVideoProcessor
+        loaded = loader.from_pretrained(processor_name,
+                                        *args,
+                                        revision=revision,
+                                        trust_remote_code=trust_remote_code,
+                                        **kwargs)
+    except ValueError as e:
+        if trust_remote_code:
+            raise e
+        # The failure may be a processor class that is missing or not yet
+        # imported (not reported separately), so suggest trusting remote code.
+        raise RuntimeError(
+            "Failed to load the video processor. If the video processor is "
+            "a custom processor not yet available in the HuggingFace "
+            "transformers library, consider setting "
+            "`trust_remote_code=True` in LLM or using the "
+            "`--trust-remote-code` flag in the CLI.") from e
+
+    return cast(BaseVideoProcessor, loaded)
+
+
+cached_get_video_processor = lru_cache(maxsize=128)(get_video_processor)
+
+
+def cached_video_processor_from_config(
+        model_config: "ModelConfig",
+        processor_cls: Optional[type[_V]] = None,
+        **kwargs: Any):
+    """Return the cached video processor matching `model_config`."""
+    mm_kwargs = _merge_mm_kwargs(model_config, AutoVideoProcessor, **kwargs)
+    return cached_get_video_processor(
+        model_config.model,
+        revision=model_config.revision,
+        trust_remote_code=model_config.trust_remote_code,
+        processor_cls_overrides=processor_cls,  # type: ignore[arg-type]
+        **mm_kwargs)