EVS Support (Video tokens pruning) (#22980)

Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com> Signed-off-by: Eugene Khvedchenya <ekhvedchenya@gmail.com> Co-authored-by: Roger Wang <hey@rogerw.io>
2025-09-26 06:54:54 +03:00
parent 983056e456
commit 392edee34a
8 changed files with 783 additions and 39 deletions
--- a/vllm/config/multimodal.py
+++ b/vllm/config/multimodal.py
@@ -78,6 +78,11 @@ class MultiModalConfig:
    This reduces engine startup time but shifts the responsibility to users for
    estimating the peak memory usage of the activation of multimodal encoder and
    embedding cache."""
+    video_pruning_rate: Optional[float] = None
+    """Sets the pruning rate for video token pruning via Efficient Video
+    Sampling (EVS). The value must lie in the range [0, 1) and determines
+    the fraction of media tokens to prune from each video.
+    """

    def compute_hash(self) -> str:
        """
@@ -118,3 +123,7 @@ class MultiModalConfig:
        """
        kwargs = self.mm_processor_kwargs or {}
        return kwargs | dict(inference_kwargs)
+
+    def is_multimodal_pruning_enabled(self) -> bool:
+        """Return whether multimodal token pruning is enabled.
+
+        Pruning counts as enabled only when `video_pruning_rate` is set
+        (not None) and strictly greater than zero; an unset rate or a rate
+        of 0 both yield False.
+        """
+        return (self.video_pruning_rate is not None
+                and self.video_pruning_rate > 0)