[Misc] Make PlaceholderRange.get_num_embeds a method (#34035)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-07 13:30:17 +08:00
parent bc32444b23
commit 48312e579a
9 changed files with 11 additions and 12 deletions
--- a/tests/models/multimodal/processing/test_mllama4.py
+++ b/tests/models/multimodal/processing/test_mllama4.py
@@ -48,7 +48,7 @@ def test_profiling(model_id: str, max_model_len: int):
    )  # image start, image, image end

    assert total_num_patches == sum(
-        item.get_num_embeds for item in mm_inputs["mm_placeholders"]["image"]
+        item.get_num_embeds() for item in mm_inputs["mm_placeholders"]["image"]
    )
    assert total_tokens == sum(
        placeholder.length for placeholder in mm_inputs["mm_placeholders"]["image"]
--- a/tests/multimodal/test_inputs.py
+++ b/tests/multimodal/test_inputs.py
@@ -19,7 +19,7 @@ from vllm.multimodal.inputs import PlaceholderRange
 def test_placeholder_range_get_num_embeds(is_embed, expected):
    length = len(is_embed) if is_embed is not None else 5
    pr = PlaceholderRange(offset=0, length=length, is_embed=is_embed)
-    assert pr.get_num_embeds == expected
+    assert pr.get_num_embeds() == expected


@pytest.mark.parametrize(
--- a/tests/v1/core/test_encoder_cache_manager.py
+++ b/tests/v1/core/test_encoder_cache_manager.py
@@ -187,7 +187,7 @@ def test_schedule_request_multi_images_respect_compute_limit():
 def test_encoder_cache_with_is_embed_mask():
    class MockRequestWithMask(MockRequest):
        def get_num_encoder_embeds(self, input_id: int) -> int:
-            return self.mm_features[input_id].mm_position.get_num_embeds
+            return self.mm_features[input_id].mm_position.get_num_embeds()

    is_embed = torch.zeros(100, dtype=torch.bool)
    is_embed[torch.tensor([5, 15, 25, 35, 45, 55, 65, 75])] = True
@@ -207,7 +207,7 @@ def test_encoder_cache_with_is_embed_mask():
    assert "img1" in manager.cached

    old_size = 100
-    new_size = request.mm_features[0].mm_position.get_num_embeds
+    new_size = request.mm_features[0].mm_position.get_num_embeds()
    assert new_size == 8
    savings_ratio = old_size / new_size
    assert savings_ratio == 12.5
@@ -216,7 +216,7 @@ def test_encoder_cache_with_is_embed_mask():
 def test_encoder_cache_mask_based_retrieval():
    class MockRequestWithMask(MockRequest):
        def get_num_encoder_embeds(self, input_id: int) -> int:
-            return self.mm_features[input_id].mm_position.get_num_embeds
+            return self.mm_features[input_id].mm_position.get_num_embeds()

    is_embed = torch.tensor(
        [False, False, True, True, False, True, True, True, False, False]
@@ -233,7 +233,7 @@ def test_encoder_cache_mask_based_retrieval():
    manager = EncoderCacheManager(cache_size=50)
    manager.allocate(request, 0)

-    assert request.mm_features[0].mm_position.get_num_embeds == 5
+    assert request.mm_features[0].mm_position.get_num_embeds() == 5

    start_idx = 2
    end_idx = 8
--- a/vllm/multimodal/budget.py
+++ b/vllm/multimodal/budget.py
@@ -33,7 +33,7 @@ def get_mm_max_toks_per_item(
    )

    return {
-        modality: sum(item.get_num_embeds for item in placeholders)
+        modality: sum(item.get_num_embeds() for item in placeholders)
        for modality, placeholders in mm_inputs["mm_placeholders"].items()
    }

--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -199,7 +199,6 @@ class PlaceholderRange:
    def embeds_cumsum(self) -> torch.Tensor | None:
        return None if self.is_embed is None else self.is_embed.cumsum(dim=0)

-    @cached_property
    def get_num_embeds(self) -> int:
        if self.embeds_cumsum is None:
            return self.length
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -1100,7 +1100,7 @@ class Scheduler(SchedulerInterface):
        for i, mm_feature in enumerate(mm_features):
            start_pos = mm_feature.mm_position.offset
            num_encoder_tokens = mm_feature.mm_position.length
-            num_encoder_embeds = mm_feature.mm_position.get_num_embeds
+            num_encoder_embeds = mm_feature.mm_position.get_num_embeds()
            item_identifier = mm_feature.identifier

            # The encoder output is needed if the two ranges overlap:
--- a/vllm/v1/engine/input_processor.py
+++ b/vllm/v1/engine/input_processor.py
@@ -786,7 +786,7 @@ class InputProcessor:
            decoder_mm_positions = prompt_inputs["mm_placeholders"]
            for modality, mm_positions in decoder_mm_positions.items():
                for mm_position in mm_positions:
-                    embed_length = mm_position.get_num_embeds
+                    embed_length = mm_position.get_num_embeds()
                    if embed_length > self.mm_encoder_cache_size:
                        raise ValueError(
                            f"The {prompt_type} prompt contains a(n) {modality} item "
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -260,7 +260,7 @@ class Request:

    def get_num_encoder_embeds(self, input_id: int) -> int:
        assert input_id < len(self.mm_features)
-        return self.mm_features[input_id].mm_position.get_num_embeds
+        return self.mm_features[input_id].mm_position.get_num_embeds()

    def record_event(
        self,
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2326,7 +2326,7 @@ class GPUModelRunner(

                # Prefer pos_info.get_num_embeds to count precise MM embedding tokens.
                num_tokens = self.model.get_num_mm_encoder_tokens(  # type: ignore[attr-defined]
-                    pos_info.get_num_embeds
+                    pos_info.get_num_embeds()
                )
                prompt_lora_mapping.append(lora_id)
                token_lora_mapping.extend([lora_id] * num_tokens)