diff --git a/tests/models/multimodal/processing/test_mllama4.py b/tests/models/multimodal/processing/test_mllama4.py
index 1b9b00732..e5ba6ae24 100644
--- a/tests/models/multimodal/processing/test_mllama4.py
+++ b/tests/models/multimodal/processing/test_mllama4.py
@@ -48,7 +48,7 @@ def test_profiling(model_id: str, max_model_len: int):
     )  # image start, image, image end
     assert total_num_patches == sum(
-        item.get_num_embeds for item in mm_inputs["mm_placeholders"]["image"]
+        item.get_num_embeds() for item in mm_inputs["mm_placeholders"]["image"]
     )
     assert total_tokens == sum(
         placeholder.length for placeholder in mm_inputs["mm_placeholders"]["image"]
diff --git a/tests/multimodal/test_inputs.py b/tests/multimodal/test_inputs.py
index 7378c1493..d6bdf76a6 100644
--- a/tests/multimodal/test_inputs.py
+++ b/tests/multimodal/test_inputs.py
@@ -19,7 +19,7 @@ from vllm.multimodal.inputs import PlaceholderRange
 def test_placeholder_range_get_num_embeds(is_embed, expected):
     length = len(is_embed) if is_embed is not None else 5
     pr = PlaceholderRange(offset=0, length=length, is_embed=is_embed)
-    assert pr.get_num_embeds == expected
+    assert pr.get_num_embeds() == expected
 
 
 @pytest.mark.parametrize(
diff --git a/tests/v1/core/test_encoder_cache_manager.py b/tests/v1/core/test_encoder_cache_manager.py
index f82c0070c..283b74624 100644
--- a/tests/v1/core/test_encoder_cache_manager.py
+++ b/tests/v1/core/test_encoder_cache_manager.py
@@ -187,7 +187,7 @@ def test_schedule_request_multi_images_respect_compute_limit():
 def test_encoder_cache_with_is_embed_mask():
     class MockRequestWithMask(MockRequest):
         def get_num_encoder_embeds(self, input_id: int) -> int:
-            return self.mm_features[input_id].mm_position.get_num_embeds
+            return self.mm_features[input_id].mm_position.get_num_embeds()
 
     is_embed = torch.zeros(100, dtype=torch.bool)
     is_embed[torch.tensor([5, 15, 25, 35, 45, 55, 65, 75])] = True
@@ -207,7 +207,7 @@ def test_encoder_cache_with_is_embed_mask():
     assert "img1" in manager.cached
 
     old_size = 100
-    new_size = request.mm_features[0].mm_position.get_num_embeds
+    new_size = request.mm_features[0].mm_position.get_num_embeds()
     assert new_size == 8
     savings_ratio = old_size / new_size
     assert savings_ratio == 12.5
@@ -216,7 +216,7 @@ def test_encoder_cache_mask_based_retrieval():
     class MockRequestWithMask(MockRequest):
         def get_num_encoder_embeds(self, input_id: int) -> int:
-            return self.mm_features[input_id].mm_position.get_num_embeds
+            return self.mm_features[input_id].mm_position.get_num_embeds()
 
     is_embed = torch.tensor(
         [False, False, True, True, False, True, True, True, False, False]
     )
@@ -233,7 +233,7 @@ def test_encoder_cache_mask_based_retrieval():
     manager = EncoderCacheManager(cache_size=50)
     manager.allocate(request, 0)
 
-    assert request.mm_features[0].mm_position.get_num_embeds == 5
+    assert request.mm_features[0].mm_position.get_num_embeds() == 5
 
     start_idx = 2
     end_idx = 8
diff --git a/vllm/multimodal/budget.py b/vllm/multimodal/budget.py
index 1380ec1ba..1dddc82b1 100644
--- a/vllm/multimodal/budget.py
+++ b/vllm/multimodal/budget.py
@@ -33,7 +33,7 @@ def get_mm_max_toks_per_item(
     )
 
     return {
-        modality: sum(item.get_num_embeds for item in placeholders)
+        modality: sum(item.get_num_embeds() for item in placeholders)
         for modality, placeholders in mm_inputs["mm_placeholders"].items()
     }
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index a3f8b21c2..221baba6d 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -199,7 +199,6 @@ class PlaceholderRange:
     def embeds_cumsum(self) -> torch.Tensor | None:
         return None if self.is_embed is None else self.is_embed.cumsum(dim=0)
 
-    @cached_property
     def get_num_embeds(self) -> int:
         if self.embeds_cumsum is None:
             return self.length
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 9f0643e4f..0b8832c16 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -1100,7 +1100,7 @@ class Scheduler(SchedulerInterface):
         for i, mm_feature in enumerate(mm_features):
             start_pos = mm_feature.mm_position.offset
             num_encoder_tokens = mm_feature.mm_position.length
-            num_encoder_embeds = mm_feature.mm_position.get_num_embeds
+            num_encoder_embeds = mm_feature.mm_position.get_num_embeds()
             item_identifier = mm_feature.identifier
 
             # The encoder output is needed if the two ranges overlap:
diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py
index 64dc1831b..98c3a07d1 100644
--- a/vllm/v1/engine/input_processor.py
+++ b/vllm/v1/engine/input_processor.py
@@ -786,7 +786,7 @@ class InputProcessor:
         decoder_mm_positions = prompt_inputs["mm_placeholders"]
         for modality, mm_positions in decoder_mm_positions.items():
             for mm_position in mm_positions:
-                embed_length = mm_position.get_num_embeds
+                embed_length = mm_position.get_num_embeds()
                 if embed_length > self.mm_encoder_cache_size:
                     raise ValueError(
                         f"The {prompt_type} prompt contains a(n) {modality} item "
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index 8e3684d3c..3b829875f 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -260,7 +260,7 @@ class Request:
 
     def get_num_encoder_embeds(self, input_id: int) -> int:
         assert input_id < len(self.mm_features)
-        return self.mm_features[input_id].mm_position.get_num_embeds
+        return self.mm_features[input_id].mm_position.get_num_embeds()
 
     def record_event(
         self,
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index ec36e1591..2ca4866d9 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2326,7 +2326,7 @@ class GPUModelRunner(
                 # Prefer pos_info.get_num_embeds to count precise MM embedding tokens.
                 num_tokens = self.model.get_num_mm_encoder_tokens(  # type: ignore[attr-defined]
-                    pos_info.get_num_embeds
+                    pos_info.get_num_embeds()
                 )
                 prompt_lora_mapping.append(lora_id)
                 token_lora_mapping.extend([lora_id] * num_tokens)