[Misc] Make PlaceholderRange.get_num_embeds a method (#34035)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -48,7 +48,7 @@ def test_profiling(model_id: str, max_model_len: int):
|
||||
) # image start, image, image end
|
||||
|
||||
assert total_num_patches == sum(
|
||||
item.get_num_embeds for item in mm_inputs["mm_placeholders"]["image"]
|
||||
item.get_num_embeds() for item in mm_inputs["mm_placeholders"]["image"]
|
||||
)
|
||||
assert total_tokens == sum(
|
||||
placeholder.length for placeholder in mm_inputs["mm_placeholders"]["image"]
|
||||
|
||||
@@ -19,7 +19,7 @@ from vllm.multimodal.inputs import PlaceholderRange
|
||||
def test_placeholder_range_get_num_embeds(is_embed, expected):
|
||||
length = len(is_embed) if is_embed is not None else 5
|
||||
pr = PlaceholderRange(offset=0, length=length, is_embed=is_embed)
|
||||
assert pr.get_num_embeds == expected
|
||||
assert pr.get_num_embeds() == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
||||
@@ -187,7 +187,7 @@ def test_schedule_request_multi_images_respect_compute_limit():
|
||||
def test_encoder_cache_with_is_embed_mask():
|
||||
class MockRequestWithMask(MockRequest):
|
||||
def get_num_encoder_embeds(self, input_id: int) -> int:
|
||||
return self.mm_features[input_id].mm_position.get_num_embeds
|
||||
return self.mm_features[input_id].mm_position.get_num_embeds()
|
||||
|
||||
is_embed = torch.zeros(100, dtype=torch.bool)
|
||||
is_embed[torch.tensor([5, 15, 25, 35, 45, 55, 65, 75])] = True
|
||||
@@ -207,7 +207,7 @@ def test_encoder_cache_with_is_embed_mask():
|
||||
assert "img1" in manager.cached
|
||||
|
||||
old_size = 100
|
||||
new_size = request.mm_features[0].mm_position.get_num_embeds
|
||||
new_size = request.mm_features[0].mm_position.get_num_embeds()
|
||||
assert new_size == 8
|
||||
savings_ratio = old_size / new_size
|
||||
assert savings_ratio == 12.5
|
||||
@@ -216,7 +216,7 @@ def test_encoder_cache_with_is_embed_mask():
|
||||
def test_encoder_cache_mask_based_retrieval():
|
||||
class MockRequestWithMask(MockRequest):
|
||||
def get_num_encoder_embeds(self, input_id: int) -> int:
|
||||
return self.mm_features[input_id].mm_position.get_num_embeds
|
||||
return self.mm_features[input_id].mm_position.get_num_embeds()
|
||||
|
||||
is_embed = torch.tensor(
|
||||
[False, False, True, True, False, True, True, True, False, False]
|
||||
@@ -233,7 +233,7 @@ def test_encoder_cache_mask_based_retrieval():
|
||||
manager = EncoderCacheManager(cache_size=50)
|
||||
manager.allocate(request, 0)
|
||||
|
||||
assert request.mm_features[0].mm_position.get_num_embeds == 5
|
||||
assert request.mm_features[0].mm_position.get_num_embeds() == 5
|
||||
|
||||
start_idx = 2
|
||||
end_idx = 8
|
||||
|
||||
@@ -33,7 +33,7 @@ def get_mm_max_toks_per_item(
|
||||
)
|
||||
|
||||
return {
|
||||
modality: sum(item.get_num_embeds for item in placeholders)
|
||||
modality: sum(item.get_num_embeds() for item in placeholders)
|
||||
for modality, placeholders in mm_inputs["mm_placeholders"].items()
|
||||
}
|
||||
|
||||
|
||||
@@ -199,7 +199,6 @@ class PlaceholderRange:
|
||||
def embeds_cumsum(self) -> torch.Tensor | None:
|
||||
return None if self.is_embed is None else self.is_embed.cumsum(dim=0)
|
||||
|
||||
@cached_property
|
||||
def get_num_embeds(self) -> int:
|
||||
if self.embeds_cumsum is None:
|
||||
return self.length
|
||||
|
||||
@@ -1100,7 +1100,7 @@ class Scheduler(SchedulerInterface):
|
||||
for i, mm_feature in enumerate(mm_features):
|
||||
start_pos = mm_feature.mm_position.offset
|
||||
num_encoder_tokens = mm_feature.mm_position.length
|
||||
num_encoder_embeds = mm_feature.mm_position.get_num_embeds
|
||||
num_encoder_embeds = mm_feature.mm_position.get_num_embeds()
|
||||
item_identifier = mm_feature.identifier
|
||||
|
||||
# The encoder output is needed if the two ranges overlap:
|
||||
|
||||
@@ -786,7 +786,7 @@ class InputProcessor:
|
||||
decoder_mm_positions = prompt_inputs["mm_placeholders"]
|
||||
for modality, mm_positions in decoder_mm_positions.items():
|
||||
for mm_position in mm_positions:
|
||||
embed_length = mm_position.get_num_embeds
|
||||
embed_length = mm_position.get_num_embeds()
|
||||
if embed_length > self.mm_encoder_cache_size:
|
||||
raise ValueError(
|
||||
f"The {prompt_type} prompt contains a(n) {modality} item "
|
||||
|
||||
@@ -260,7 +260,7 @@ class Request:
|
||||
|
||||
def get_num_encoder_embeds(self, input_id: int) -> int:
|
||||
assert input_id < len(self.mm_features)
|
||||
return self.mm_features[input_id].mm_position.get_num_embeds
|
||||
return self.mm_features[input_id].mm_position.get_num_embeds()
|
||||
|
||||
def record_event(
|
||||
self,
|
||||
|
||||
@@ -2326,7 +2326,7 @@ class GPUModelRunner(
|
||||
|
||||
# Prefer pos_info.get_num_embeds() to count precise MM embedding tokens.
|
||||
num_tokens = self.model.get_num_mm_encoder_tokens( # type: ignore[attr-defined]
|
||||
pos_info.get_num_embeds
|
||||
pos_info.get_num_embeds()
|
||||
)
|
||||
prompt_lora_mapping.append(lora_id)
|
||||
token_lora_mapping.extend([lora_id] * num_tokens)
|
||||
|
||||
Reference in New Issue
Block a user