[VLM] Remove BaseProcessingInfo.get_mm_max_tokens_per_item (#16408)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -408,13 +408,6 @@ class AriaProcessingInfo(BaseProcessingInfo):
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None}
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
return {"image": self.get_num_image_tokens()}
|
||||
|
||||
def get_num_image_tokens(self) -> int:
|
||||
hf_config = self.get_hf_config()
|
||||
return max(hf_config.projector_patch_to_query_dict.values())
|
||||
|
||||
@@ -117,31 +117,6 @@ class AyaVisionProcessingInfo(BaseProcessingInfo):
|
||||
def get_image_processor(self) -> GotOcr2ImageProcessor:
|
||||
return self.get_hf_processor().image_processor
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
return {"image": self.get_max_image_tokens()}
|
||||
|
||||
def get_max_image_tokens(self) -> int:
|
||||
hf_processor = self.get_hf_processor()
|
||||
image_processor = hf_processor.image_processor
|
||||
|
||||
image_size = self.get_image_size_with_most_features()
|
||||
num_patches = self.get_num_patches(
|
||||
image_width=image_size.width,
|
||||
image_height=image_size.height,
|
||||
size=image_processor.size,
|
||||
min_patches=image_processor.min_patches,
|
||||
max_patches=image_processor.max_patches,
|
||||
)
|
||||
|
||||
img_patches_per_tile = (hf_processor.img_size //
|
||||
hf_processor.patch_size)**2
|
||||
|
||||
return num_patches * img_patches_per_tile
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None}
|
||||
|
||||
|
||||
@@ -406,13 +406,6 @@ class Blip2ProcessingInfo(BaseProcessingInfo):
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": 1}
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
return {"image": self.get_num_image_tokens()}
|
||||
|
||||
def get_num_image_tokens(self) -> int:
|
||||
hf_config = self.get_hf_config()
|
||||
return hf_config.num_query_tokens
|
||||
|
||||
@@ -64,13 +64,6 @@ class ChameleonProcessingInfo(BaseProcessingInfo):
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": 1}
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
return {"image": self.get_num_image_tokens()}
|
||||
|
||||
def get_num_image_tokens(self) -> int:
|
||||
processor = self.get_hf_processor()
|
||||
return processor.image_seq_length
|
||||
|
||||
@@ -30,9 +30,6 @@ class CLIPEncoderInfo(VisionEncoderInfo[CLIPVisionConfig]):
|
||||
) -> int:
|
||||
return self.get_patch_grid_length()**2 + 1
|
||||
|
||||
def get_max_image_tokens(self) -> int:
|
||||
return self.get_patch_grid_length()**2 + 1
|
||||
|
||||
def get_image_size(self) -> int:
|
||||
return self.vision_config.image_size
|
||||
|
||||
|
||||
@@ -168,20 +168,6 @@ class DeepseekVL2ProcessingInfo(BaseProcessingInfo):
|
||||
image_width=x[1], image_height=x[0]))
|
||||
return ImageSize(width=width, height=height)
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
max_image_size = self.get_image_size_with_most_features()
|
||||
max_image_tokens = self.get_num_image_tokens(
|
||||
image_height=max_image_size.height,
|
||||
image_width=max_image_size.width,
|
||||
cropping=num_images <= 2)
|
||||
|
||||
return {"image": max_image_tokens}
|
||||
|
||||
|
||||
class DeepseekVL2DummyInputsBuilder(
|
||||
BaseDummyInputsBuilder[DeepseekVL2ProcessingInfo]):
|
||||
|
||||
@@ -764,17 +764,10 @@ class Florence2ProcessingInfo(BaseProcessingInfo):
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": 1}
|
||||
|
||||
def get_max_image_tokens(self) -> int:
|
||||
def get_num_image_tokens(self) -> int:
|
||||
processor_config = self.ctx.get_hf_image_processor_config()
|
||||
return processor_config["image_seq_length"]
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
return {"image": self.get_max_image_tokens()}
|
||||
|
||||
|
||||
class Florence2DummyInputsBuilder(
|
||||
BaseDummyInputsBuilder[Florence2ProcessingInfo]):
|
||||
@@ -871,7 +864,7 @@ class Florence2MultiModalProcessor(
|
||||
) -> Sequence[PromptUpdate]:
|
||||
hf_config = self.info.get_hf_config()
|
||||
pad_token_id = hf_config.pad_token_id
|
||||
num_image_tokens = self.info.get_max_image_tokens()
|
||||
num_image_tokens = self.info.get_num_image_tokens()
|
||||
image_tokens = [pad_token_id] * num_image_tokens
|
||||
|
||||
return [
|
||||
|
||||
@@ -80,13 +80,6 @@ class FuyuProcessingInfo(BaseProcessingInfo):
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": 1}
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
return {"image": self.get_max_image_tokens()}
|
||||
|
||||
def get_image_feature_grid_size(
|
||||
self,
|
||||
*,
|
||||
@@ -129,14 +122,6 @@ class FuyuProcessingInfo(BaseProcessingInfo):
|
||||
return ImageSize(width=image_processor.size["width"],
|
||||
height=image_processor.size["height"])
|
||||
|
||||
def get_max_image_tokens(self) -> int:
|
||||
target_width, target_height = self.get_image_size_with_most_features()
|
||||
|
||||
return self.get_num_image_tokens(
|
||||
image_width=target_width,
|
||||
image_height=target_height,
|
||||
)
|
||||
|
||||
|
||||
class FuyuDummyInputsBuilder(BaseDummyInputsBuilder[FuyuProcessingInfo]):
|
||||
|
||||
|
||||
@@ -68,13 +68,6 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None}
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
return {"image": self.get_max_image_tokens()}
|
||||
|
||||
def _resolve_image_kwargs(
|
||||
self,
|
||||
processor: Gemma3Processor,
|
||||
@@ -228,15 +221,6 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
|
||||
# Result in the max possible feature size (h:w = max_num_crops:1)
|
||||
return ImageSize(height=50 * max_num_crops, width=50)
|
||||
|
||||
def get_max_image_tokens(self) -> int:
|
||||
target_width, target_height = self.get_image_size_with_most_features()
|
||||
|
||||
return self.get_num_image_tokens(
|
||||
image_width=target_width,
|
||||
image_height=target_height,
|
||||
processor=None,
|
||||
)
|
||||
|
||||
|
||||
class Gemma3DummyInputsBuilder(BaseDummyInputsBuilder[Gemma3ProcessingInfo]):
|
||||
|
||||
|
||||
@@ -431,13 +431,6 @@ class GLM4VProcessingInfo(BaseProcessingInfo):
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": 1}
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
return {"image": self.get_num_image_feature_tokens()}
|
||||
|
||||
def get_num_image_tokens(self) -> int:
|
||||
hf_config = self.get_hf_config()
|
||||
vision_config = hf_config.vision_config
|
||||
|
||||
@@ -412,19 +412,6 @@ class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
max_tokens_one_image = self.get_max_image_tokens(use_msac=None)
|
||||
if mm_counts.get("image", 0) <= 1:
|
||||
max_tokens_per_image = max_tokens_one_image
|
||||
else:
|
||||
max_tokens_per_image = self.get_max_image_tokens(use_msac=False)
|
||||
|
||||
return {"image": max_tokens_per_image}
|
||||
|
||||
def get_num_image_tokens(
|
||||
self,
|
||||
*,
|
||||
@@ -442,16 +429,6 @@ class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
|
||||
use_msac=use_msac,
|
||||
)
|
||||
|
||||
def get_max_image_tokens(self, use_msac: Optional[bool] = None) -> int:
|
||||
target_width, target_height = self.get_image_size_with_most_features()
|
||||
|
||||
return self.get_num_image_tokens(
|
||||
image_width=target_width,
|
||||
image_height=target_height,
|
||||
processor=None,
|
||||
use_msac=use_msac,
|
||||
)
|
||||
|
||||
|
||||
class H2OVLMultiModalProcessor(InternVLMultiModalProcessor[H2OVLProcessingInfo]
|
||||
):
|
||||
|
||||
@@ -97,13 +97,6 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None}
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
return {"image": self.get_max_image_tokens()}
|
||||
|
||||
def _resize_output_size(self,
|
||||
*,
|
||||
height: int,
|
||||
@@ -287,15 +280,6 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
|
||||
height=image_processor.size["longest_edge"],
|
||||
)
|
||||
|
||||
def get_max_image_tokens(self) -> int:
|
||||
target_width, target_height = self.get_image_size_with_most_features()
|
||||
|
||||
return self.get_num_image_tokens(
|
||||
image_width=target_width,
|
||||
image_height=target_height,
|
||||
processor=None,
|
||||
)
|
||||
|
||||
|
||||
class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo]
|
||||
):
|
||||
|
||||
@@ -458,13 +458,6 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None}
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
return {"image": self.get_max_image_tokens()}
|
||||
|
||||
def get_num_image_tokens(
|
||||
self,
|
||||
*,
|
||||
@@ -480,15 +473,6 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
|
||||
image_height=image_height,
|
||||
)
|
||||
|
||||
def get_max_image_tokens(self) -> int:
|
||||
target_width, target_height = self.get_image_size_with_most_features()
|
||||
|
||||
return self.get_num_image_tokens(
|
||||
image_width=target_width,
|
||||
image_height=target_height,
|
||||
processor=None,
|
||||
)
|
||||
|
||||
def get_image_size_with_most_features(self) -> ImageSize:
|
||||
processor = self.get_hf_processor()
|
||||
|
||||
|
||||
@@ -137,13 +137,6 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo):
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None}
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
return {"image": self.get_max_image_tokens()}
|
||||
|
||||
def _apply_feature_select_strategy(
|
||||
self,
|
||||
strategy: str,
|
||||
|
||||
@@ -61,22 +61,6 @@ class LlavaNextVideoProcessingInfo(BaseProcessingInfo):
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"video": 1}
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
target_width, target_height = self.get_image_size_with_most_features()
|
||||
|
||||
max_video_tokens = self.get_num_video_tokens(
|
||||
image_width=target_width,
|
||||
image_height=target_height,
|
||||
num_frames=self.get_num_frames_with_most_features(
|
||||
seq_len, mm_counts),
|
||||
)
|
||||
|
||||
return {"video": max_video_tokens}
|
||||
|
||||
def get_image_size_with_most_features(self) -> ImageSize:
|
||||
vision_encoder_info = self.get_vision_encoder_info()
|
||||
width = height = vision_encoder_info.get_image_size()
|
||||
|
||||
@@ -101,16 +101,6 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo):
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None, "video": None}
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
return {
|
||||
"image": self.get_max_image_tokens(),
|
||||
"video": self.get_max_video_tokens(seq_len, mm_counts),
|
||||
}
|
||||
|
||||
# Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86
|
||||
# with additional logic afterwards taken from LlavaOnevisionProcessor
|
||||
def _get_num_unpadded_features(
|
||||
|
||||
@@ -142,17 +142,6 @@ class MiniCPMOProcessingInfo(MiniCPMVProcessingInfo):
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {**super().get_supported_mm_limits(), "audio": None}
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
return {
|
||||
**super().get_mm_max_tokens_per_item(seq_len, mm_counts),
|
||||
"audio":
|
||||
self.get_max_audio_tokens(),
|
||||
}
|
||||
|
||||
def get_audio_placeholder(
|
||||
self,
|
||||
audio_lens: int,
|
||||
|
||||
@@ -346,18 +346,6 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo):
|
||||
|
||||
return mm_limits
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
mm_max_tokens = {"image": self.get_max_image_tokens()}
|
||||
if self.get_model_version() == (2, 6):
|
||||
mm_max_tokens["video"] = self.get_max_video_tokens(
|
||||
seq_len, mm_counts)
|
||||
|
||||
return mm_max_tokens
|
||||
|
||||
def get_slice_image_placeholder(
|
||||
self,
|
||||
image_size: ImageSize,
|
||||
|
||||
@@ -162,13 +162,6 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo):
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None}
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
return {"image": self.get_max_image_tokens()}
|
||||
|
||||
def get_num_image_tokens(
|
||||
self,
|
||||
*,
|
||||
@@ -186,14 +179,6 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo):
|
||||
width = height = vision_encoder_info.get_image_size()
|
||||
return ImageSize(width=width, height=height)
|
||||
|
||||
def get_max_image_tokens(self) -> int:
|
||||
target_width, target_height = self.get_image_size_with_most_features()
|
||||
|
||||
return self.get_num_image_tokens(
|
||||
image_width=target_width,
|
||||
image_height=target_height,
|
||||
)
|
||||
|
||||
|
||||
_I = TypeVar("_I", bound=BaseLlavaProcessingInfo)
|
||||
|
||||
|
||||
@@ -106,16 +106,6 @@ class MllamaProcessingInfo(BaseProcessingInfo):
|
||||
image_size = self.get_hf_config().vision_config.image_size
|
||||
return calc_token_per_chunk(image_size)
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
vision_config = self.get_hf_config().vision_config
|
||||
token_per_chunk = self.get_token_per_chunk_from_config()
|
||||
mm_max_tokens = vision_config.max_num_tiles * token_per_chunk
|
||||
return {"image": mm_max_tokens}
|
||||
|
||||
def get_num_tiles_per_image(self, image_height: int,
|
||||
image_width: int) -> int:
|
||||
vision_config = self.get_hf_config().vision_config
|
||||
|
||||
@@ -498,17 +498,6 @@ class Mllama4ProcessingInfo(BaseProcessingInfo):
|
||||
image_processor = self.get_hf_processor().image_processor
|
||||
return image_processor.max_patches
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
vision_config = self.get_hf_config().vision_config
|
||||
patch_per_chunk = self.get_patch_per_chunk(vision_config)
|
||||
num_patches = self.get_max_num_tiles() + 1
|
||||
|
||||
return {"image": patch_per_chunk * num_patches}
|
||||
|
||||
def get_image_size_with_most_features(self) -> ImageSize:
|
||||
vision_config = self.get_hf_config().vision_config
|
||||
image_size = vision_config.image_size
|
||||
@@ -516,14 +505,6 @@ class Mllama4ProcessingInfo(BaseProcessingInfo):
|
||||
return ImageSize(height=self.get_max_num_tiles() * image_size,
|
||||
width=image_size)
|
||||
|
||||
def get_max_image_tokens(self) -> int:
|
||||
target_width, target_height = self.get_image_size_with_most_features()
|
||||
|
||||
return self.get_num_image_tokens(
|
||||
image_width=target_width,
|
||||
image_height=target_height,
|
||||
)
|
||||
|
||||
|
||||
class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo]
|
||||
):
|
||||
|
||||
@@ -1164,13 +1164,6 @@ class MolmoProcessingInfo(BaseProcessingInfo):
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None}
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
return {"image": self.get_max_image_tokens()}
|
||||
|
||||
def get_num_image_tokens(
|
||||
self,
|
||||
*,
|
||||
@@ -1195,15 +1188,6 @@ class MolmoProcessingInfo(BaseProcessingInfo):
|
||||
|
||||
return extra + joint
|
||||
|
||||
def get_max_image_tokens(self) -> int:
|
||||
target_width, target_height = self.get_image_size_with_most_features()
|
||||
|
||||
return self.get_num_image_tokens(
|
||||
image_width=target_width,
|
||||
image_height=target_height,
|
||||
processor=None,
|
||||
)
|
||||
|
||||
def get_image_size_with_most_features(self) -> ImageSize:
|
||||
processor = self.get_hf_processor()
|
||||
|
||||
|
||||
@@ -13,7 +13,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||
MultiModalInputs, MultiModalKwargs)
|
||||
from vllm.multimodal.parse import MultiModalDataItems
|
||||
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
|
||||
MultiModalDataItems)
|
||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||
BaseProcessingInfo, PromptIndexTargets,
|
||||
PromptInsertion, PromptUpdate,
|
||||
@@ -72,16 +73,18 @@ class PaliGemmaProcessingInfo(BaseProcessingInfo):
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": 1}
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
def get_num_image_tokens(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
return {"image": self.get_num_image_tokens()}
|
||||
|
||||
def get_num_image_tokens(self) -> int:
|
||||
*,
|
||||
image_width: int,
|
||||
image_height: int,
|
||||
) -> int:
|
||||
vision_encoder_info = self.get_vision_encoder_info()
|
||||
return vision_encoder_info.get_max_image_tokens()
|
||||
|
||||
return vision_encoder_info.get_num_image_tokens(
|
||||
image_width=image_width,
|
||||
image_height=image_height,
|
||||
)
|
||||
|
||||
|
||||
class PaliGemmaDummyInputsBuilder(
|
||||
@@ -148,12 +151,30 @@ class PaliGemmaMultiModalProcessor(
|
||||
image_token_id = hf_config.image_token_index
|
||||
|
||||
tokenizer = self.info.get_tokenizer()
|
||||
num_image_tokens = self.info.get_num_image_tokens()
|
||||
image_tokens = [image_token_id] * num_image_tokens
|
||||
|
||||
bos_token_id = tokenizer.bos_token_id
|
||||
assert isinstance(bos_token_id, int)
|
||||
|
||||
def get_insertion(item_idx: int):
|
||||
images = mm_items.get_items(
|
||||
"image", (ImageEmbeddingItems, ImageProcessorItems))
|
||||
|
||||
if isinstance(images, ImageEmbeddingItems):
|
||||
num_image_tokens = images.get_feature_size(item_idx)
|
||||
else:
|
||||
image_size = images.get_image_size(item_idx)
|
||||
num_image_tokens = self.info.get_num_image_tokens(
|
||||
image_width=image_size.width,
|
||||
image_height=image_size.height,
|
||||
)
|
||||
|
||||
image_tokens = [image_token_id] * num_image_tokens
|
||||
|
||||
return PromptUpdateDetails.select_token_id(
|
||||
image_tokens + [bos_token_id],
|
||||
embed_token_id=image_token_id,
|
||||
)
|
||||
|
||||
# Paligemma 1 and 2 have different tokenizer.add_bos_token
|
||||
# Insert <image>*n + <bos> after <bos> for Paligemma 1
|
||||
# Insert <image>*n + <bos> for Paligemma 2
|
||||
@@ -162,10 +183,7 @@ class PaliGemmaMultiModalProcessor(
|
||||
modality="image",
|
||||
target=PromptIndexTargets.prefix(
|
||||
[bos_token_id] if tokenizer.add_bos_token else []),
|
||||
insertion=PromptUpdateDetails.select_token_id(
|
||||
image_tokens + [bos_token_id],
|
||||
embed_token_id=image_token_id,
|
||||
),
|
||||
insertion=get_insertion,
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
@@ -321,21 +321,6 @@ class Phi3VProcessingInfo(BaseProcessingInfo):
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None}
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
target_width, target_height = self.get_image_size_with_most_features()
|
||||
|
||||
max_image_tokens = self.get_num_image_tokens(
|
||||
image_width=target_width,
|
||||
image_height=target_height,
|
||||
processor=None,
|
||||
)
|
||||
|
||||
return {"image": max_image_tokens}
|
||||
|
||||
def get_num_image_tokens(
|
||||
self,
|
||||
*,
|
||||
|
||||
@@ -167,13 +167,6 @@ class PixtralProcessingInfo(BaseProcessingInfo):
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None}
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
return {"image": self.get_max_image_tokens()}
|
||||
|
||||
def get_vision_config(
|
||||
self,
|
||||
processor: Optional[PixtralProcessorAdapter] = None,
|
||||
@@ -207,14 +200,6 @@ class PixtralProcessingInfo(BaseProcessingInfo):
|
||||
|
||||
return ImageSize(width=max_image_size, height=max_image_size)
|
||||
|
||||
def get_max_image_tokens(self) -> int:
|
||||
target_width, target_height = self.get_image_size_with_most_features()
|
||||
|
||||
return self.get_num_image_tokens(
|
||||
image_width=target_width,
|
||||
image_height=target_height,
|
||||
)
|
||||
|
||||
|
||||
class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
|
||||
|
||||
@@ -938,14 +923,6 @@ class PixtralHFEncoderInfo(VisionEncoderInfo[PixtralVisionConfig]):
|
||||
)
|
||||
return ncols * nrows
|
||||
|
||||
def get_max_image_tokens(self) -> int:
|
||||
image_size = self.get_image_size()
|
||||
|
||||
return self.get_num_image_tokens(
|
||||
image_width=image_size,
|
||||
image_height=image_size,
|
||||
)
|
||||
|
||||
def get_image_size(self) -> int:
|
||||
return self.vision_config.image_size
|
||||
|
||||
|
||||
@@ -45,9 +45,6 @@ class PrithviGeoSpatialMAEProcessingInfo(BaseProcessingInfo):
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None}
|
||||
|
||||
def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
|
||||
return {"image": 0}
|
||||
|
||||
|
||||
class PrithviGeoSpatialMAEInputBuilder(
|
||||
BaseDummyInputsBuilder[PrithviGeoSpatialMAEProcessingInfo]):
|
||||
|
||||
@@ -109,17 +109,6 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo):
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"audio": None}
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
hf_config = self.get_hf_config()
|
||||
max_source_positions = hf_config.audio_config.max_source_positions
|
||||
max_output_lengths = (max_source_positions - 2) // 2 + 1
|
||||
|
||||
return {"audio": max_output_lengths}
|
||||
|
||||
|
||||
class Qwen2AudioDummyInputsBuilder(
|
||||
BaseDummyInputsBuilder[Qwen2AudioProcessingInfo]):
|
||||
|
||||
@@ -818,16 +818,6 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None, "video": None}
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
return {
|
||||
"image": self.get_max_image_tokens(),
|
||||
"video": self.get_max_video_tokens(seq_len, mm_counts),
|
||||
}
|
||||
|
||||
def _get_vision_info(
|
||||
self,
|
||||
*,
|
||||
|
||||
@@ -530,13 +530,6 @@ class QwenVLProcessingInfo(BaseProcessingInfo):
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None}
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
return {"image": self.get_num_image_tokens()}
|
||||
|
||||
def get_num_image_tokens(self) -> int:
|
||||
hf_config = self.get_hf_config()
|
||||
vision_config = hf_config.visual
|
||||
|
||||
@@ -33,9 +33,6 @@ class SiglipEncoderInfo(VisionEncoderInfo[SiglipVisionConfig]):
|
||||
) -> int:
|
||||
return self.get_patch_grid_length()**2
|
||||
|
||||
def get_max_image_tokens(self) -> int:
|
||||
return self.get_patch_grid_length()**2
|
||||
|
||||
def get_image_size(self) -> int:
|
||||
return self.vision_config.image_size
|
||||
|
||||
|
||||
@@ -459,13 +459,6 @@ class BaseSkyworkR1VProcessingInfo(BaseProcessingInfo):
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None}
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
return {"image": self.get_max_image_tokens()}
|
||||
|
||||
def get_num_image_tokens(
|
||||
self,
|
||||
*,
|
||||
@@ -481,15 +474,6 @@ class BaseSkyworkR1VProcessingInfo(BaseProcessingInfo):
|
||||
image_height=image_height,
|
||||
)
|
||||
|
||||
def get_max_image_tokens(self) -> int:
|
||||
target_width, target_height = self.get_image_size_with_most_features()
|
||||
|
||||
return self.get_num_image_tokens(
|
||||
image_width=target_width,
|
||||
image_height=target_height,
|
||||
processor=None,
|
||||
)
|
||||
|
||||
def get_image_size_with_most_features(self) -> ImageSize:
|
||||
processor = self.get_hf_processor()
|
||||
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
|
||||
# Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py
|
||||
"""PyTorch Ultravox model."""
|
||||
import math
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from functools import cached_property
|
||||
from typing import Any, Literal, Optional, Set, Tuple, TypedDict, Union
|
||||
@@ -107,17 +106,6 @@ class UltravoxProcessingInfo(BaseProcessingInfo):
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"audio": None}
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
feature_extractor = self.get_feature_extractor()
|
||||
max_audio_tokens = math.ceil(feature_extractor.chunk_length *
|
||||
_AUDIO_TOKENS_PER_SECOND)
|
||||
|
||||
return {"audio": max_audio_tokens * _MAX_ENCODER_BATCH_SIZE}
|
||||
|
||||
|
||||
class UltravoxDummyInputsBuilder(BaseDummyInputsBuilder[UltravoxProcessingInfo]
|
||||
):
|
||||
|
||||
@@ -33,10 +33,6 @@ class VisionEncoderInfo(ABC, Generic[_C]):
|
||||
) -> int:
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def get_max_image_tokens(self) -> int:
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def get_image_size(self) -> int:
|
||||
raise NotImplementedError
|
||||
|
||||
@@ -538,16 +538,9 @@ class WhisperProcessingInfo(BaseProcessingInfo):
|
||||
assert isinstance(feature_extractor, WhisperFeatureExtractor)
|
||||
return feature_extractor
|
||||
|
||||
def get_max_audio_tokens(self) -> int:
|
||||
def get_num_audio_tokens(self) -> int:
|
||||
return self.get_hf_config().max_source_positions
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
return {"audio": self.get_max_audio_tokens()}
|
||||
|
||||
|
||||
class WhisperDummyInputsBuilder(BaseDummyInputsBuilder[WhisperProcessingInfo]):
|
||||
|
||||
@@ -630,7 +623,7 @@ class WhisperMultiModalProcessor(
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
out_mm_kwargs: MultiModalKwargs,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
num_tokens = self.info.get_max_audio_tokens()
|
||||
num_tokens = self.info.get_num_audio_tokens()
|
||||
return [
|
||||
PromptReplacement(
|
||||
modality="audio",
|
||||
|
||||
Reference in New Issue
Block a user