[Refactor] Simplify dummy data generation (#35025)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -444,15 +444,14 @@ class AriaDummyInputsBuilder(BaseDummyInputsBuilder[AriaProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
vision_config = self.info.get_vision_config()
|
||||
|
||||
max_image_size = vision_config.image_size
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -252,16 +252,13 @@ class AudioFlamingo3DummyInputsBuilder(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
feature_extractor = self.info.get_feature_extractor(
|
||||
**(mm_processor_kwargs or {})
|
||||
)
|
||||
feature_extractor = self.info.get_feature_extractor()
|
||||
sampling_rate = feature_extractor.sampling_rate
|
||||
audio_len = MAX_AUDIO_LEN * sampling_rate
|
||||
num_audios = mm_counts.get("audio", 0)
|
||||
audio_overrides = mm_options.get("audio") if mm_options else None
|
||||
audio_overrides = mm_options.get("audio")
|
||||
|
||||
return {
|
||||
"audio": self._get_dummy_audios(
|
||||
|
||||
@@ -191,13 +191,12 @@ class AyaVisionDummyInputsBuilder(BaseDummyInputsBuilder[AyaVisionProcessingInfo
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
image_size = self.info.get_image_size_with_most_features()
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -249,8 +249,7 @@ class BagelDummyInputsBuilder(BaseDummyInputsBuilder[BagelProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
hf_config = self.info.get_hf_config()
|
||||
@@ -258,7 +257,7 @@ class BagelDummyInputsBuilder(BaseDummyInputsBuilder[BagelProcessingInfo]):
|
||||
|
||||
# Use the configured image size
|
||||
image_size = vit_config.image_size
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -90,14 +90,13 @@ class BeeDummyInputsBuilder(LlavaDummyInputsBuilder[BeeProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -445,8 +445,7 @@ class Blip2DummyInputsBuilder(BaseDummyInputsBuilder[Blip2ProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
hf_config = self.info.get_hf_config()
|
||||
vision_config = hf_config.vision_config
|
||||
@@ -454,7 +453,7 @@ class Blip2DummyInputsBuilder(BaseDummyInputsBuilder[Blip2ProcessingInfo]):
|
||||
max_image_size = vision_config.image_size
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -116,15 +116,14 @@ class ChameleonDummyInputsBuilder(BaseDummyInputsBuilder[ChameleonProcessingInfo
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
config = self.info.get_hf_config()
|
||||
|
||||
width = height = config.vq_config.resolution
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -174,14 +174,13 @@ class CLIPDummyInputsBuilder(BaseDummyInputsBuilder[CLIPProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -197,13 +197,12 @@ class Cohere2VisionDummyInputsBuilder(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
image_size = self.info.get_image_size_with_most_features()
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -132,12 +132,12 @@ class ColModernVBertDummyInputsBuilder(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
width=target_width,
|
||||
|
||||
@@ -255,8 +255,7 @@ class DeepseekOCRDummyInputsBuilder(BaseDummyInputsBuilder[DeepseekOCRProcessing
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
|
||||
@@ -137,8 +137,7 @@ class DeepseekOCR2DummyInputsBuilder(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
|
||||
@@ -214,14 +214,13 @@ class DeepseekVL2DummyInputsBuilder(BaseDummyInputsBuilder[DeepseekVL2Processing
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
max_image_size = self.info.get_image_size_with_most_features()
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -106,17 +106,13 @@ class DotsOCRDummyInputsBuilder(Qwen2VLDummyInputsBuilder):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
mm_processor_kwargs = mm_processor_kwargs or {}
|
||||
target_width, target_height = self.info.get_image_size_with_most_features( # noqa: E501
|
||||
mm_processor_kwargs.get("max_pixels", None)
|
||||
)
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -1168,8 +1168,7 @@ class Ernie4_5_VLDummyInputsBuilder(BaseDummyInputsBuilder[Ernie4_5_VLProcessing
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
num_videos = mm_counts.get("video", 0)
|
||||
@@ -1179,8 +1178,8 @@ class Ernie4_5_VLDummyInputsBuilder(BaseDummyInputsBuilder[Ernie4_5_VLProcessing
|
||||
seq_len, mm_counts
|
||||
)
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
video_overrides = mm_options.get("video") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
video_overrides = mm_options.get("video")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -746,23 +746,22 @@ class FunASRDummyInputsBuilder(BaseDummyInputsBuilder[FunASRProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
feature_extractor = self.info.get_feature_extractor(
|
||||
**(mm_processor_kwargs or {})
|
||||
)
|
||||
feature_extractor = self.info.get_feature_extractor()
|
||||
|
||||
sampling_rate = feature_extractor.sampling_rate
|
||||
audio_len = feature_extractor.chunk_length * sampling_rate
|
||||
num_audios = mm_counts.get("audio", 0)
|
||||
|
||||
audio_overrides = mm_options.get("audio") if mm_options else None
|
||||
audio_overrides = mm_options.get("audio")
|
||||
|
||||
return {
|
||||
"audio": self._get_dummy_audios(
|
||||
length=audio_len, num_audios=num_audios, overrides=audio_overrides
|
||||
)
|
||||
length=audio_len,
|
||||
num_audios=num_audios,
|
||||
overrides=audio_overrides,
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -610,12 +610,9 @@ class FunAudioChatDummyInputsBuilder(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
feature_extractor = self.info.get_feature_extractor(
|
||||
**(mm_processor_kwargs or {})
|
||||
)
|
||||
feature_extractor = self.info.get_feature_extractor()
|
||||
sampling_rate = int(feature_extractor.sampling_rate)
|
||||
|
||||
# Dummy inputs are used for profiling; construct the worst-case audio
|
||||
@@ -632,7 +629,7 @@ class FunAudioChatDummyInputsBuilder(
|
||||
)
|
||||
num_audios = int(mm_counts.get("audio", 0))
|
||||
|
||||
audio_overrides = mm_options.get("audio") if mm_options else None
|
||||
audio_overrides = mm_options.get("audio")
|
||||
return {
|
||||
"audio": self._get_dummy_audios(
|
||||
length=audio_len,
|
||||
|
||||
@@ -142,13 +142,12 @@ class FuyuDummyInputsBuilder(BaseDummyInputsBuilder[FuyuProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -241,14 +241,13 @@ class Gemma3DummyInputsBuilder(BaseDummyInputsBuilder[Gemma3ProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -175,8 +175,7 @@ class Gemma3nDummyInputsBuilder(BaseDummyInputsBuilder[Gemma3nProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
num_audios = mm_counts.get("audio", 0)
|
||||
@@ -189,8 +188,8 @@ class Gemma3nDummyInputsBuilder(BaseDummyInputsBuilder[Gemma3nProcessingInfo]):
|
||||
img_width = image_processor.size.get("width", 224)
|
||||
img_height = image_processor.size.get("height", 224)
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
audio_overrides = mm_options.get("audio") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
audio_overrides = mm_options.get("audio")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
@@ -200,7 +199,9 @@ class Gemma3nDummyInputsBuilder(BaseDummyInputsBuilder[Gemma3nProcessingInfo]):
|
||||
overrides=image_overrides,
|
||||
),
|
||||
"audio": self._get_dummy_audios(
|
||||
length=audio_len, num_audios=num_audios, overrides=audio_overrides
|
||||
length=audio_len,
|
||||
num_audios=num_audios,
|
||||
overrides=audio_overrides,
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
@@ -1163,8 +1163,7 @@ class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
num_videos = mm_counts.get("video", 0)
|
||||
@@ -1174,8 +1173,8 @@ class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]):
|
||||
seq_len, mm_counts
|
||||
)
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
video_overrides = mm_options.get("video") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
video_overrides = mm_options.get("video")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -492,8 +492,7 @@ class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
hf_config = self.info.get_hf_config()
|
||||
vision_config = hf_config.vision_config
|
||||
@@ -501,7 +500,7 @@ class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]):
|
||||
target_width = target_height = vision_config["image_size"]
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -726,15 +726,12 @@ class GlmAsrDummyInputsBuilder(BaseDummyInputsBuilder[GlmAsrProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
feature_extractor = self.info.get_feature_extractor(
|
||||
**(mm_processor_kwargs or {})
|
||||
)
|
||||
feature_extractor = self.info.get_feature_extractor()
|
||||
sampling_rate = feature_extractor.sampling_rate
|
||||
num_audios = mm_counts.get("audio", 0)
|
||||
audio_overrides = mm_options.get("audio") if mm_options else None
|
||||
audio_overrides = mm_options.get("audio")
|
||||
|
||||
max_audio_len = getattr(
|
||||
self.info.get_hf_processor(), "max_audio_len", DEFAULT_MAX_AUDIO_LEN_S
|
||||
@@ -743,7 +740,9 @@ class GlmAsrDummyInputsBuilder(BaseDummyInputsBuilder[GlmAsrProcessingInfo]):
|
||||
|
||||
return {
|
||||
"audio": self._get_dummy_audios(
|
||||
length=audio_len, num_audios=num_audios, overrides=audio_overrides
|
||||
length=audio_len,
|
||||
num_audios=num_audios,
|
||||
overrides=audio_overrides,
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -216,11 +216,10 @@ class GraniteSpeechDummyInputsBuilder(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_audios = mm_counts.get("audio", 0)
|
||||
audio_overrides = mm_options.get("audio") if mm_options else None
|
||||
audio_overrides = mm_options.get("audio")
|
||||
|
||||
return {
|
||||
"audio": self._get_dummy_audios(
|
||||
|
||||
@@ -713,8 +713,7 @@ class HunYuanVLDummyInputsBuilder(BaseDummyInputsBuilder[HunYuanVLProcessingInfo
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 1)
|
||||
|
||||
|
||||
@@ -165,8 +165,7 @@ class HCXVisionDummyInputsBuilder(BaseDummyInputsBuilder[HCXVisionProcessingInfo
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
num_videos = mm_counts.get("video", 0)
|
||||
@@ -174,8 +173,8 @@ class HCXVisionDummyInputsBuilder(BaseDummyInputsBuilder[HCXVisionProcessingInfo
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
target_num_frames = 32
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
video_overrides = mm_options.get("video") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
video_overrides = mm_options.get("video")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -277,15 +277,14 @@ class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo])
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
hf_processor = self.info.get_hf_processor(**(mm_processor_kwargs or {}))
|
||||
hf_processor = self.info.get_hf_processor()
|
||||
image_processor: Idefics3ImageProcessor = hf_processor.image_processor
|
||||
longest_edge = image_processor.max_image_size["longest_edge"]
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -297,8 +297,7 @@ class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo])
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
target_num_frames = self.info.get_num_frames_with_most_features(
|
||||
@@ -310,8 +309,8 @@ class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo])
|
||||
config = self.info.get_hf_config()
|
||||
image_size_h, image_size_w = config.vision_config.image_size
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
video_overrides = mm_options.get("video") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
video_overrides = mm_options.get("video")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -762,13 +762,12 @@ class BaseInternVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
@@ -935,12 +934,9 @@ class InternVLDummyInputsBuilder(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
dummy_image = super().get_dummy_mm_data(
|
||||
seq_len=seq_len, mm_counts=mm_counts, mm_options=mm_options
|
||||
)
|
||||
dummy_image = super().get_dummy_mm_data(seq_len, mm_counts, mm_options)
|
||||
if self.info.supports_video:
|
||||
config = self.info.get_hf_config()
|
||||
image_size: int = config.vision_config.image_size
|
||||
@@ -948,7 +944,7 @@ class InternVLDummyInputsBuilder(
|
||||
seq_len, mm_counts
|
||||
)
|
||||
num_videos = mm_counts.get("video", 0)
|
||||
video_overrides = mm_options.get("video") if mm_options else None
|
||||
video_overrides = mm_options.get("video")
|
||||
dummy_video = {
|
||||
"video": self._get_dummy_videos(
|
||||
width=image_size,
|
||||
|
||||
@@ -18,6 +18,7 @@ from typing_extensions import TypedDict, Unpack
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.model import ModelConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions
|
||||
from vllm.distributed import parallel_state
|
||||
from vllm.distributed import utils as dist_utils
|
||||
from vllm.model_executor.layers.attention import MMEncoderAttention
|
||||
@@ -849,13 +850,12 @@ class IsaacDummyInputsBuilder(BaseDummyInputsBuilder[IsaacProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -444,8 +444,7 @@ class KananaVDummyInputsBuilder(BaseDummyInputsBuilder[KananaVProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
return {
|
||||
|
||||
@@ -1170,8 +1170,7 @@ class KeyeBaseDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
num_videos = mm_counts.get("video", 0)
|
||||
@@ -1179,8 +1178,8 @@ class KeyeBaseDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
target_num_frames = self.info.get_num_frames_with_most_features(seq_len)
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
video_overrides = mm_options.get("video") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
video_overrides = mm_options.get("video")
|
||||
|
||||
mm_data = {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -240,8 +240,7 @@ class KimiK25DummyInputsBuilder(BaseDummyInputsBuilder[KimiK25ProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
# TODO: Support mm_options for vision_chunk to allow user configuration
|
||||
dummy_items = self.get_dummy_mm_items()
|
||||
|
||||
@@ -215,12 +215,11 @@ class KimiVLDummyInputsBuilder(BaseDummyInputsBuilder[KimiVLProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -343,14 +343,13 @@ class Lfm2VLDummyInputsBuilder(BaseDummyInputsBuilder[Lfm2VLProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -232,14 +232,13 @@ class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -165,8 +165,7 @@ class LlavaNextVideoDummyInputsBuilder(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_videos = mm_counts.get("video", 0)
|
||||
|
||||
@@ -175,7 +174,7 @@ class LlavaNextVideoDummyInputsBuilder(
|
||||
seq_len, mm_counts
|
||||
)
|
||||
|
||||
video_overrides = mm_options.get("video") if mm_options else None
|
||||
video_overrides = mm_options.get("video")
|
||||
|
||||
return {
|
||||
"video": self._get_dummy_videos(
|
||||
|
||||
@@ -276,8 +276,7 @@ class LlavaOnevisionDummyInputsBuilder(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
num_videos = mm_counts.get("video", 0)
|
||||
@@ -287,8 +286,8 @@ class LlavaOnevisionDummyInputsBuilder(
|
||||
seq_len, mm_counts
|
||||
)
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
video_overrides = mm_options.get("video") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
video_overrides = mm_options.get("video")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -565,12 +565,11 @@ class MiDashengLMDummyInputsBuilder(BaseDummyInputsBuilder[MiDashengLMProcessing
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_audios = mm_counts.get("audio", 0)
|
||||
|
||||
audio_overrides = mm_options.get("audio") if mm_options else None
|
||||
audio_overrides = mm_options.get("audio")
|
||||
|
||||
return {
|
||||
"audio": self._get_dummy_audios(
|
||||
|
||||
@@ -301,8 +301,7 @@ class MiniCPMODummyInputsBuilder(MiniCPMVDummyInputsBuilder[MiniCPMOProcessingIn
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_audios = mm_counts.get("audio", 0)
|
||||
audio_len = (
|
||||
@@ -310,11 +309,13 @@ class MiniCPMODummyInputsBuilder(MiniCPMVDummyInputsBuilder[MiniCPMOProcessingIn
|
||||
* self.info.get_default_audio_sampling_rate()
|
||||
)
|
||||
|
||||
audio_overrides = mm_options.get("audio") if mm_options else None
|
||||
audio_overrides = mm_options.get("audio")
|
||||
|
||||
audio_mm_data = {
|
||||
"audio": self._get_dummy_audios(
|
||||
length=audio_len, num_audios=num_audios, overrides=audio_overrides
|
||||
length=audio_len,
|
||||
num_audios=num_audios,
|
||||
overrides=audio_overrides,
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -707,8 +707,7 @@ class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
num_videos = mm_counts.get("video", 0)
|
||||
@@ -719,8 +718,8 @@ class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
|
||||
seq_len, mm_counts
|
||||
)
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
video_overrides = mm_options.get("video") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
video_overrides = mm_options.get("video")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -236,14 +236,13 @@ class Mistral3DummyInputsBuilder(BaseDummyInputsBuilder[_I]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -707,14 +707,13 @@ class Mllama4DummyInputsBuilder(BaseDummyInputsBuilder[Mllama4ProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
(target_width, target_height) = self.info.get_image_size_with_most_features()
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -1274,13 +1274,12 @@ class MolmoDummyInputsBuilder(BaseDummyInputsBuilder[MolmoProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -2082,8 +2082,7 @@ class Molmo2DummyInputsBuilder(BaseDummyInputsBuilder[Molmo2ProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
num_videos = mm_counts.get("video", 0)
|
||||
@@ -2094,7 +2093,7 @@ class Molmo2DummyInputsBuilder(BaseDummyInputsBuilder[Molmo2ProcessingInfo]):
|
||||
if num_images > 0:
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
dummy_images = self._get_dummy_images(
|
||||
width=target_width,
|
||||
@@ -2110,7 +2109,7 @@ class Molmo2DummyInputsBuilder(BaseDummyInputsBuilder[Molmo2ProcessingInfo]):
|
||||
seq_len, mm_counts
|
||||
)
|
||||
|
||||
video_overrides = mm_options.get("video") if mm_options else None
|
||||
video_overrides = mm_options.get("video")
|
||||
|
||||
if video_overrides:
|
||||
assert isinstance(video_overrides, VideoDummyOptions)
|
||||
|
||||
@@ -1388,8 +1388,7 @@ class NanoNemotronVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
processor = self.info.get_hf_processor()
|
||||
@@ -1404,7 +1403,7 @@ class NanoNemotronVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
|
||||
max_num_tiles
|
||||
)
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
@@ -1461,12 +1460,9 @@ class NanoNemotronVLDummyInputsBuilder(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
dummy_image = super().get_dummy_mm_data(
|
||||
seq_len=seq_len, mm_counts=mm_counts, mm_options=mm_options
|
||||
)
|
||||
dummy_image = super().get_dummy_mm_data(seq_len, mm_counts, mm_options)
|
||||
if self.info.supports_video:
|
||||
config = self.info.get_hf_config()
|
||||
image_size: int = config.force_image_size
|
||||
@@ -1474,7 +1470,7 @@ class NanoNemotronVLDummyInputsBuilder(
|
||||
seq_len, mm_counts
|
||||
)
|
||||
num_videos = mm_counts.get("video", 0)
|
||||
video_overrides = mm_options.get("video") if mm_options else None
|
||||
video_overrides = mm_options.get("video")
|
||||
dummy_video = {
|
||||
"video": self._get_dummy_videos(
|
||||
width=image_size,
|
||||
|
||||
@@ -645,8 +645,7 @@ class NemotronParseDummyInputsBuilder(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
|
||||
@@ -92,13 +92,12 @@ class NVLMDummyInputsBuilder(BaseInternVLDummyInputsBuilder[NVLMProcessingInfo])
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -306,14 +306,13 @@ class OvisDummyInputsBuilder(BaseDummyInputsBuilder[OvisProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
mm_data = {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -287,8 +287,7 @@ class Ovis2_5DummyInputsBuilder(BaseDummyInputsBuilder[Ovis2_5ProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
num_videos = mm_counts.get("video", 0)
|
||||
@@ -298,8 +297,8 @@ class Ovis2_5DummyInputsBuilder(BaseDummyInputsBuilder[Ovis2_5ProcessingInfo]):
|
||||
seq_len, mm_counts
|
||||
)
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
video_overrides = mm_options.get("video") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
video_overrides = mm_options.get("video")
|
||||
|
||||
mm_data = {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -206,13 +206,12 @@ class PaddleOCRVLDummyInputsBuilder(BaseDummyInputsBuilder[PaddleOCRVLProcessing
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
max_image_size = self.info.get_image_size_with_most_features()
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -131,8 +131,7 @@ class PaliGemmaDummyInputsBuilder(BaseDummyInputsBuilder[PaliGemmaProcessingInfo
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
hf_config = self.info.get_hf_config()
|
||||
vision_config = hf_config.vision_config
|
||||
@@ -140,7 +139,7 @@ class PaliGemmaDummyInputsBuilder(BaseDummyInputsBuilder[PaliGemmaProcessingInfo
|
||||
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -376,14 +376,13 @@ class Phi3VDummyInputsBuilder(BaseDummyInputsBuilder[Phi3VProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -822,16 +822,15 @@ class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_audios = mm_counts.get("audio", 0)
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
audio_overrides = mm_options.get("audio") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
audio_overrides = mm_options.get("audio")
|
||||
|
||||
mm_data = {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -249,14 +249,13 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
@@ -271,8 +270,7 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> ProcessorInputs:
|
||||
tokenizer = self.info.get_tokenizer()
|
||||
|
||||
|
||||
@@ -357,15 +357,13 @@ class Qwen2_5OmniThinkerDummyInputsBuilder(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_audios = mm_counts.get("audio", 0)
|
||||
num_images = mm_counts.get("image", 0)
|
||||
num_videos = mm_counts.get("video", 0)
|
||||
|
||||
mm_processor_kwargs = mm_processor_kwargs or {}
|
||||
feature_extractor = self.info.get_feature_extractor(**mm_processor_kwargs)
|
||||
feature_extractor = self.info.get_feature_extractor()
|
||||
|
||||
target_audio_length = (
|
||||
min(
|
||||
@@ -375,16 +373,14 @@ class Qwen2_5OmniThinkerDummyInputsBuilder(
|
||||
* feature_extractor.sampling_rate
|
||||
)
|
||||
|
||||
target_width, target_height = self.info.get_image_size_with_most_features(
|
||||
max_pixels=mm_processor_kwargs.get("max_pixels", None),
|
||||
)
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
target_num_frames = self.info.get_num_frames_with_most_features(
|
||||
seq_len, mm_counts
|
||||
)
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
video_overrides = mm_options.get("video") if mm_options else None
|
||||
audio_overrides = mm_options.get("audio") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
video_overrides = mm_options.get("video")
|
||||
audio_overrides = mm_options.get("audio")
|
||||
|
||||
mm_data = {
|
||||
"audio": self._get_dummy_audios(
|
||||
|
||||
@@ -195,22 +195,21 @@ class Qwen2AudioDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2AudioProcessingIn
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
feature_extractor = self.info.get_feature_extractor(
|
||||
**(mm_processor_kwargs or {})
|
||||
)
|
||||
feature_extractor = self.info.get_feature_extractor()
|
||||
|
||||
sampling_rate = feature_extractor.sampling_rate
|
||||
audio_len = feature_extractor.chunk_length * sampling_rate
|
||||
num_audios = mm_counts.get("audio", 0)
|
||||
|
||||
audio_overrides = mm_options.get("audio") if mm_options else None
|
||||
audio_overrides = mm_options.get("audio")
|
||||
|
||||
return {
|
||||
"audio": self._get_dummy_audios(
|
||||
length=audio_len, num_audios=num_audios, overrides=audio_overrides
|
||||
length=audio_len,
|
||||
num_audios=num_audios,
|
||||
overrides=audio_overrides,
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -925,9 +925,14 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
|
||||
vision_config = hf_config.vision_config
|
||||
patch_size = vision_config.patch_size
|
||||
merge_size = vision_config.spatial_merge_size
|
||||
|
||||
if max_pixels is None:
|
||||
image_processor = self.get_image_processor()
|
||||
max_pixels = image_processor.size["longest_edge"]
|
||||
|
||||
mm_kwargs = self.ctx.get_merged_mm_kwargs({})
|
||||
size = mm_kwargs.get("size", image_processor.size)
|
||||
max_pixels = size["longest_edge"]
|
||||
|
||||
unit = patch_size * merge_size
|
||||
max_seq_len = max_pixels // (unit * unit)
|
||||
|
||||
@@ -1027,22 +1032,18 @@ class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
num_videos = mm_counts.get("video", 0)
|
||||
|
||||
mm_processor_kwargs = mm_processor_kwargs or {}
|
||||
target_width, target_height = self.info.get_image_size_with_most_features(
|
||||
max_pixels=mm_processor_kwargs.get("max_pixels", None)
|
||||
)
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
target_num_frames = self.info.get_num_frames_with_most_features(
|
||||
seq_len, mm_counts
|
||||
)
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
video_overrides = mm_options.get("video") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
video_overrides = mm_options.get("video")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -146,14 +146,11 @@ class Qwen3ASRDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3ASRProcessingInfo])
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_audios = mm_counts.get("audio", 0)
|
||||
|
||||
feature_extractor = self.info.get_feature_extractor(
|
||||
**(mm_processor_kwargs or {})
|
||||
)
|
||||
feature_extractor = self.info.get_feature_extractor()
|
||||
|
||||
target_audio_length = (
|
||||
min(
|
||||
@@ -163,7 +160,7 @@ class Qwen3ASRDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3ASRProcessingInfo])
|
||||
* feature_extractor.sampling_rate
|
||||
)
|
||||
|
||||
audio_overrides = mm_options.get("audio") if mm_options else None
|
||||
audio_overrides = mm_options.get("audio")
|
||||
|
||||
return {
|
||||
"audio": self._get_dummy_audios(
|
||||
|
||||
@@ -703,11 +703,18 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> int:
|
||||
video_processor = self.get_video_processor()
|
||||
video_max_pixels = video_processor.size["longest_edge"]
|
||||
|
||||
mm_kwargs = self.ctx.get_merged_mm_kwargs({})
|
||||
video_size = mm_kwargs.get("size", video_processor.size)
|
||||
temporal_patch_size = mm_kwargs.get(
|
||||
"temporal_patch_size", video_processor.temporal_patch_size
|
||||
)
|
||||
|
||||
# video_max_pixels contains the temporal compression factor,
|
||||
# so we divide by 2 to get the maximum number of image pixels.
|
||||
video_max_pixels = video_size["longest_edge"]
|
||||
target_width, target_height = self.get_image_size_with_most_features(
|
||||
max_pixels=video_max_pixels // video_processor.temporal_patch_size
|
||||
max_pixels=video_max_pixels // temporal_patch_size
|
||||
)
|
||||
num_video_soft_tokens = self.get_num_video_tokens(
|
||||
image_width=target_width,
|
||||
@@ -789,19 +796,15 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
num_videos = mm_counts.get("video", 0)
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
video_overrides = mm_options.get("video") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
video_overrides = mm_options.get("video")
|
||||
|
||||
mm_processor_kwargs = mm_processor_kwargs or {}
|
||||
target_image_width, target_image_height = (
|
||||
self.info.get_image_size_with_most_features(
|
||||
max_pixels=mm_processor_kwargs.get("max_pixels", None),
|
||||
)
|
||||
self.info.get_image_size_with_most_features()
|
||||
)
|
||||
|
||||
# treat videos as special images
|
||||
@@ -826,13 +829,20 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]):
|
||||
target_num_frames = min(target_num_frames, num_frames_override)
|
||||
target_num_frames = max(target_num_frames, 2)
|
||||
|
||||
video_processor = self.info.get_video_processor(**(mm_processor_kwargs or {}))
|
||||
video_max_pixels = video_processor.size["longest_edge"]
|
||||
video_processor = self.info.get_video_processor()
|
||||
|
||||
mm_kwargs = self.info.ctx.get_merged_mm_kwargs({})
|
||||
video_size = mm_kwargs.get("size", video_processor.size)
|
||||
temporal_patch_size = mm_kwargs.get(
|
||||
"temporal_patch_size", video_processor.temporal_patch_size
|
||||
)
|
||||
|
||||
# video_max_pixels contains the temporal compression factor,
|
||||
# so we divide by 2 to get the maximum number of image pixels.
|
||||
video_max_pixels = video_size["longest_edge"]
|
||||
target_video_width, target_video_height = (
|
||||
self.info.get_image_size_with_most_features(
|
||||
max_pixels=video_max_pixels // video_processor.temporal_patch_size
|
||||
max_pixels=video_max_pixels // temporal_patch_size
|
||||
)
|
||||
)
|
||||
target_video_size, _ = self.info._get_vision_info(
|
||||
|
||||
@@ -617,8 +617,7 @@ class QwenVLDummyInputsBuilder(BaseDummyInputsBuilder[QwenVLProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
hf_config = self.info.get_hf_config()
|
||||
vision_config = hf_config.visual
|
||||
@@ -626,7 +625,7 @@ class QwenVLDummyInputsBuilder(BaseDummyInputsBuilder[QwenVLProcessingInfo]):
|
||||
target_width = target_height = vision_config["image_size"]
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -40,14 +40,13 @@ class RVLDummyInputsBuilder(LlavaDummyInputsBuilder[RVLProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -158,14 +158,13 @@ class SiglipDummyInputsBuilder(BaseDummyInputsBuilder[SiglipProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -529,13 +529,12 @@ class SkyworkR1VDummyInputsBuilder(BaseDummyInputsBuilder[SkyworkR1VProcessingIn
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -564,13 +564,12 @@ class Step3VLDummyInputsBuilder(BaseDummyInputsBuilder[Step3VLProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -154,8 +154,7 @@ class TerratorchInputBuilder(BaseDummyInputsBuilder[TerratorchProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
# Dummy data is generated based on the 'input' section
|
||||
# defined in the HF configuration file
|
||||
|
||||
@@ -101,14 +101,13 @@ class MultiModalDummyInputsBuilder(BaseDummyInputsBuilder[MultiModalProcessingIn
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, "BaseDummyOptions"] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, "BaseDummyOptions"],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
target_width, target_height = self.info.get_max_image_size()
|
||||
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
image_overrides = mm_options.get("image")
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
|
||||
@@ -164,12 +164,9 @@ class UltravoxDummyInputsBuilder(BaseDummyInputsBuilder[UltravoxProcessingInfo])
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
feature_extractor = self.info.get_feature_extractor(
|
||||
**(mm_processor_kwargs or {})
|
||||
)
|
||||
feature_extractor = self.info.get_feature_extractor()
|
||||
|
||||
sampling_rate = feature_extractor.sampling_rate
|
||||
audio_len = (
|
||||
@@ -177,11 +174,13 @@ class UltravoxDummyInputsBuilder(BaseDummyInputsBuilder[UltravoxProcessingInfo])
|
||||
)
|
||||
num_audios = mm_counts.get("audio", 0)
|
||||
|
||||
audio_overrides = mm_options.get("audio") if mm_options else None
|
||||
audio_overrides = mm_options.get("audio")
|
||||
|
||||
return {
|
||||
"audio": self._get_dummy_audios(
|
||||
length=audio_len, num_audios=num_audios, overrides=audio_overrides
|
||||
length=audio_len,
|
||||
num_audios=num_audios,
|
||||
overrides=audio_overrides,
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -218,18 +218,19 @@ class VoxtralDummyInputsBuilder(BaseDummyInputsBuilder[VoxtralProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_audios = mm_counts.get("audio", 0)
|
||||
|
||||
target_length = self.info.get_max_audio_array_len()
|
||||
|
||||
audio_overrides = mm_options.get("audio") if mm_options else None
|
||||
audio_overrides = mm_options.get("audio")
|
||||
|
||||
return {
|
||||
"audio": self._get_dummy_audios(
|
||||
length=target_length, num_audios=num_audios, overrides=audio_overrides
|
||||
length=target_length,
|
||||
num_audios=num_audios,
|
||||
overrides=audio_overrides,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -237,8 +238,7 @@ class VoxtralDummyInputsBuilder(BaseDummyInputsBuilder[VoxtralProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> ProcessorInputs:
|
||||
tokenizer = self.info.get_tokenizer()
|
||||
|
||||
|
||||
@@ -695,22 +695,21 @@ class WhisperDummyInputsBuilder(BaseDummyInputsBuilder[WhisperProcessingInfo]):
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_processor_kwargs: Mapping[str, object] | None = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
feature_extractor = self.info.get_feature_extractor(
|
||||
**(mm_processor_kwargs or {})
|
||||
)
|
||||
feature_extractor = self.info.get_feature_extractor()
|
||||
|
||||
sampling_rate = feature_extractor.sampling_rate
|
||||
audio_len = feature_extractor.chunk_length * sampling_rate
|
||||
num_audios = mm_counts.get("audio", 0)
|
||||
|
||||
audio_overrides = mm_options.get("audio") if mm_options else None
|
||||
audio_overrides = mm_options.get("audio")
|
||||
|
||||
return {
|
||||
"audio": self._get_dummy_audios(
|
||||
length=audio_len, num_audios=num_audios, overrides=audio_overrides
|
||||
length=audio_len,
|
||||
num_audios=num_audios,
|
||||
overrides=audio_overrides,
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user