[Bugfix] fix encoder cache hang in Qwen3VL (#32684)
Signed-off-by: JJJYmmm <92386084+JJJYmmm@users.noreply.github.com> Signed-off-by: Roger Wang <hey@rogerw.io> Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Co-authored-by: Roger Wang <hey@rogerw.io> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
@@ -48,18 +48,6 @@ VideoInput: TypeAlias = (
|
||||
AudioInput = list[tuple[np.ndarray, int]]
|
||||
|
||||
|
||||
MM_OPTIONS_OVERRIDES = {
|
||||
# Qwen3-VL's default profiling video size (64x64) can cause trouble
|
||||
# after resizing, so we override it here for testing.
|
||||
"qwen3_vl": dict(
|
||||
video=VideoDummyOptions(num_frames=128, width=256, height=256),
|
||||
),
|
||||
"qwen3_vl_moe": dict(
|
||||
video=VideoDummyOptions(num_frames=128, width=256, height=256),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
def _resize_data(
|
||||
_data: Image.Image | np.ndarray, size_factor: float
|
||||
) -> Image.Image | np.ndarray:
|
||||
@@ -73,12 +61,12 @@ def _resize_data(
|
||||
elif is_list_of(_data, Image.Image):
|
||||
W, H = next(iter(_data)).width, next(iter(_data)).height
|
||||
T = len(_data)
|
||||
T, W, H = map(lambda x: max(int(x * size_factor), 1), (T, W, H))
|
||||
T, W, H = map(lambda x: max(int(x * size_factor), 2), (T, W, H))
|
||||
return [d.resize((W, H)) for d in _data[:T]]
|
||||
# Video input with numpy arrays
|
||||
elif isinstance(_data, np.ndarray) and _data.ndim >= 4:
|
||||
T, H, W, C = _data.shape[-4:]
|
||||
T, H, W = map(lambda x: max(int(x * size_factor), 1), (T, H, W))
|
||||
T, H, W = map(lambda x: max(int(x * size_factor), 2), (T, H, W))
|
||||
return _data[..., :T, :H, :W, :C]
|
||||
# Audio input
|
||||
elif isinstance(_data, np.ndarray) and _data.ndim == 1:
|
||||
@@ -103,8 +91,6 @@ def create_batched_mm_kwargs(
|
||||
processor: BaseMultiModalProcessor,
|
||||
size_factors: tuple[float, ...] = (1.0, 0.5, 0.25),
|
||||
) -> Iterable[tuple[str, int, BatchedTensorInputs]]:
|
||||
model_type = model_config.hf_config.model_type
|
||||
|
||||
processing_info = processor.info
|
||||
dummy_inputs = processor.dummy_inputs
|
||||
supported_mm_limits = processing_info.get_supported_mm_limits()
|
||||
@@ -115,7 +101,6 @@ def create_batched_mm_kwargs(
|
||||
processor_inputs = dummy_inputs.get_dummy_processor_inputs(
|
||||
seq_len=model_config.max_model_len,
|
||||
mm_counts=mm_counts,
|
||||
mm_options=MM_OPTIONS_OVERRIDES.get(model_type),
|
||||
)
|
||||
mm_data = processor_inputs.mm_data
|
||||
resized_mm_data = {
|
||||
|
||||
@@ -892,7 +892,9 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
|
||||
)
|
||||
return num_video_tokens
|
||||
|
||||
def get_image_size_with_most_features(self) -> ImageSize:
|
||||
def get_image_size_with_most_features(
|
||||
self, max_pixels: int | None = None
|
||||
) -> ImageSize:
|
||||
# NOTE: Simply processing a huge size with _get_vision_info might not give a
|
||||
# size that maximizes the number of features, i.e., the number of (merged)
|
||||
# patches. This is because the number of patches limits the allowed aspect
|
||||
@@ -910,8 +912,11 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
|
||||
vision_config = hf_config.vision_config
|
||||
patch_size = vision_config.patch_size
|
||||
merge_size = vision_config.spatial_merge_size
|
||||
if max_pixels is None:
|
||||
image_processor = self.get_image_processor()
|
||||
max_pixels = image_processor.max_pixels or image_processor.size["longest_edge"]
|
||||
max_pixels = (
|
||||
image_processor.max_pixels or image_processor.size["longest_edge"]
|
||||
)
|
||||
unit = patch_size * merge_size
|
||||
max_seq_len = max_pixels // (unit * unit)
|
||||
|
||||
|
||||
@@ -91,6 +91,7 @@ from vllm.multimodal.processing import (
|
||||
)
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.utils.collection_utils import is_list_of
|
||||
from vllm.utils.math_utils import round_up
|
||||
from vllm.v1.attention.backends.registry import AttentionBackendEnum
|
||||
|
||||
from .interfaces import (
|
||||
@@ -129,8 +130,9 @@ from .vision import (
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
# Official recommended max frames is 2048
|
||||
_MAX_FRAMES_PER_VIDEO = 2048
|
||||
# We use 2048 dummy video frames that would generate vision embeddings
|
||||
# of the maximum size.
|
||||
DUMMY_VIDEO_NUM_FRAMES = 2048
|
||||
|
||||
|
||||
class Qwen3_VisionPatchEmbed(nn.Module):
|
||||
@@ -662,7 +664,7 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
|
||||
else:
|
||||
preprocessed_size = ImageSize(width=image_width, height=image_height)
|
||||
|
||||
padded_num_frames = num_frames + num_frames % temporal_patch_size
|
||||
padded_num_frames = round_up(num_frames, temporal_patch_size)
|
||||
|
||||
grid_t = max(padded_num_frames // temporal_patch_size, 1)
|
||||
grid_h = preprocessed_size.height // patch_size
|
||||
@@ -684,7 +686,7 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> int:
|
||||
return super().get_num_frames_with_most_features(
|
||||
seq_len, mm_counts, max_frames_per_video=_MAX_FRAMES_PER_VIDEO
|
||||
seq_len, mm_counts, max_frames_per_video=DUMMY_VIDEO_NUM_FRAMES
|
||||
)
|
||||
|
||||
def get_max_video_tokens(
|
||||
@@ -692,11 +694,17 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> int:
|
||||
target_width, target_height = self.get_image_size_with_most_features()
|
||||
video_processor = self.get_video_processor()
|
||||
video_max_pixels = video_processor.size["longest_edge"]
|
||||
# video_max_pixels contains the temporal compression factor,
|
||||
# so we divide by the temporal patch size to get the maximum number of image pixels.
|
||||
target_width, target_height = self.get_image_size_with_most_features(
|
||||
max_pixels=video_max_pixels // video_processor.temporal_patch_size
|
||||
)
|
||||
num_video_soft_tokens = self.get_num_video_tokens(
|
||||
image_width=target_width,
|
||||
image_height=target_height,
|
||||
num_frames=self.get_num_frames_with_most_features(seq_len, mm_counts),
|
||||
num_frames=2,
|
||||
image_processor=None,
|
||||
)
|
||||
return num_video_soft_tokens
|
||||
@@ -779,11 +787,12 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]):
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
video_overrides = mm_options.get("video") if mm_options else None
|
||||
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
target_num_frames = self.info.get_num_frames_with_most_features(
|
||||
seq_len, mm_counts
|
||||
target_image_width, target_image_height = (
|
||||
self.info.get_image_size_with_most_features()
|
||||
)
|
||||
|
||||
# treat videos as special images
|
||||
target_num_frames = 2
|
||||
if video_overrides:
|
||||
assert isinstance(video_overrides, VideoDummyOptions)
|
||||
num_frames_override = video_overrides.num_frames
|
||||
@@ -804,48 +813,60 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]):
|
||||
target_num_frames = min(target_num_frames, num_frames_override)
|
||||
target_num_frames = max(target_num_frames, 2)
|
||||
|
||||
video_processor = self.info.get_video_processor()
|
||||
video_max_pixels = video_processor.size["longest_edge"]
|
||||
# video_max_pixels contains the temporal compression factor,
|
||||
# so we divide by the temporal patch size to get the maximum number of image pixels.
|
||||
target_video_width, target_video_height = (
|
||||
self.info.get_image_size_with_most_features(
|
||||
max_pixels=video_max_pixels // video_processor.temporal_patch_size
|
||||
)
|
||||
)
|
||||
target_video_size, _ = self.info._get_vision_info(
|
||||
image_width=target_width,
|
||||
image_height=target_height,
|
||||
image_width=target_video_width,
|
||||
image_height=target_video_height,
|
||||
num_frames=target_num_frames,
|
||||
image_processor=self.info.get_video_processor(),
|
||||
image_processor=video_processor,
|
||||
)
|
||||
# NOTE: we need to do this check here since Qwen3-VL resizes video
|
||||
# frames depending on how many frames there are.
|
||||
width, height = target_video_size.width, target_video_size.height
|
||||
target_video_width, target_video_height = (
|
||||
target_video_size.width,
|
||||
target_video_size.height,
|
||||
)
|
||||
if video_overrides:
|
||||
assert isinstance(video_overrides, VideoDummyOptions)
|
||||
width_override = video_overrides.width
|
||||
if width_override:
|
||||
if width_override > width:
|
||||
if width_override > target_video_width:
|
||||
logger.warning(
|
||||
"video.width override (%d) exceeds model's "
|
||||
"maximum width (%d), will be ignored",
|
||||
width_override,
|
||||
width,
|
||||
target_video_width,
|
||||
)
|
||||
width = min(width, width_override)
|
||||
target_video_width = min(target_video_width, width_override)
|
||||
height_override = video_overrides.height
|
||||
if height_override:
|
||||
if height_override > height:
|
||||
if height_override > target_video_height:
|
||||
logger.warning(
|
||||
"video.height override (%d) exceeds model's "
|
||||
"maximum height (%d), will be ignored",
|
||||
height_override,
|
||||
height,
|
||||
target_video_height,
|
||||
)
|
||||
height = min(height, height_override)
|
||||
target_video_height = min(target_video_height, height_override)
|
||||
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
width=target_width,
|
||||
height=target_height,
|
||||
width=target_image_width,
|
||||
height=target_image_height,
|
||||
num_images=num_images,
|
||||
overrides=image_overrides,
|
||||
),
|
||||
"video": self._get_dummy_videos(
|
||||
width=width,
|
||||
height=height,
|
||||
width=target_video_width,
|
||||
height=target_video_height,
|
||||
num_frames=target_num_frames,
|
||||
num_videos=num_videos,
|
||||
),
|
||||
|
||||
Reference in New Issue
Block a user