[Bugfix] Add phi3v resize for dynamic shape and fix torchvision requirement (#5772)

This commit is contained in:
Isotr0py
2024-06-24 12:11:53 +08:00
committed by GitHub
parent 5d4d90536f
commit edd5fe5fa2
5 changed files with 72 additions and 5 deletions

View File

@@ -13,14 +13,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Iterable, List, Literal, Optional, Tuple, TypedDict
from typing import Dict, Iterable, List, Literal, Optional, Tuple, TypedDict
import numpy as np
import torch
import torch.nn as nn
from PIL import Image
from transformers import CLIPVisionConfig, PretrainedConfig
from vllm.attention import AttentionMetadata
from vllm.config import CacheConfig, VisionLanguageConfig
from vllm.config import CacheConfig, ModelConfig, VisionLanguageConfig
from vllm.logger import init_logger
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
@@ -32,9 +35,11 @@ from vllm.model_executor.models.llama import LlamaModel
from vllm.model_executor.models.vlm_base import VisionLanguageModelBase
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import get_dummy_image_data
from vllm.multimodal.image import ImagePixelData, get_dummy_image_data
from vllm.sequence import SamplerOutput
logger = init_logger(__name__)
_KEYS_TO_MODIFY_MAPPING = {
"model.vision_embed_tokens": "vision_embed_tokens",
}
@@ -268,7 +273,63 @@ class Phi3VImagePixelInputs(TypedDict):
"""Shape: (batch_size, 2)"""
@MULTIMODAL_REGISTRY.register_image_pixel_input()
# FIXME(Isotr0py): Remove these after dynamic num_img_tokens is supported
# copied from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py
def calc_padded_size(width, height, padding_unit=336):
target_height = int(np.ceil(height / padding_unit) * padding_unit)
top_padding = int((target_height - height) / 2)
bottom_padding = target_height - height - top_padding
padded_width = width
padded_height = height + top_padding + bottom_padding
return padded_width, padded_height
# copied from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py
def calc_hd_transform_size(width, height, hd_num=16):
transposed = False
if width < height:
width, height = height, width
transposed = True
ratio = width / height
scale = 1
while scale * np.ceil(scale / ratio) <= hd_num:
scale += 1
scale -= 1
new_width = int(scale * 336)
new_height = int(new_width / ratio)
padded_width, padded_height = calc_padded_size(new_width, new_height)
if transposed:
padded_width, padded_height = padded_height, padded_width
return padded_width, padded_height
def _image_processor(
data: ImagePixelData,
model_config: ModelConfig,
vlm_config: VisionLanguageConfig,
) -> Dict[str, torch.Tensor]:
image = data.image
if isinstance(image, Image.Image):
# Temporary patch before dynamic number of image tokens is supported
_, _, h, w = vlm_config.image_input_shape
if (w, h) != calc_hd_transform_size(image.width, image.height):
logger.warning(
"Dynamic image shape is currently not supported. "
"Resizing input image to (%d, %d).", w, h)
data.image = image.resize((w, h))
return MULTIMODAL_REGISTRY._get_plugin_for_data_type(ImagePixelData) \
._default_input_processor(data, model_config, vlm_config)
@MULTIMODAL_REGISTRY.register_image_pixel_input(_image_processor)
@MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data)
class Phi3VForCausalLM(VisionLanguageModelBase):