[Model] Add Phi4ForCausalLMV for microsoft/Phi-4-reasoning-vision-15B (#38306)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com> Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
This commit is contained in:
committed by
GitHub
parent
201d2ea5bf
commit
2ad7c0335f
@@ -18,5 +18,6 @@ steps:
|
||||
# Avoid importing model tests that cause CUDA reinitialization error
|
||||
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
|
||||
- pytest models/multimodal/generation/test_phi4siglip.py -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/generation/test_phi4siglip.py
|
||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
|
||||
|
||||
187
tests/models/multimodal/generation/test_phi4siglip.py
Normal file
187
tests/models/multimodal/generation/test_phi4siglip.py
Normal file
@@ -0,0 +1,187 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
import pytest
|
||||
import regex as re
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
|
||||
from ....conftest import (
|
||||
IMAGE_ASSETS,
|
||||
HfRunner,
|
||||
PromptImageInput,
|
||||
VllmRunner,
|
||||
)
|
||||
from ....utils import multi_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
MODEL_ID = "microsoft/Phi-4-reasoning-vision-15B"
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<|user|>\n<image>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
"cherry_blossom": "<|user|>\n<image>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
}
|
||||
)
|
||||
HF_MULTIIMAGE_IMAGE_PROMPT = (
|
||||
"<|user|>\n<image>\n<image>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
|
||||
)
|
||||
|
||||
DTYPE = "half"
|
||||
MAX_TOKENS = 128
|
||||
NUM_LOGPROBS = 10
|
||||
|
||||
|
||||
def vllm_to_hf_output(
|
||||
vllm_output: tuple[list[int], str, SampleLogprobs | None], model: str
|
||||
):
|
||||
"""Sanitize vllm output to be comparable with hf output."""
|
||||
_, output_str, out_logprobs = vllm_output
|
||||
|
||||
output_str_without_image = re.sub(r"(<image>)+", "", output_str)
|
||||
if output_str_without_image and output_str_without_image[0] == " ":
|
||||
output_str_without_image = output_str_without_image[1:]
|
||||
|
||||
hf_output_str = output_str_without_image + "<|end|><|endoftext|>"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
|
||||
hf_output_ids = tokenizer.encode(output_str_without_image)
|
||||
if hf_output_ids and hf_output_ids[0] == tokenizer.bos_token_id:
|
||||
hf_output_ids = hf_output_ids[1:]
|
||||
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def _build_single_image_inputs(
|
||||
image_assets,
|
||||
) -> list[tuple[list[str], PromptImageInput]]:
|
||||
"""Build single-image inputs for all size_factors at once."""
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
all_inputs: list[tuple[list[str], PromptImageInput]] = []
|
||||
for size_factors in [[1.0], [0.25, 0.5, 1.0]]:
|
||||
for image, prompt in zip(images, HF_IMAGE_PROMPTS):
|
||||
all_inputs.append(
|
||||
(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, f) for f in size_factors],
|
||||
)
|
||||
)
|
||||
return all_inputs
|
||||
|
||||
|
||||
def _build_multi_image_inputs(
|
||||
image_assets,
|
||||
) -> list[tuple[list[str], PromptImageInput]]:
|
||||
"""Build multi-image inputs for all size_factors at once."""
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
all_inputs: list[tuple[list[str], PromptImageInput]] = []
|
||||
for size_factors in [[0.5], [0.15, 0.30]]:
|
||||
all_inputs.append(
|
||||
(
|
||||
[HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
|
||||
[
|
||||
[rescale_image_size(image, factor) for image in images]
|
||||
for factor in size_factors
|
||||
],
|
||||
)
|
||||
)
|
||||
return all_inputs
|
||||
|
||||
|
||||
def _run_and_compare(
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
all_inputs: Sequence[tuple[list[str], PromptImageInput]],
|
||||
model: str,
|
||||
max_model_len: int,
|
||||
max_num_seqs: int,
|
||||
mm_limit: int,
|
||||
gpu_memory_utilization: float,
|
||||
):
|
||||
"""Load each runner once, run all inputs, then compare."""
|
||||
# NOTE: run vLLM first, then HF. vLLM needs a fresh process without
|
||||
# cuda initialization; running HF first would break the multiprocessing
|
||||
# backend with fork method.
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="generate",
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=max_num_seqs,
|
||||
gpu_memory_utilization=gpu_memory_utilization,
|
||||
dtype=DTYPE,
|
||||
limit_mm_per_prompt={"image": mm_limit},
|
||||
tensor_parallel_size=2,
|
||||
trust_remote_code=True,
|
||||
enforce_eager=True,
|
||||
) as vllm_model:
|
||||
vllm_outputs_per_case = [
|
||||
vllm_model.generate_greedy_logprobs(
|
||||
prompts,
|
||||
MAX_TOKENS,
|
||||
num_logprobs=NUM_LOGPROBS,
|
||||
images=images,
|
||||
)
|
||||
for prompts, images in all_inputs
|
||||
]
|
||||
|
||||
hf_model_kwargs = {"_attn_implementation": "sdpa", "device_map": "auto"}
|
||||
with hf_runner(
|
||||
model,
|
||||
dtype=DTYPE,
|
||||
model_kwargs=hf_model_kwargs,
|
||||
auto_cls=AutoModelForCausalLM,
|
||||
trust_remote_code=True,
|
||||
) as hf_model:
|
||||
hf_outputs_per_case = [
|
||||
hf_model.generate_greedy_logprobs_limit(
|
||||
prompts,
|
||||
MAX_TOKENS,
|
||||
num_logprobs=NUM_LOGPROBS,
|
||||
images=images,
|
||||
)
|
||||
for prompts, images in all_inputs
|
||||
]
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize("model", [MODEL_ID])
|
||||
def test_models(hf_runner, vllm_runner, image_assets, model) -> None:
|
||||
all_inputs = _build_single_image_inputs(image_assets)
|
||||
_run_and_compare(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
all_inputs,
|
||||
model,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
mm_limit=1,
|
||||
gpu_memory_utilization=0.80,
|
||||
)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize("model", [MODEL_ID])
|
||||
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model) -> None:
|
||||
all_inputs = _build_multi_image_inputs(image_assets)
|
||||
_run_and_compare(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
all_inputs,
|
||||
model,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
mm_limit=2,
|
||||
gpu_memory_utilization=0.80,
|
||||
)
|
||||
@@ -1049,6 +1049,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
}, # noqa: E501
|
||||
extras={"phi3.5": "microsoft/Phi-3.5-vision-instruct"},
|
||||
),
|
||||
"Phi4ForCausalLMV": _HfExamplesInfo(
|
||||
"microsoft/Phi-4-reasoning-vision-15B", trust_remote_code=True
|
||||
),
|
||||
"Phi4MMForCausalLM": _HfExamplesInfo(
|
||||
"microsoft/Phi-4-multimodal-instruct", trust_remote_code=True
|
||||
),
|
||||
|
||||
429
vllm/model_executor/models/phi4siglip.py
Normal file
429
vllm/model_executor/models/phi4siglip.py
Normal file
@@ -0,0 +1,429 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""vLLM support for microsoft/Phi-4-reasoning-vision-15B.
|
||||
|
||||
Architecture: Siglip2 vision tower + MLP projector + Phi3 language model.
|
||||
"""
|
||||
|
||||
import math
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from typing import Annotated, Any, Literal
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from transformers import BatchFeature, PretrainedConfig, Siglip2VisionConfig
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions
|
||||
from vllm.inputs import MultiModalDataDict
|
||||
from vllm.logger import init_logger
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import (
|
||||
MultiModalFieldConfig,
|
||||
MultiModalKwargsItems,
|
||||
)
|
||||
from vllm.multimodal.parse import (
|
||||
ImageSize,
|
||||
MultiModalDataItems,
|
||||
)
|
||||
from vllm.multimodal.processing import (
|
||||
BaseDummyInputsBuilder,
|
||||
PromptReplacement,
|
||||
PromptUpdate,
|
||||
)
|
||||
from vllm.multimodal.processing.processor import (
|
||||
BaseMultiModalProcessor,
|
||||
BaseProcessingInfo,
|
||||
)
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
|
||||
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
|
||||
from .lfm2_siglip2 import Siglip2Model
|
||||
from .llava import LlavaMultiModalProjector
|
||||
from .utils import (
|
||||
AutoWeightsLoader,
|
||||
WeightsMapper,
|
||||
init_vllm_registered_model,
|
||||
maybe_prefix,
|
||||
)
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
IMAGE_TOKEN_INDEX = -200
|
||||
DEFAULT_IMAGE_TOKEN = "<image>"
|
||||
|
||||
# The HF processor replaces "<image>" with IMAGE_TOKEN_INDEX (-200) in input_ids.
|
||||
# Negative token IDs cause OverflowError during decoding, so we remap to a real
|
||||
# in-vocabulary token. The Phi-4-reasoning-vision tokenizer ships with reserved
|
||||
# dummy tokens (<|dummy_0|> … <|dummy_83|>); we reuse the first one as the
|
||||
# image placeholder. This mirrors how Phi-3-vision uses its dedicated <|image|>
|
||||
# token (ID 32044).
|
||||
_IMAGE_TOKEN_ID = 100256 # <|dummy_0|> in the Phi-4 tokenizer
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Processing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class Phi4SiglipProcessingInfo(BaseProcessingInfo):
|
||||
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
|
||||
return {"image": None}
|
||||
|
||||
def _get_vision_config(self) -> dict:
|
||||
return self.get_hf_config().vision_config # type: ignore[attr-defined]
|
||||
|
||||
def _get_patch_size(self) -> int:
|
||||
vc = self._get_vision_config()
|
||||
if isinstance(vc, dict):
|
||||
return vc.get("patch_size", 16)
|
||||
return getattr(vc, "patch_size", 16)
|
||||
|
||||
def _get_max_num_patches(self) -> int:
|
||||
return getattr(self.get_hf_config(), "max_num_patches", 3600)
|
||||
|
||||
def _get_min_num_patches(self) -> int:
|
||||
return getattr(self.get_hf_config(), "min_num_patches", 256)
|
||||
|
||||
def get_num_image_tokens(
|
||||
self,
|
||||
*,
|
||||
image_width: int,
|
||||
image_height: int,
|
||||
) -> int:
|
||||
patch_size = self._get_patch_size()
|
||||
min_patches = self._get_min_num_patches()
|
||||
max_patches = self._get_max_num_patches()
|
||||
|
||||
num_patches_h = image_height // patch_size
|
||||
num_patches_w = image_width // patch_size
|
||||
num_patches = max(num_patches_h * num_patches_w, 1)
|
||||
num_patches = max(min(num_patches, max_patches), min_patches)
|
||||
return num_patches
|
||||
|
||||
def get_image_size_with_most_features(self) -> ImageSize:
|
||||
patch_size = self._get_patch_size()
|
||||
max_patches = self._get_max_num_patches()
|
||||
side = int(math.sqrt(max_patches)) * patch_size
|
||||
return ImageSize(width=side, height=side)
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
self, seq_len: int, mm_counts: Mapping[str, int]
|
||||
) -> Mapping[str, int]:
|
||||
return {"image": self._get_max_num_patches()}
|
||||
|
||||
|
||||
class Phi4SiglipDummyInputsBuilder(
|
||||
BaseDummyInputsBuilder[Phi4SiglipProcessingInfo],
|
||||
):
|
||||
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
return DEFAULT_IMAGE_TOKEN * num_images
|
||||
|
||||
def get_dummy_mm_data(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
size = self.info.get_image_size_with_most_features()
|
||||
return {
|
||||
"image": self._get_dummy_images(
|
||||
width=size.width,
|
||||
height=size.height,
|
||||
num_images=num_images,
|
||||
overrides=mm_options.get("image"),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
class Phi4SiglipMultiModalProcessor(
|
||||
BaseMultiModalProcessor[Phi4SiglipProcessingInfo],
|
||||
):
|
||||
def _call_hf_processor(
|
||||
self,
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
processed = super()._call_hf_processor(
|
||||
prompt=prompt,
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
|
||||
# The HF processor's tokenizer_image_token() replaces the "<image>"
|
||||
# string with IMAGE_TOKEN_INDEX (-200) in input_ids. This breaks
|
||||
# vLLM's prompt-replacement pipeline which needs to find "<image>"
|
||||
# as normal sub-tokens. Re-tokenize with the plain tokenizer so
|
||||
# that "<image>" stays as sub-tokens and can be located by
|
||||
# PromptReplacement.
|
||||
# NOTE: tokenizer.__call__() (not .encode()) must be used so that
|
||||
# added/special tokens like <|user|>, <|end|> are kept as single IDs.
|
||||
tokenizer = self.info.get_tokenizer()
|
||||
new_ids = tokenizer(prompt).input_ids
|
||||
processed["input_ids"] = torch.tensor([new_ids])
|
||||
|
||||
return processed
|
||||
|
||||
def _hf_processor_applies_updates(
|
||||
self,
|
||||
prompt_text: str,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Mapping[str, object],
|
||||
) -> bool:
|
||||
# The HF processor replaces "<image>" with a single -200 placeholder
|
||||
# but does NOT expand it into N vision-encoder tokens. Since we also
|
||||
# re-tokenize the prompt (see _call_hf_processor), prompt updates are
|
||||
# never applied by the HF processor — vLLM handles the expansion via
|
||||
# _apply_prompt_updates.
|
||||
return False
|
||||
|
||||
def _get_mm_fields_config(
|
||||
self,
|
||||
hf_inputs: BatchFeature,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
) -> Mapping[str, MultiModalFieldConfig]:
|
||||
return dict(
|
||||
pixel_values=MultiModalFieldConfig.batched("image"),
|
||||
pixel_attention_mask=MultiModalFieldConfig.batched("image"),
|
||||
spatial_shapes=MultiModalFieldConfig.batched("image", keep_on_cpu=True),
|
||||
)
|
||||
|
||||
def _get_prompt_updates(
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, Any],
|
||||
out_mm_kwargs: MultiModalKwargsItems,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
def get_replacement(item_idx: int):
|
||||
# Read the actual patch grid from the NaFlex processor's
|
||||
# spatial_shapes output (same pattern as LFM2-VL). This avoids
|
||||
# predicting from raw image dimensions, which can diverge from
|
||||
# the NaFlex resize/tile logic.
|
||||
out_item = out_mm_kwargs["image"][item_idx]
|
||||
spatial_shapes = out_item["spatial_shapes"].data
|
||||
assert isinstance(spatial_shapes, torch.Tensor)
|
||||
num_tokens = int(spatial_shapes.prod().item())
|
||||
return [_IMAGE_TOKEN_ID] * num_tokens
|
||||
|
||||
return [
|
||||
PromptReplacement(
|
||||
modality="image",
|
||||
target=DEFAULT_IMAGE_TOKEN,
|
||||
replacement=get_replacement,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Input schemas
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class Phi4SiglipImagePixelInputs(TensorSchema):
|
||||
"""
|
||||
Dimensions:
|
||||
- bn: Batch size * number of images
|
||||
- d: Max number of patches (padded across images in the batch)
|
||||
- fd: Features per patch (patch_size * patch_size * channels)
|
||||
"""
|
||||
|
||||
type: Literal["pixel_values"] = "pixel_values"
|
||||
pixel_values: Annotated[torch.Tensor, TensorShape("bn", "d", "fd")]
|
||||
pixel_attention_mask: Annotated[torch.Tensor, TensorShape("bn", "d")]
|
||||
spatial_shapes: Annotated[torch.Tensor, TensorShape("bn", 2)]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Model
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@MULTIMODAL_REGISTRY.register_processor(
|
||||
Phi4SiglipMultiModalProcessor,
|
||||
info=Phi4SiglipProcessingInfo,
|
||||
dummy_inputs=Phi4SiglipDummyInputsBuilder,
|
||||
)
|
||||
class Phi4ForCausalLMV(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
hf_to_vllm_mapper = WeightsMapper(
|
||||
orig_to_new_prefix={
|
||||
"model.vision_tower.vision_tower.vision_model.head.": None,
|
||||
"model.vision_tower.vision_tower.": "vision_tower.",
|
||||
"model.mm_projector.0.": "multi_modal_projector.linear_1.",
|
||||
"model.mm_projector.2.": "multi_modal_projector.linear_2.",
|
||||
"lm_head.": "language_model.lm_head.",
|
||||
"model.": "language_model.model.",
|
||||
},
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def get_placeholder_str(cls, modality: str, i: int) -> str | None:
|
||||
if modality.startswith("image"):
|
||||
return DEFAULT_IMAGE_TOKEN
|
||||
raise ValueError("Only image modality is supported")
|
||||
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
|
||||
super().__init__()
|
||||
|
||||
config: PretrainedConfig = vllm_config.model_config.hf_config
|
||||
quant_config = vllm_config.quant_config
|
||||
self.config = config
|
||||
|
||||
vision_config_dict: dict = getattr(config, "vision_config", {})
|
||||
if isinstance(vision_config_dict, dict):
|
||||
if "patch_size" not in vision_config_dict:
|
||||
vision_config_dict["patch_size"] = 16
|
||||
siglip2_config = Siglip2VisionConfig(**vision_config_dict)
|
||||
else:
|
||||
siglip2_config = vision_config_dict
|
||||
|
||||
vision_hidden_size: int = config.mm_hidden_size # type: ignore[attr-defined]
|
||||
text_hidden_size: int = config.hidden_size # type: ignore[attr-defined]
|
||||
|
||||
with self._mark_tower_model(vllm_config, "image"):
|
||||
layer_idx = -2
|
||||
num_hidden_layers = siglip2_config.num_hidden_layers + layer_idx + 1
|
||||
|
||||
self.vision_tower = Siglip2Model(
|
||||
siglip2_config,
|
||||
quant_config=quant_config,
|
||||
num_hidden_layers_override=num_hidden_layers,
|
||||
require_post_norm=False,
|
||||
prefix=maybe_prefix(prefix, "vision_tower"),
|
||||
)
|
||||
self.multi_modal_projector = LlavaMultiModalProjector(
|
||||
vision_hidden_size=vision_hidden_size,
|
||||
text_hidden_size=text_hidden_size,
|
||||
projector_hidden_act="gelu",
|
||||
multimodal_projector_bias=True,
|
||||
quant_config=quant_config,
|
||||
prefix=maybe_prefix(prefix, "multi_modal_projector"),
|
||||
)
|
||||
|
||||
with self._mark_language_model(vllm_config):
|
||||
self.language_model = init_vllm_registered_model(
|
||||
vllm_config=vllm_config,
|
||||
hf_config=config,
|
||||
prefix=maybe_prefix(prefix, "language_model"),
|
||||
architectures=["Phi3ForCausalLM"],
|
||||
)
|
||||
|
||||
self.make_empty_intermediate_tensors = (
|
||||
self.language_model.make_empty_intermediate_tensors
|
||||
)
|
||||
|
||||
self.configure_mm_token_handling(
|
||||
vocab_size=config.vocab_size, # type: ignore[attr-defined]
|
||||
mm_token_ids=[_IMAGE_TOKEN_ID],
|
||||
)
|
||||
|
||||
def _packed_from_padded(
|
||||
self,
|
||||
pixel_values: torch.Tensor,
|
||||
pixel_attention_mask: torch.Tensor,
|
||||
spatial_shapes: torch.Tensor,
|
||||
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
"""Convert padded NaFlex tensors to packed format for Siglip2Model."""
|
||||
valid_counts = pixel_attention_mask.sum(dim=1).to(torch.int32)
|
||||
pixel_values_packed = pixel_values[pixel_attention_mask.bool()]
|
||||
cu_seqlens = torch.zeros(
|
||||
len(valid_counts) + 1,
|
||||
dtype=torch.int32,
|
||||
device=pixel_values.device,
|
||||
)
|
||||
cu_seqlens[1:] = valid_counts.cumsum(0)
|
||||
max_seqlen = valid_counts.max()
|
||||
return (
|
||||
pixel_values_packed,
|
||||
spatial_shapes,
|
||||
cu_seqlens,
|
||||
max_seqlen,
|
||||
)
|
||||
|
||||
def _parse_and_validate_image_input(
|
||||
self, **kwargs: object
|
||||
) -> Phi4SiglipImagePixelInputs | None:
|
||||
pixel_values = kwargs.pop("pixel_values", None)
|
||||
pixel_attention_mask = kwargs.pop("pixel_attention_mask", None)
|
||||
spatial_shapes = kwargs.pop("spatial_shapes", None)
|
||||
if pixel_values is None:
|
||||
return None
|
||||
|
||||
return Phi4SiglipImagePixelInputs(
|
||||
type="pixel_values",
|
||||
pixel_values=pixel_values,
|
||||
pixel_attention_mask=pixel_attention_mask,
|
||||
spatial_shapes=spatial_shapes,
|
||||
)
|
||||
|
||||
def _process_image_input(
|
||||
self, image_input: Phi4SiglipImagePixelInputs
|
||||
) -> MultiModalEmbeddings:
|
||||
pixel_values = image_input["pixel_values"]
|
||||
pixel_attention_mask = image_input["pixel_attention_mask"]
|
||||
spatial_shapes = image_input["spatial_shapes"]
|
||||
|
||||
(
|
||||
pixel_values_packed,
|
||||
spatial_shapes_packed,
|
||||
cu_seqlens,
|
||||
max_seqlen,
|
||||
) = self._packed_from_padded(pixel_values, pixel_attention_mask, spatial_shapes)
|
||||
|
||||
vision_features = self.vision_tower(
|
||||
pixel_values_packed=pixel_values_packed,
|
||||
spatial_shapes=spatial_shapes_packed,
|
||||
cu_seqlens=cu_seqlens,
|
||||
max_seqlen=max_seqlen,
|
||||
select_layers=[-2],
|
||||
)
|
||||
|
||||
if vision_features.dim() == 3:
|
||||
vision_features = vision_features.squeeze(0)
|
||||
|
||||
image_features = self.multi_modal_projector(vision_features)
|
||||
|
||||
valid_counts = pixel_attention_mask.sum(dim=1).tolist()
|
||||
return torch.split(image_features, [int(c) for c in valid_counts])
|
||||
|
||||
def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
|
||||
image_input = self._parse_and_validate_image_input(**kwargs)
|
||||
if image_input is None:
|
||||
return []
|
||||
|
||||
return self._process_image_input(image_input)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.Tensor | None,
|
||||
positions: torch.Tensor,
|
||||
intermediate_tensors: IntermediateTensors | None = None,
|
||||
inputs_embeds: torch.Tensor | None = None,
|
||||
**kwargs: object,
|
||||
) -> torch.Tensor | IntermediateTensors:
|
||||
if intermediate_tensors is not None:
|
||||
inputs_embeds = None
|
||||
|
||||
hidden_states = self.language_model.model(
|
||||
input_ids,
|
||||
positions,
|
||||
intermediate_tensors,
|
||||
inputs_embeds=inputs_embeds,
|
||||
)
|
||||
return hidden_states
|
||||
|
||||
def compute_logits(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
) -> torch.Tensor | None:
|
||||
return self.language_model.compute_logits(hidden_states)
|
||||
|
||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
|
||||
@@ -481,6 +481,7 @@ _MULTIMODAL_MODELS = {
|
||||
"PaliGemmaForConditionalGeneration",
|
||||
),
|
||||
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
|
||||
"Phi4ForCausalLMV": ("phi4siglip", "Phi4ForCausalLMV"),
|
||||
"Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
|
||||
"PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"),
|
||||
"QwenVLForConditionalGeneration": ("qwen_vl", "QwenVLForConditionalGeneration"),
|
||||
|
||||
Reference in New Issue
Block a user