diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index d57186a32..edec87e6f 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -701,6 +701,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `GlmOcrForConditionalGeneration` | GLM-OCR | T + IE+ | `zai-org/GLM-OCR`, etc. | ✅︎ | ✅︎ | | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | | `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | T + I+ + V+ | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | | +| `HCXVisionV2ForCausalLM` | HyperCLOVAX-SEED-Think-32B | T + I+ + V+ | `naver-hyperclovax/HyperCLOVAX-SEED-Think-32B` | | | | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | | `HunYuanVLForConditionalGeneration` | HunyuanOCR | T + IE+ | `tencent/HunyuanOCR`, etc. | ✅︎ | ✅︎ | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | diff --git a/tests/entrypoints/openai/test_realtime_validation.py b/tests/entrypoints/openai/test_realtime_validation.py index 9a45ac293..9092aac5b 100644 --- a/tests/entrypoints/openai/test_realtime_validation.py +++ b/tests/entrypoints/openai/test_realtime_validation.py @@ -118,7 +118,7 @@ async def test_multi_chunk_streaming( # JIT compilation warmup_done = False while not warmup_done: - event = await receive_event(ws, timeout=360.0) + event = await receive_event(ws, timeout=600.0) if event["type"] in ("transcription.done", "error"): warmup_done = True diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 36e8b0c0b..015770991 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -1458,6 +1458,38 @@ def test_parse_chat_messages_context_text_format( assert mm_uuids is None +def test_parse_chat_messages_openai_format_image_url( + phi3v_model_config, + image_url, +): + content = [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "What's in the image?"}, + ] + conversation, mm_data, mm_uuids = parse_chat_messages( + [ + { + "role": "user", + "content": content, + } + ], + phi3v_model_config, + content_format="openai", + ) + + assert conversation == [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "What's in the image?"}, + ], + } + ] + _assert_mm_data_is_image_input(mm_data, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None]) + + def test_parse_chat_messages_rejects_too_many_images_in_one_message( phi3v_model_config, image_url, diff --git a/tests/models/registry.py b/tests/models/registry.py index 48e5c251d..5dd0a9f11 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -313,6 +313,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "HunYuanMoEV1ForCausalLM": _HfExamplesInfo( "tencent/Hunyuan-A13B-Instruct", trust_remote_code=True ), + "HyperCLOVAXForCausalLM": _HfExamplesInfo( + "naver-hyperclovax/HyperCLOVAX-SEED-Think-32B", + trust_remote_code=True, + ), "InternLMForCausalLM": _HfExamplesInfo( "internlm/internlm-chat-7b", trust_remote_code=True ), @@ -793,6 +797,10 @@ _MULTIMODAL_EXAMPLE_MODELS = { "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", trust_remote_code=True, ), + "HCXVisionV2ForCausalLM": _HfExamplesInfo( + "naver-hyperclovax/HyperCLOVAX-SEED-Think-32B", + trust_remote_code=True, + ), "HunYuanVLForConditionalGeneration": 
_HfExamplesInfo( "tencent/HunyuanOCR", hf_overrides={"num_experts": 0}, diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 5ffb60719..4839fc80c 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1428,6 +1428,8 @@ def _parse_chat_message_content_part( with multimodal placeholders. """ if isinstance(part, str): # Handle plain text parts + if wrap_dicts: + return {"type": "text", "text": part} return part # Handle structured dictionary parts part_type, content = _parse_chat_message_content_mm_part(part) @@ -1487,11 +1489,9 @@ def _parse_chat_message_content_part( else: raise NotImplementedError(f"Unknown part type: {part_type}") - return ( - {"type": modality} - if wrap_dicts - else (MODALITY_PLACEHOLDERS_MAP[modality] if interleave_strings else None) - ) + if wrap_dicts: + return {"type": modality} + return MODALITY_PLACEHOLDERS_MAP[modality] if interleave_strings else None # No need to validate using Pydantic again diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index 5b0dfe457..35f9cae26 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -325,7 +325,7 @@ class HCXVisionMultiModalProcessor(BaseMultiModalProcessor[HCXVisionProcessingIn hf_inputs: BatchFeature, hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: - return dict( + fields = dict( pixel_values_images=MultiModalFieldConfig.batched("image"), image_sizes_images=MultiModalFieldConfig.batched("image"), vision_query_lengths_images=MultiModalFieldConfig.batched("image"), @@ -333,6 +333,8 @@ class HCXVisionMultiModalProcessor(BaseMultiModalProcessor[HCXVisionProcessingIn vision_query_lengths_videos=MultiModalFieldConfig.batched("video"), ) + return fields + def _build_hcxvision_hf_info( ctx: InputProcessingContext, @@ -590,12 +592,26 @@ class HCXVisionCAbstractor(nn.Module): dummy_inputs=HCXVisionDummyInputsBuilder, ) class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): + """ + HyperCLOVAX-SEED Vision-Language Model (V1 architecture). + + Supports: + - HyperCLOVAX-SEED-Vision-Instruct-3B + + Uses CLIP/SigLIP as the vision encoder with C-Abstractor projector. + """ + packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"], } - def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + ) -> None: super().__init__() # init configs @@ -647,8 +663,9 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): self.vision_config = vision_config self.text_config = text_config - # use_sum_loss = bool(kwargs.pop("use_sum_loss", False)) - # self.reduction = self._init_reduction_type(use_sum_loss) + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors + ) @classmethod def get_placeholder_str(cls, modality: str, i: int) -> str | None: diff --git a/vllm/model_executor/models/hyperclovax_vision_v2.py b/vllm/model_executor/models/hyperclovax_vision_v2.py new file mode 100644 index 000000000..b32872962 --- /dev/null +++ b/vllm/model_executor/models/hyperclovax_vision_v2.py @@ -0,0 +1,690 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +HyperCLOVAX V2 (32B Think Model) Implementation. 
+ +This module contains the V2 architecture that uses Qwen2.5 Vision Transformer +instead of CLIP/SigLIP used in V1. + +Supports: +- HyperCLOVAX-SEED-Think-32B: Vision + Text +""" + +from collections.abc import Iterable, Mapping, Sequence +from functools import partial +from typing import Annotated, Literal + +import torch +import torch.nn as nn +from transformers import BatchFeature + +from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions +from vllm.forward_context import set_forward_context +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import ( + MultiModalDataDict, + MultiModalFieldConfig, + MultiModalKwargsItems, +) +from vllm.multimodal.parse import ImageSize, MultiModalDataItems +from vllm.multimodal.processing import ( + BaseDummyInputsBuilder, + BaseMultiModalProcessor, + BaseProcessingInfo, + ProcessorInputs, + PromptReplacement, + PromptUpdate, +) +from vllm.sequence import IntermediateTensors +from vllm.utils.tensor_schema import TensorSchema, TensorShape + +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP +from .qwen2_5_vl import Qwen2_5_VisionTransformer +from .utils import ( + AutoWeightsLoader, + WeightsMapper, + init_vllm_registered_model, + maybe_prefix, +) + +# V2 (32B Think model) uses different tokens - retrieved from config at runtime +# These placeholder strings must match the chat template format exactly. +# The chat template produces: <|image_start|><|IMAGE_PAD|><|image_end|> +# Similar to Qwen2-VL's <|vision_start|><|image_pad|><|vision_end|> format. +V2_IMAGE_TOKEN: str = "<|image_start|><|IMAGE_PAD|><|image_end|>" +V2_VIDEO_TOKEN: str = "<|video_start|><|VIDEO_PAD|><|video_end|>" + + +class HCXVisionV2ImagePixelInputs(TensorSchema): + """ + V2 Image inputs using Qwen2.5-VL style grid_thw format. + + Dimensions: + - np: Number of patches + - ni: Number of images + - cps: Number of channels * patch_size * patch_size + """ + + type: Literal["pixel_values"] = "pixel_values" + pixel_values: Annotated[torch.Tensor, TensorShape("np", "cps")] + image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)] + + +class HCXVisionV2ImageEmbeddingInputs(TensorSchema): + """ + V2 Image embedding inputs. + + Dimensions: + - nf: Number of image features + - hs: Hidden size + - ni: Number of images + """ + + type: Literal["image_embeds"] = "image_embeds" + image_embeds: Annotated[torch.Tensor, TensorShape("nf", "hs")] + image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)] + + +HCXVisionV2ImageInputs = HCXVisionV2ImagePixelInputs | HCXVisionV2ImageEmbeddingInputs + + +class HCXVisionV2VideoPixelInputs(TensorSchema): + """ + V2 Video inputs using Qwen2.5-VL style grid_thw format. + + Dimensions: + - np: Number of patches + - nv: Number of videos + - ctps: Number of channels * temporal_patch_size * patch_size * patch_size + """ + + type: Literal["pixel_values_videos"] = "pixel_values_videos" + pixel_values_videos: Annotated[torch.Tensor, TensorShape("np", "ctps")] + video_grid_thw: Annotated[torch.Tensor, TensorShape("nv", 3)] + + +class HCXVisionV2VideoEmbeddingInputs(TensorSchema): + """ + V2 Video embedding inputs. 
+ + Dimensions: + - nf: Number of video features + - hs: Hidden size + - nv: Number of videos + """ + + type: Literal["video_embeds"] = "video_embeds" + video_embeds: Annotated[torch.Tensor, TensorShape("nf", "hs")] + video_grid_thw: Annotated[torch.Tensor, TensorShape("nv", 3)] + + +HCXVisionV2VideoInputs = HCXVisionV2VideoPixelInputs | HCXVisionV2VideoEmbeddingInputs + + +class HCXVisionV2ProcessingInfo(BaseProcessingInfo): + """Processing info for HyperCLOVAX V2 (32B Think model).""" + + def get_supported_mm_limits(self) -> Mapping[str, int | None]: + return {"image": None, "video": None} + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + patch_size = vision_config.patch_size + spatial_merge_size = vision_config.spatial_merge_size + + grid_h = image_height // patch_size + grid_w = image_width // patch_size + + return (grid_h * grid_w) // (spatial_merge_size**2) + + def get_num_video_tokens( + self, + *, + video_width: int, + video_height: int, + num_frames: int, + ) -> int: + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + patch_size = vision_config.patch_size + temporal_patch_size = vision_config.temporal_patch_size + spatial_merge_size = vision_config.spatial_merge_size + + grid_t = num_frames // temporal_patch_size + grid_h = video_height // patch_size + grid_w = video_width // patch_size + + return (grid_t * grid_h * grid_w) // (spatial_merge_size**2) + + def get_image_size_with_most_features(self) -> ImageSize: + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + # Use a reasonable default size + size = getattr(vision_config, "image_size", 448) + return ImageSize(width=size, height=size) + + def get_max_image_tokens(self) -> int: + target_width, target_height = self.get_image_size_with_most_features() + return self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + ) + + +class HCXVisionV2DummyInputsBuilder(BaseDummyInputsBuilder[HCXVisionV2ProcessingInfo]): + """Dummy inputs builder for HyperCLOVAX V2 memory profiling.""" + + def get_dummy_text( + self, + mm_counts: Mapping[str, int], + ) -> str: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + return V2_IMAGE_TOKEN * num_images + V2_VIDEO_TOKEN * num_videos + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, + ) -> ProcessorInputs: + """Build dummy processor inputs for memory profiling.""" + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + prompt_text = V2_IMAGE_TOKEN * num_images + V2_VIDEO_TOKEN * num_videos + + dummy_mm_data = self.get_dummy_mm_data( + seq_len, + mm_counts, + mm_options, + mm_processor_kwargs=mm_processor_kwargs, + ) + dummy_mm_items = self.info.parse_mm_data(dummy_mm_data, validate=False) + + return ProcessorInputs( + prompt=prompt_text, + mm_data_items=dummy_mm_items, + hf_processor_mm_kwargs=mm_processor_kwargs or {}, + tokenization_kwargs={"truncation": False}, + ) + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + + 
target_width, target_height = self.info.get_image_size_with_most_features() + target_num_frames = 16 # Default for video + + image_overrides = mm_options.get("image") if mm_options else None + video_overrides = mm_options.get("video") if mm_options else None + + result: MultiModalDataDict = { + "image": self._get_dummy_images( + width=target_width, + height=target_height, + num_images=num_images, + overrides=image_overrides, # type: ignore + ), + "video": self._get_dummy_videos( + width=target_width, + height=target_height, + num_frames=target_num_frames, + num_videos=num_videos, + overrides=video_overrides, # type: ignore + ), + } + + return result + + +class HCXVisionV2MultiModalProcessor( + BaseMultiModalProcessor[HCXVisionV2ProcessingInfo] +): + """Multimodal processor for HyperCLOVAX V2 (32B Think model).""" + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + tok_kwargs: Mapping[str, object], + ) -> BatchFeature: + images = mm_data.get("images") + videos = mm_data.get("videos") + + # Get the HF processor + hf_processor = self.info.get_hf_processor(**mm_kwargs) + + # Build data dict for HF processor (images/videos only) + # NOTE: We pass the prompt as-is without token normalization. + # Token expansion is handled by vLLM via _get_prompt_updates since + # _hf_processor_applies_updates returns False. + data: dict[str, object] = dict( + text=prompt, + images=images, + videos=videos, + ) + + processed_outputs = self.info.ctx.call_hf_processor( + hf_processor=hf_processor, + data=data, + kwargs=dict(**mm_kwargs, **tok_kwargs), + ) + + return processed_outputs + + def _hf_processor_applies_updates( + self, + prompt_text: str, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + tokenization_kwargs: Mapping[str, object], + ) -> bool: + # Match BaseMultiModalProcessor behavior: + # - raw multimodal inputs: HF processor applies updates + # - embedding inputs: vLLM applies updates + return super()._hf_processor_applies_updates( + prompt_text, + mm_items, + hf_processor_mm_kwargs, + tokenization_kwargs, + ) + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargsItems, + ) -> Sequence[PromptUpdate]: + hf_config = self.info.get_hf_config() + + # Use token IDs directly from config. + # This matches what get_dummy_processor_inputs uses, ensuring consistency. 
+ placeholder: dict[str, int] = { + "image": hf_config.image_token_id, # 128060 for <|IMAGE_PAD|> + "video": hf_config.video_token_id, # 128061 for <|VIDEO_PAD|> + } + + merge_size = hf_config.vision_config.spatial_merge_size + + def get_replacement_v2( + item_idx: int, + modality: str, + out_mm_kwargs: MultiModalKwargsItems, + ): + out_item = out_mm_kwargs[modality][item_idx] + + if modality == "image": + grid_thw_elem = out_item.get("image_grid_thw") + if grid_thw_elem is not None: + # Access .data to get the actual tensor from MultiModalFieldElem + grid_thw = grid_thw_elem.data + # Qwen2.5-VL style calculation + h, w = grid_thw[1].item(), grid_thw[2].item() + num_tokens = (h * w) // (merge_size**2) + else: + # Fallback or error + raise ValueError("Missing image_grid_thw for V2 model") + elif modality == "video": + grid_thw_elem = out_item.get("video_grid_thw") + if grid_thw_elem is not None: + # Access .data to get the actual tensor from MultiModalFieldElem + grid_thw = grid_thw_elem.data + t, h, w = grid_thw[0].item(), grid_thw[1].item(), grid_thw[2].item() + num_tokens = (t * h * w) // (merge_size**2) + else: + raise ValueError("Missing video_grid_thw for V2 model") + else: + raise NotImplementedError(modality) + + return [placeholder[modality]] * num_tokens + + return [ + PromptReplacement( + modality=modality, + target=[ + placeholder[modality], + ], + replacement=partial( + get_replacement_v2, + modality=modality, + out_mm_kwargs=out_mm_kwargs, + ), + ) + for modality in ("image", "video") + ] + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + # HyperCLOVAX V2 uses Qwen2.5-VL style flattened pixel values where + # pixel_values has shape (num_patches, channels*patch_size*patch_size) + # while image_grid_thw has shape (num_images, 3). + # We need to use flat_from_sizes to correctly handle this mismatch. + hf_config = self.info.get_hf_config() + spatial_merge_size = hf_config.vision_config.spatial_merge_size + + image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3))) + image_pixel_grid_sizes = image_grid_thw.prod(-1) + image_embed_grid_sizes = ( + image_pixel_grid_sizes // spatial_merge_size // spatial_merge_size + ) + + video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3))) + video_pixel_grid_sizes = video_grid_thw.prod(-1) + video_embed_grid_sizes = ( + video_pixel_grid_sizes // spatial_merge_size // spatial_merge_size + ) + + return dict( + pixel_values=MultiModalFieldConfig.flat_from_sizes( + "image", image_pixel_grid_sizes + ), + image_embeds=MultiModalFieldConfig.flat_from_sizes( + "image", image_embed_grid_sizes + ), + image_grid_thw=MultiModalFieldConfig.batched("image", keep_on_cpu=True), + pixel_values_videos=MultiModalFieldConfig.flat_from_sizes( + "video", video_pixel_grid_sizes + ), + video_embeds=MultiModalFieldConfig.flat_from_sizes( + "video", video_embed_grid_sizes + ), + video_grid_thw=MultiModalFieldConfig.batched("video", keep_on_cpu=True), + ) + + +@MULTIMODAL_REGISTRY.register_processor( + HCXVisionV2MultiModalProcessor, + info=HCXVisionV2ProcessingInfo, + dummy_inputs=HCXVisionV2DummyInputsBuilder, +) +class HCXVisionV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): + """ + HyperCLOVAX-SEED Vision-Language Model (V2 architecture). + + Supports: + - HyperCLOVAX-SEED-Think-32B: Vision + Text + + Uses Qwen2.5 Vision Transformer as the vision encoder. 
+ """ + + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + "qkv": ["qkv"], # For vision tower + } + + # Weight mapping for loading HuggingFace checkpoints + # NOTE: Order matters! Ignores (None) should come before renames to prevent + # partial matches + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "model.": "", # Remove model. prefix if present + "vision_model.": "visual.", # HF uses vision_model, we use visual + }, + orig_to_new_substr={ + # Ignore modules not implemented in vLLM + "discrete_vision_model": None, # TextAlignedTokenizer + }, + ) + + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + ) -> None: + super().__init__() + + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + + # Text config + text_config = config.text_config + if text_config.model_type in ["gpt2", "hyperclovax", "llama"]: + text_config._attn_implementation = "sdpa" + if text_config.model_type != "hyperclovax": + text_config.logits_scaling = 1.0 + + # Vision config + vision_config = config.vision_config + + self.config = config + self.vision_config = vision_config + self.text_config = text_config + self.vllm_config = vllm_config + self.dtype = vllm_config.model_config.dtype + + # Initialize Qwen2.5 Vision Transformer + self.visual = Qwen2_5_VisionTransformer( + vision_config=vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=quant_config, + prefix=maybe_prefix(prefix, "visual"), + ) + + # Linear projector (vision_hidden_size -> text_hidden_size) + # For V2 model: mm_projector_type is "linear" + vision_hidden_size = vision_config.hidden_size + text_hidden_size = text_config.hidden_size + + # Check if out_hidden_size is defined (Qwen2.5-VL style) + # The merger in Qwen2.5 VisionTransformer handles projection to out_hidden_size + if hasattr(vision_config, "out_hidden_size"): + out_hidden = vision_config.out_hidden_size + else: + out_hidden = vision_hidden_size + + # Always create Linear projector since HF checkpoint has mm_projector weights + self.mm_projector = nn.Linear(out_hidden, text_hidden_size) + + # Language model + self.lm_head_vocab_size = getattr( + text_config, "padded_vocab_size", text_config.vocab_size + ) + self.language_model = init_vllm_registered_model( + vllm_config=vllm_config, + hf_config=text_config, + prefix=maybe_prefix(prefix, "language_model"), + ) + + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors + ) + + @classmethod + def get_placeholder_str(cls, modality: str, i: int) -> str | None: + if modality.startswith("image"): + return V2_IMAGE_TOKEN + if modality.startswith("video"): + return V2_VIDEO_TOKEN + + raise ValueError("Only image or video modality is supported") + + def _parse_and_validate_image_input( + self, + **kwargs: object, + ) -> HCXVisionV2ImageInputs | None: + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + image_grid_thw = kwargs.pop("image_grid_thw", None) + + if pixel_values is None and image_embeds is None: + return None + + if pixel_values is not None: + return HCXVisionV2ImagePixelInputs( + pixel_values=pixel_values, + image_grid_thw=image_grid_thw, + ) + + if image_embeds is not None: + return HCXVisionV2ImageEmbeddingInputs( + image_embeds=image_embeds, + image_grid_thw=image_grid_thw, + ) + + return None + + def _parse_and_validate_video_input( + self, + **kwargs: object, + ) -> 
HCXVisionV2VideoInputs | None: + pixel_values_videos = kwargs.pop("pixel_values_videos", None) + video_embeds = kwargs.pop("video_embeds", None) + video_grid_thw = kwargs.pop("video_grid_thw", None) + + if pixel_values_videos is None and video_embeds is None: + return None + + if pixel_values_videos is not None: + return HCXVisionV2VideoPixelInputs( + pixel_values_videos=pixel_values_videos, + video_grid_thw=video_grid_thw, + ) + + if video_embeds is not None: + return HCXVisionV2VideoEmbeddingInputs( + video_embeds=video_embeds, + video_grid_thw=video_grid_thw, + ) + + return None + + def _process_image_input( + self, + image_input: HCXVisionV2ImageInputs, + ) -> tuple[torch.Tensor, ...]: + """Process images through Qwen2.5 ViT and projector.""" + grid_thw = image_input["image_grid_thw"] + assert grid_thw.ndim == 2 + grid_thw_list = grid_thw.tolist() + + if image_input["type"] == "image_embeds": + image_embeds = image_input["image_embeds"].type(self.visual.dtype) + else: + pixel_values = image_input["pixel_values"] + with set_forward_context(None, self.vllm_config): + image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list) + + # Apply projector + image_embeds = self.mm_projector(image_embeds) + + # Split concatenated embeddings for each image + merge_size = self.visual.spatial_merge_size + sizes = (grid_thw.prod(-1) // merge_size // merge_size).tolist() + return image_embeds.split(sizes) + + def _process_video_input( + self, + video_input: HCXVisionV2VideoInputs, + ) -> tuple[torch.Tensor, ...]: + """Process videos through Qwen2.5 ViT and projector.""" + grid_thw = video_input["video_grid_thw"] + assert grid_thw.ndim == 2 + grid_thw_list = grid_thw.tolist() + + if video_input["type"] == "video_embeds": + video_embeds = video_input["video_embeds"].type(self.visual.dtype) + else: + pixel_values_videos = video_input["pixel_values_videos"] + with set_forward_context(None, self.vllm_config): + video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw_list) + + # Apply projector + video_embeds = self.mm_projector(video_embeds) + + # Split concatenated embeddings for each video + merge_size = self.visual.spatial_merge_size + sizes = (grid_thw.prod(-1) // merge_size // merge_size).tolist() + return video_embeds.split(sizes) + + def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: + modalities = {} + + for input_key in kwargs: + if ( + input_key in ("pixel_values", "image_embeds") + and "image" not in modalities + ): + modalities["image"] = self._parse_and_validate_image_input(**kwargs) + if ( + input_key in ("pixel_values_videos", "video_embeds") + and "video" not in modalities + ): + modalities["video"] = self._parse_and_validate_video_input(**kwargs) + + return modalities + + def get_language_model(self) -> torch.nn.Module: + return self.language_model + + def embed_multimodal( + self, + **kwargs: object, + ) -> MultiModalEmbeddings: + modalities = self._parse_and_validate_multimodal_inputs(**kwargs) + if not modalities: + return [] + + multimodal_embeddings: tuple[torch.Tensor, ...] 
= () + + for modality in modalities: + if modality == "image": + image_input = modalities["image"] + if image_input is not None: + image_embeddings = self._process_image_input(image_input) + multimodal_embeddings += tuple(image_embeddings) + if modality == "video": + video_input = modalities["video"] + if video_input is not None: + video_embeddings = self._process_video_input(video_input) + multimodal_embeddings += tuple(video_embeddings) + + return multimodal_embeddings + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs: object, + ) -> torch.Tensor | IntermediateTensors: + if intermediate_tensors is not None: + inputs_embeds = None + + hidden_states = self.language_model.model( + input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor | None: + return self.language_model.compute_logits(hidden_states) + + def load_weights( + self, + weights: Iterable[tuple[str, torch.Tensor]], + ) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 29ca31875..46437adf4 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -132,6 +132,8 @@ _TEXT_GENERATION_MODELS = { "HunYuanMoEV1ForCausalLM": ("hunyuan_v1", "HunYuanMoEV1ForCausalLM"), "HunYuanDenseV1ForCausalLM": ("hunyuan_v1", "HunYuanDenseV1ForCausalLM"), "HCXVisionForCausalLM": ("hyperclovax_vision", "HCXVisionForCausalLM"), + "HCXVisionV2ForCausalLM": ("hyperclovax_vision_v2", "HCXVisionV2ForCausalLM"), + "HyperCLOVAXForCausalLM": ("llama", "LlamaForCausalLM"), "InternLMForCausalLM": ("llama", "LlamaForCausalLM"), "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"), "InternLM2VEForCausalLM": ("internlm2_ve", "InternLM2VEForCausalLM"),
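
The placeholder handling above mirrors Qwen2.5-VL: each <|IMAGE_PAD|> in the prompt is expanded to one token per spatial_merge_size x spatial_merge_size block of ViT patches, as computed in HCXVisionV2ProcessingInfo.get_num_image_tokens and _get_prompt_updates. The sketch below restates that arithmetic standalone for review; patch_size=14 and spatial_merge_size=2 are assumed example values, not values read from the released checkpoint's vision_config.

def num_image_pad_tokens(
    image_width: int,
    image_height: int,
    patch_size: int = 14,          # assumed example value; the model reads this from vision_config
    spatial_merge_size: int = 2,   # assumed example value; the model reads this from vision_config
) -> int:
    # One placeholder token per merged patch block (Qwen2.5-VL grid_thw convention).
    grid_h = image_height // patch_size
    grid_w = image_width // patch_size
    return (grid_h * grid_w) // (spatial_merge_size**2)

# A 448x448 image gives a 32x32 patch grid (1024 patches) -> 256 placeholder tokens,
# so "<|image_start|><|IMAGE_PAD|><|image_end|>" expands to
# "<|image_start|>" + "<|IMAGE_PAD|>" * 256 + "<|image_end|>".
# For videos, _get_prompt_updates additionally multiplies by
# grid_t = num_frames // temporal_patch_size before dividing by spatial_merge_size**2.
assert num_image_pad_tokens(448, 448) == 256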