[Models] Add remaining model PP support (#7168)
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
Signed-off-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 303d44790a
commit 0f6d7a9a34
@@ -1,4 +1,5 @@
 import math
+from functools import cached_property
 from typing import (Iterable, List, Literal, Mapping, Optional, Tuple,
                     TypedDict, Union)
 
@@ -17,9 +18,8 @@ from vllm.config import CacheConfig, MultiModalConfig
 from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
-from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -31,7 +31,7 @@ from vllm.utils import is_list_of
 from .clip import (CLIPVisionModel, dummy_seq_data_for_clip,
                    dummy_video_for_clip, get_clip_image_feature_size,
                    get_clip_patch_grid_length, input_processor_for_clip)
-from .interfaces import SupportsMultiModal
+from .interfaces import SupportsMultiModal, SupportsPP
 from .siglip import (SiglipVisionModel, dummy_seq_data_for_siglip,
                      dummy_video_for_siglip, get_siglip_image_feature_size,
                      get_siglip_patch_grid_length, input_processor_for_siglip)
@@ -414,7 +414,8 @@ class LlavaOnevisionMultiModalProjector(nn.Module):
     "video", get_max_llava_onevision_video_tokens)
 @INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava_onevision)
 @INPUT_REGISTRY.register_input_processor(input_processor_for_llava_onevision)
-class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal):
+class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
+                                             SupportsPP):
 
     def __init__(self,
                  config: LlavaOnevisionConfig,
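The only change to the class declaration is the extra SupportsPP base, which marks the model as pipeline-parallel capable. As a rough, hypothetical sketch (not vLLM's actual .interfaces definition), such a marker can be read as a protocol requiring the make_empty_intermediate_tensors hook that the next hunk wires up:

from typing import Any, Callable, Protocol


class ToySupportsPP(Protocol):
    """Hypothetical stand-in for a SupportsPP-style marker interface."""

    # Hook a pipeline-parallel runner calls to build placeholder tensors for
    # ranks that receive activations from the previous rank instead of ids.
    make_empty_intermediate_tensors: Callable[..., Any]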
@@ -434,6 +435,16 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal):
         self.image_newline = nn.Parameter(
             torch.empty(config.text_config.hidden_size))
 
+        self.make_empty_intermediate_tensors = (
+            self.language_model.model.make_empty_intermediate_tensors)
+
+    @cached_property
+    def sampler(self):
+        if hasattr(self.language_model, "sampler"):
+            return self.language_model.sampler
+
+        return Sampler()
+
     def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor:
         expected_dims = (2, )
 
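The constructor addition above re-exports the inner language model's make_empty_intermediate_tensors hook and lazily reuses its sampler. A minimal, self-contained sketch of that delegation pattern (toy classes, not vLLM code) looks like this:

from functools import cached_property


class ToySampler:
    """Hypothetical stand-in for vLLM's Sampler."""


class ToyLanguageModel:
    """Hypothetical stand-in for the wrapped language model."""

    def __init__(self):
        # Hook that builds placeholder intermediate tensors for pipeline
        # ranks that do not start from token ids.
        self.make_empty_intermediate_tensors = lambda *args, **kwargs: {}
        self.sampler = ToySampler()


class ToyMultiModalWrapper:
    def __init__(self):
        self.language_model = ToyLanguageModel()
        # Re-export the inner model's hook so the wrapper itself satisfies
        # the pipeline-parallel interface.
        self.make_empty_intermediate_tensors = (
            self.language_model.make_empty_intermediate_tensors)

    @cached_property
    def sampler(self):
        # Reuse the inner model's sampler when it has one; otherwise build
        # a fresh sampler once and cache it.
        if hasattr(self.language_model, "sampler"):
            return self.language_model.sampler
        return ToySampler()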
@@ -805,39 +816,42 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal):
         attn_metadata: AttentionMetadata,
         intermediate_tensors: Optional[IntermediateTensors] = None,
         **kwargs: object,
-    ) -> SamplerOutput:
+    ) -> Union[torch.Tensor, IntermediateTensors]:
         """Run forward pass for LlaVA-Onevision.
         Args:
             input_ids: Flattened (concatenated) input_ids corresponding to a
                 batch.
             pixel_values_videos: Pixels in each frames for each input videos.
         """
-        modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
-        # merge video embeddings into input embeddings
-        if modalities:
-            inputs_embeds = self.language_model.model.get_input_embeddings(
-                input_ids)
-            if "images" in modalities:
-                image_input = modalities["images"]
-                vision_embeddings = self._process_image_input(image_input)
-                inputs_embeds = merge_multimodal_embeddings(
-                    input_ids, inputs_embeds, vision_embeddings,
-                    self.config.image_token_index)
-            if "videos" in modalities:
-                video_input = modalities["videos"]
-                video_embeddings = self._process_video_pixels(video_input)
-                inputs_embeds = merge_multimodal_embeddings(
-                    input_ids, inputs_embeds, video_embeddings,
-                    self.config.video_token_index)
-            input_ids = None
-        else:
-            inputs_embeds = None
+        if intermediate_tensors is not None:
+            input_ids = None
+            inputs_embeds = None
+        else:
+            modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
+            if modalities:
+                inputs_embeds = self.language_model.model.get_input_embeddings(
+                    input_ids)
+                if "images" in modalities:
+                    image_input = modalities["images"]
+                    vision_embeddings = self._process_image_input(image_input)
+                    inputs_embeds = merge_multimodal_embeddings(
+                        input_ids, inputs_embeds, vision_embeddings,
+                        self.config.image_token_index)
+                if "videos" in modalities:
+                    video_input = modalities["videos"]
+                    video_embeddings = self._process_video_pixels(video_input)
+                    inputs_embeds = merge_multimodal_embeddings(
+                        input_ids, inputs_embeds, video_embeddings,
+                        self.config.video_token_index)
+                input_ids = None
+            else:
+                inputs_embeds = None
 
         hidden_states = self.language_model.model(input_ids,
                                                   positions,
                                                   kv_caches,
                                                   attn_metadata,
-                                                  None,
+                                                  intermediate_tensors,
                                                   inputs_embeds=inputs_embeds)
 
         return hidden_states
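The reworked forward() gates all multimodal work on whether intermediate_tensors was supplied: only the first pipeline rank sees token ids and merges image/video embeddings, while later ranks pass the received activations straight to the language model. A standalone sketch of that dispatch (simplified toy code, not the vLLM implementation, assuming a single pre-merged feature tensor):

from typing import Optional

import torch


def toy_pp_forward(
    input_ids: Optional[torch.Tensor],
    embed: torch.nn.Embedding,
    multimodal_features: Optional[torch.Tensor],
    intermediate_tensors: Optional[torch.Tensor],
) -> torch.Tensor:
    if intermediate_tensors is not None:
        # Non-first pipeline rank: activations already arrived from the
        # previous rank, so no embedding lookup or multimodal merge happens.
        return intermediate_tensors
    # First rank: embed the token ids, then overlay the multimodal features
    # (a crude stand-in for merge_multimodal_embeddings).
    inputs_embeds = embed(input_ids)
    if multimodal_features is not None:
        inputs_embeds = inputs_embeds + multimodal_features
    return inputs_embeds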