[Frontend][Core] Add plumbing to support audio language models (#7446)
This commit is contained in:
@@ -19,12 +19,12 @@ from vllm.sequence import IntermediateTensors, SamplerOutput
|
||||
from .clip import (CLIPVisionModel, dummy_image_for_clip,
|
||||
dummy_seq_data_for_clip, get_max_clip_image_tokens,
|
||||
input_processor_for_clip)
|
||||
-from .interfaces import SupportsVision
|
||||
+from .interfaces import SupportsMultiModal
|
||||
from .siglip import (SiglipVisionModel, dummy_image_for_siglip,
|
||||
dummy_seq_data_for_siglip, get_max_siglip_image_tokens,
|
||||
input_processor_for_siglip)
|
||||
from .utils import (filter_weights, init_vllm_registered_model,
|
||||
-merge_vision_embeddings)
|
||||
+merge_multimodal_embeddings)
|
||||
|
||||
|
||||
class LlavaImagePixelInputs(TypedDict):
|
||||
@@ -181,7 +181,7 @@ def _init_vision_tower(hf_config: LlavaConfig):
|
||||
@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens)
|
||||
@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava)
|
||||
@INPUT_REGISTRY.register_input_processor(input_processor_for_llava)
|
||||
-class LlavaForConditionalGeneration(nn.Module, SupportsVision):
|
||||
+class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal):
|
||||
|
||||
def __init__(self,
|
||||
config: LlavaConfig,
|
||||
@@ -338,7 +338,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsVision):
|
||||
inputs_embeds = self.language_model.model.get_input_embeddings(
|
||||
input_ids)
|
||||
|
||||
-inputs_embeds = merge_vision_embeddings(
|
||||
+inputs_embeds = merge_multimodal_embeddings(
|
||||
input_ids, inputs_embeds, vision_embeddings,
|
||||
self.config.image_token_index)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user