[Frontend][Core] Add plumbing to support audio language models (#7446)

This commit is contained in:
Peter Salas
2024-08-13 10:39:33 -07:00
committed by GitHub
parent e20233d361
commit 00c3d68e45
24 changed files with 599 additions and 120 deletions

View File

@@ -19,12 +19,12 @@ from vllm.sequence import IntermediateTensors, SamplerOutput
from .clip import (CLIPVisionModel, dummy_image_for_clip,
dummy_seq_data_for_clip, get_max_clip_image_tokens,
input_processor_for_clip)
-from .interfaces import SupportsVision
+from .interfaces import SupportsMultiModal
from .siglip import (SiglipVisionModel, dummy_image_for_siglip,
dummy_seq_data_for_siglip, get_max_siglip_image_tokens,
input_processor_for_siglip)
from .utils import (filter_weights, init_vllm_registered_model,
-merge_vision_embeddings)
+merge_multimodal_embeddings)
class LlavaImagePixelInputs(TypedDict):
@@ -181,7 +181,7 @@ def _init_vision_tower(hf_config: LlavaConfig):
@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens)
@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava)
@INPUT_REGISTRY.register_input_processor(input_processor_for_llava)
-class LlavaForConditionalGeneration(nn.Module, SupportsVision):
+class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal):
def __init__(self,
config: LlavaConfig,
@@ -338,7 +338,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsVision):
inputs_embeds = self.language_model.model.get_input_embeddings(
input_ids)
-inputs_embeds = merge_vision_embeddings(
+inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, vision_embeddings,
self.config.image_token_index)