[Frontend][Core] Add plumbing to support audio language models (#7446)
This commit is contained in:
@@ -40,8 +40,8 @@ from vllm.multimodal.image import (cached_get_image_processor,
                                    cached_get_tokenizer)
 from vllm.sequence import IntermediateTensors, SamplerOutput, SequenceData

-from .interfaces import SupportsVision
-from .utils import merge_vision_embeddings
+from .interfaces import SupportsMultiModal
+from .utils import merge_multimodal_embeddings

 logger = init_logger(__name__)

@@ -209,7 +209,7 @@ def input_mapper_for_fuyu(ctx: InputContext, data: object):
 @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_fuyu_image_tokens)
 @INPUT_REGISTRY.register_dummy_data(dummy_data_for_fuyu)
 @INPUT_REGISTRY.register_input_processor(input_processor_for_fuyu)
-class FuyuForCausalLM(nn.Module, SupportsVision):
+class FuyuForCausalLM(nn.Module, SupportsMultiModal):

     def __init__(self,
                  config: FuyuConfig,
@@ -271,9 +271,9 @@ class FuyuForCausalLM(nn.Module, SupportsVision):
         if image_input is not None:
             vision_embeddings = self._process_image_input(image_input)
             inputs_embeds = self.language_model.model.embed_tokens(input_ids)
-            inputs_embeds = merge_vision_embeddings(input_ids, inputs_embeds,
-                                                    vision_embeddings,
-                                                    self.image_token_id)
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids, inputs_embeds, vision_embeddings,
+                self.image_token_id)

         else:
             inputs_embeds = None
Reference in New Issue
Block a user