[Model] Use merge_by_field_config for MM models (Llava family) (#26280)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -64,7 +64,7 @@ from vllm.transformers_utils.tokenizer import (
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
|
||||
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
|
||||
from .utils import flatten_bn, init_vllm_registered_model, maybe_prefix
|
||||
from .utils import init_vllm_registered_model, maybe_prefix
|
||||
from .vision import (
|
||||
VisionEncoderInfo,
|
||||
VisionFeatureSelectStrategy,
|
||||
@@ -365,6 +365,8 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo])
|
||||
dummy_inputs=PixtralDummyInputsBuilder,
|
||||
)
|
||||
class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
merge_by_field_config = True
|
||||
|
||||
@classmethod
|
||||
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
|
||||
if modality.startswith("image"):
|
||||
@@ -424,7 +426,7 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
|
||||
|
||||
return PixtralImagePixelInputs(
|
||||
type="pixel_values",
|
||||
images=flatten_bn(images),
|
||||
images=images,
|
||||
)
|
||||
|
||||
def _process_image_input(
|
||||
|
||||
Reference in New Issue
Block a user