[VLM] Simplify post-processing of replacement info (#12269)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
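This change replaces per-model apply() overrides, which manually trimmed bos/eos and start/end tokens out of the auto-computed placeholder ranges, with PromptReplacementDetails: each replacement now declares both the full text it inserts and the features subset that actually counts as multi-modal placeholders, so the base processor derives the ranges itself. A minimal, self-contained sketch of the idea (simplified string-based stand-ins, not the actual vLLM implementation):

from dataclasses import dataclass

@dataclass
class PromptReplacementDetails:
    full: str      # everything inserted into the prompt
    features: str  # the subset that counts as placeholder positions

@dataclass
class PlaceholderRange:
    offset: int
    length: int

def placeholder_range(details: PromptReplacementDetails,
                      insert_offset: int) -> PlaceholderRange:
    # Locate the feature span inside the full replacement, so callers no
    # longer need to trim boundary tokens in an apply() override.
    start = details.full.index(details.features)
    return PlaceholderRange(offset=insert_offset + start,
                            length=len(details.features))

# BLIP-2 style: a trailing "</s>" is inserted but is not a placeholder.
details = PromptReplacementDetails(full="<image>" * 4 + "</s>",
                                   features="<image>" * 4)
assert placeholder_range(details, 0) == PlaceholderRange(offset=0, length=28)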
@@ -342,13 +342,7 @@ class AriaProcessingInfo(BaseProcessingInfo):
         return self.get_hf_config().vision_config

     def get_hf_processor(self):
-        processor = self.ctx.get_hf_processor(AriaProcessor)
-
-        # Patch for https://github.com/huggingface/transformers/issues/35768
-        processor.tokenizer.image_token = "<|img|>"
-        processor.image_token = "<|img|>"
-
-        return processor
+        return self.ctx.get_hf_processor(AriaProcessor)

     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}
@@ -381,7 +375,7 @@ class AriaDummyInputsBuilder(BaseDummyInputsBuilder[AriaProcessingInfo]):
         }

         hf_processor = self.info.get_hf_processor()
-        image_token: str = hf_processor.image_token  # type: ignore
+        image_token: str = hf_processor.tokenizer.image_token  # type: ignore

         return ProcessorInputs(
             prompt_text=image_token * num_images,
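Since the simplified get_hf_processor() no longer patches image_token onto the processor, the dummy-inputs builder reads it from the tokenizer instead. A small sketch of the dummy-prompt construction (the "<|img|>" token string comes from the removed patch above; the helper name is illustrative):

def build_dummy_prompt(image_token: str, num_images: int) -> str:
    # One placeholder token per image, mirroring
    # ProcessorInputs(prompt_text=image_token * num_images, ...) above.
    return image_token * num_images

assert build_dummy_prompt("<|img|>", 3) == "<|img|><|img|><|img|>"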
@@ -14,12 +14,12 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputs, MultiModalKwargs,
-                                    NestedTensors, PlaceholderRange)
+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
+                                    NestedTensors)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement)
+                                        BaseProcessingInfo, PromptReplacement,
+                                        PromptReplacementDetails)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
@@ -481,30 +481,13 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
             PromptReplacement(
                 modality="image",
                 target="</s>",
-                replacement="<image>" * num_image_tokens + "</s>",
+                replacement=PromptReplacementDetails(
+                    full="<image>" * num_image_tokens + "</s>",
+                    features="<image>" * num_image_tokens,
+                ),
             )
         ]

-    def apply(
-        self,
-        prompt: Union[str, list[int]],
-        mm_data: MultiModalDataDict,
-        hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputs:
-        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
-
-        # Only <image> tokens should be considered as placeholders,
-        # so we ignore the trailing bos_token
-        result["mm_placeholders"] = {
-            modality: [
-                PlaceholderRange(offset=p["offset"], length=p["length"] - 1)
-                for p in ps
-            ]
-            for modality, ps in result["mm_placeholders"].items()
-        }
-
-        return result
-

 @MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor,
                                         info=Blip2ProcessingInfo,
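The deleted BLIP-2 apply() override shortened each placeholder by one to drop the trailing bos token ("</s>" serves as BOS in the underlying OPT tokenizer). With full/features, the base class derives the same range. A self-contained comparison under that assumption:

def old_trim(offset: int, length: int) -> tuple[int, int]:
    # Removed override: keep the offset, drop the trailing bos token.
    return offset, length - 1

def new_from_details(offset: int, num_feature_tokens: int) -> tuple[int, int]:
    # features excludes "</s>", so no post-hoc trimming is needed.
    return offset, num_feature_tokens

num_image_tokens = 32  # illustrative; the real count comes from the query tokens
assert old_trim(0, num_image_tokens + 1) == new_from_details(0, num_image_tokens)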
@@ -28,12 +28,12 @@ from vllm.model_executor.model_loader.weight_utils import (
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputs, MultiModalKwargs,
-                                    NestedTensors, PlaceholderRange)
+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
+                                    NestedTensors)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement)
+                                        BaseProcessingInfo, PromptReplacement,
+                                        PromptReplacementDetails)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
@@ -141,39 +141,23 @@ class ChameleonMultiModalProcessor(
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
         processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+        image_tokens = processor.image_token * self.info.get_num_image_tokens()

         return [
             PromptReplacement(
                 modality="image",
                 target="<image>",
-                replacement="".join([
-                    processor.image_start_token,
-                    processor.image_token * self.info.get_num_image_tokens(),
-                    processor.image_end_token,
-                ]),
+                replacement=PromptReplacementDetails(
+                    full="".join([
+                        processor.image_start_token,
+                        image_tokens,
+                        processor.image_end_token,
+                    ]),
+                    features=image_tokens,
+                ),
             )
         ]

-    def apply(
-        self,
-        prompt: Union[str, list[int]],
-        mm_data: MultiModalDataDict,
-        hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputs:
-        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
-
-        # Only <image> tokens should be considered as placeholders,
-        # so we ignore the image_start_token and image_end_token
-        result["mm_placeholders"] = {
-            modality: [
-                PlaceholderRange(offset=p["offset"] + 1,
-                                 length=p["length"] - 2) for p in ps
-            ]
-            for modality, ps in result["mm_placeholders"].items()
-        }
-
-        return result
-

 class ChameleonLayerNorm(nn.LayerNorm):
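Chameleon wraps its image tokens in start/end sentinels, so the removed override had to adjust both ends (offset + 1, length - 2). The features field now carries only the inner image tokens; a sketch of the equivalence (token counts illustrative):

def old_trim(offset: int, length: int) -> tuple[int, int]:
    # Removed override: skip image_start_token, drop image_end_token.
    return offset + 1, length - 2

def new_from_details(full_offset: int, num_feature_tokens: int) -> tuple[int, int]:
    # features sits one token past image_start_token inside the full replacement.
    return full_offset + 1, num_feature_tokens

n = 1024  # illustrative per-image token count
assert old_trim(0, n + 2) == new_from_details(0, n)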
@@ -16,7 +16,7 @@
 """ PyTorch Fuyu model."""
 import math
 from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
-                    TypedDict, Union)
+                    TypedDict)

 import torch
 import torch.nn as nn
@@ -30,13 +30,13 @@ from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.models.persimmon import PersimmonForCausalLM
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputs, MultiModalKwargs,
-                                    NestedTensors, PlaceholderRange)
+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
+                                    NestedTensors)
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement)
+                                        BaseProcessingInfo, PromptReplacement,
+                                        PromptReplacementDetails)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
@@ -215,9 +215,13 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
                 image_width=image_size.width,
                 image_height=image_size.height,
             )
+            image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
+                            [_NEWLINE_TOKEN_ID]) * nrows

-            return (([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows +
-                    [bos_token_id])
+            return PromptReplacementDetails(
+                full=image_tokens + [bos_token_id],
+                features=image_tokens,
+            )

         return [
             PromptReplacement(
@@ -227,26 +231,6 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
             )
         ]

-    def apply(
-        self,
-        prompt: Union[str, list[int]],
-        mm_data: MultiModalDataDict,
-        hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputs:
-        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
-
-        # Only |SPEAKER| (image) tokens should be considered as placeholders,
-        # so we ignore the trailing bos_token_id
-        result["mm_placeholders"] = {
-            modality: [
-                PlaceholderRange(offset=p["offset"], length=p["length"] - 1)
-                for p in ps
-            ]
-            for modality, ps in result["mm_placeholders"].items()
-        }
-
-        return result
-

 @MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor,
                                         info=FuyuProcessingInfo,
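Fuyu builds its replacement from token ids rather than strings, laying out ncols image tokens plus a newline token per row, with a trailing bos that is inserted but is not a placeholder. A runnable sketch (the token-id values are hypothetical, for illustration only):

_IMAGE_TOKEN_ID = 100    # hypothetical id for illustration
_NEWLINE_TOKEN_ID = 101  # hypothetical id for illustration
bos_token_id = 1         # hypothetical id for illustration

ncols, nrows = 3, 2
image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
full = image_tokens + [bos_token_id]

# The placeholder range covers only image_tokens; bos is excluded, which is
# exactly what the removed apply() achieved with length - 1.
assert len(full) - 1 == len(image_tokens) == (ncols + 1) * nrows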
@@ -30,15 +30,19 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputs, MultiModalKwargs,
-                                    NestedTensors, PlaceholderRange)
+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
+                                    NestedTensors)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize, MultiModalDataItems)
 # yapf conflicts with isort for this block
 # yapf: disable
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo,
                                         BoundPromptReplacement,
-                                        PlaceholderInfo, PromptReplacement)
+                                        PlaceholderFeaturesInfo,
+                                        PromptReplacement,
+                                        PromptReplacementDetails)
 # yapf: enable
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.utils import is_list_of
@@ -437,7 +441,12 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
                 processor=hf_processor,
             )

-            return [_IMAGE_TOKEN_ID] * num_image_tokens + [bos_token_id]
+            image_tokens = [_IMAGE_TOKEN_ID] * num_image_tokens
+
+            return PromptReplacementDetails(
+                full=image_tokens + [bos_token_id],
+                features=image_tokens,
+            )

         num_images = mm_items.get_count("image", strict=False)
@@ -454,7 +463,7 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
         token_ids: list[int],
         mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
         mm_item_counts: Mapping[str, int],
-    ) -> tuple[list[int], str, Mapping[str, list[PlaceholderInfo]]]:
+    ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]:
         token_ids, text, placeholders = super()._apply_prompt_replacements(
             token_ids=token_ids,
             mm_prompt_repls=mm_prompt_repls,
@@ -467,11 +476,11 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
         token_ids = [token_ids[0], *token_ids[2:]]
         placeholders = {
             modality: [
-                PlaceholderInfo(
+                PlaceholderFeaturesInfo(
                     modality=p.modality,
                     item_idx=p.item_idx,
                     start_idx=p.start_idx - 1,
-                    replacement=p.replacement,
+                    tokens=p.tokens,
                 ) for p in ps
             ]
             for modality, ps in placeholders.items()
@@ -479,26 +488,6 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):

         return token_ids, text, placeholders

-    def apply(
-        self,
-        prompt: Union[str, list[int]],
-        mm_data: MultiModalDataDict,
-        hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputs:
-        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
-
-        # Only <|image|> tokens should be considered as placeholders,
-        # so we ignore the trailing bos_token_id
-        result["mm_placeholders"] = {
-            modality: [
-                PlaceholderRange(offset=p["offset"], length=p["length"] - 1)
-                for p in ps
-            ]
-            for modality, ps in result["mm_placeholders"].items()
-        }
-
-        return result
-

 @MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor,
                                         info=Phi3VProcessingInfo,
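Beyond adopting PromptReplacementDetails, this file picks up the rename of PlaceholderInfo to PlaceholderFeaturesInfo, whose replacement field becomes tokens. A sketch of the renamed structure with fields inferred from the diff (the real class lives in vllm.multimodal.processing):

from dataclasses import dataclass, replace

@dataclass
class PlaceholderFeaturesInfo:
    modality: str
    item_idx: int
    start_idx: int
    tokens: list[int]  # was `replacement` on the old PlaceholderInfo

p = PlaceholderFeaturesInfo(modality="image", item_idx=0, start_idx=4,
                            tokens=[-1] * 16)
# Phi-3V shifts placeholders left by one after dropping a duplicated token:
shifted = replace(p, start_idx=p.start_idx - 1)
assert shifted.start_idx == 3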
@@ -36,13 +36,13 @@ from vllm.config import VllmConfig
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputs, MultiModalKwargs,
-                                    NestedTensors, PlaceholderRange)
+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
+                                    NestedTensors)
 from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems,
                                    MultiModalDataParser)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement)
+                                        BaseProcessingInfo, PromptReplacement,
+                                        PromptReplacementDetails)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
@@ -216,11 +216,16 @@ class Qwen2AudioMultiModalProcessor(
                     f"The audio {audio} (len={len(audio)}) is too short "
                     "to be represented inside the model")

-            return "".join([
-                audio_bos_token,
-                audio_token * num_placeholders,
-                audio_eos_token,
-            ])
+            audio_tokens = audio_token * num_placeholders
+
+            return PromptReplacementDetails(
+                full="".join([
+                    audio_bos_token,
+                    audio_tokens,
+                    audio_eos_token,
+                ]),
+                features=audio_tokens,
+            )

         return [
             PromptReplacement(
@@ -240,26 +245,6 @@ class Qwen2AudioMultiModalProcessor(
             # tokens than the number of audio items)
             return not hasattr(self.info.get_hf_processor(), "audio_token")

-    def apply(
-        self,
-        prompt: Union[str, list[int]],
-        mm_data: MultiModalDataDict,
-        hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputs:
-        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
-
-        # Only <|AUDIO|> tokens should be considered as placeholders,
-        # so we ignore the audio_bos_token and audio_eos_token
-        result["mm_placeholders"] = {
-            modality: [
-                PlaceholderRange(offset=p["offset"] + 1,
-                                 length=p["length"] - 2) for p in ps
-            ]
-            for modality, ps in result["mm_placeholders"].items()
-        }
-
-        return result
-

 @MULTIMODAL_REGISTRY.register_processor(
     Qwen2AudioMultiModalProcessor,
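Qwen2-Audio brackets its audio placeholders with bos/eos audio tokens, so only the inner <|AUDIO|> run should count. A sketch under assumed token strings (the bos/eos spellings are illustrative; <|AUDIO|> appears in the removed comment above):

audio_bos_token = "<|audio_bos|>"  # assumed spelling, for illustration
audio_token = "<|AUDIO|>"
audio_eos_token = "<|audio_eos|>"  # assumed spelling, for illustration
num_placeholders = 5

audio_tokens = audio_token * num_placeholders
full = "".join([audio_bos_token, audio_tokens, audio_eos_token])

# features == audio_tokens starts right after the bos token and excludes the
# eos token, matching the removed override's offset + 1, length - 2 fix-up.
assert full.index(audio_tokens) == len(audio_bos_token)
assert len(full) == len(audio_bos_token) + len(audio_tokens) + len(audio_eos_token)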