[Model] Enable LoRA support for BLIP2 (#31620)
Signed-off-by: Qiping Pan <panqiping@outlook.com>
This commit is contained in:
@@ -673,7 +673,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
|||||||
| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereLabs/aya-vision-8b`, `CohereLabs/aya-vision-32b`, etc. | | ✅︎ |
|
| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereLabs/aya-vision-8b`, `CohereLabs/aya-vision-32b`, etc. | | ✅︎ |
|
||||||
| `BagelForConditionalGeneration` | BAGEL | T + I<sup>+</sup> | `ByteDance-Seed/BAGEL-7B-MoT` | ✅︎ | ✅︎ |
|
| `BagelForConditionalGeneration` | BAGEL | T + I<sup>+</sup> | `ByteDance-Seed/BAGEL-7B-MoT` | ✅︎ | ✅︎ |
|
||||||
| `BeeForConditionalGeneration` | Bee-8B | T + I<sup>E+</sup> | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ |
|
| `BeeForConditionalGeneration` | Bee-8B | T + I<sup>E+</sup> | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ |
|
||||||
| `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ |
|
| `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | ✅︎ | ✅︎ |
|
||||||
| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ |
|
| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ |
|
||||||
| `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I<sup>+</sup> | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ |
|
| `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I<sup>+</sup> | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ |
|
||||||
| `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ |
|
| `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ |
|
||||||
|
|||||||
@@ -35,13 +35,15 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
|||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||||
|
|
||||||
from .blip import BlipVisionModel
|
from .blip import BlipVisionModel, get_blip_num_patches
|
||||||
from .interfaces import (
|
from .interfaces import (
|
||||||
MultiModalEmbeddings,
|
MultiModalEmbeddings,
|
||||||
|
SupportsLoRA,
|
||||||
SupportsMultiModal,
|
SupportsMultiModal,
|
||||||
SupportsPP,
|
SupportsPP,
|
||||||
SupportsQuant,
|
SupportsQuant,
|
||||||
)
|
)
|
||||||
|
from .module_mapping import MultiModelKeys
|
||||||
from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
|
from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
|
||||||
|
|
||||||
|
|
||||||
@@ -521,7 +523,7 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
|
|||||||
dummy_inputs=Blip2DummyInputsBuilder,
|
dummy_inputs=Blip2DummyInputsBuilder,
|
||||||
)
|
)
|
||||||
class Blip2ForConditionalGeneration(
|
class Blip2ForConditionalGeneration(
|
||||||
nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant
|
nn.Module, SupportsLoRA, SupportsMultiModal, SupportsPP, SupportsQuant
|
||||||
):
|
):
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_placeholder_str(cls, modality: str, i: int) -> str | None:
|
def get_placeholder_str(cls, modality: str, i: int) -> str | None:
|
||||||
@@ -538,9 +540,17 @@ class Blip2ForConditionalGeneration(
|
|||||||
multimodal_config = vllm_config.model_config.multimodal_config
|
multimodal_config = vllm_config.model_config.multimodal_config
|
||||||
self.config = config
|
self.config = config
|
||||||
self.multimodal_config = multimodal_config
|
self.multimodal_config = multimodal_config
|
||||||
|
vision_config = config.vision_config
|
||||||
|
self._vision_tokens_per_image = (
|
||||||
|
get_blip_num_patches(
|
||||||
|
image_size=vision_config.image_size,
|
||||||
|
patch_size=vision_config.patch_size,
|
||||||
|
)
|
||||||
|
+ 1 # include class token
|
||||||
|
)
|
||||||
|
|
||||||
# TODO: Optionally initializes this for supporting embeddings.
|
# TODO: Optionally initializes this for supporting embeddings.
|
||||||
self.vision_model = BlipVisionModel(config.vision_config, quant_config)
|
self.vision_model = BlipVisionModel(vision_config, quant_config)
|
||||||
|
|
||||||
self.query_tokens = nn.Parameter(
|
self.query_tokens = nn.Parameter(
|
||||||
torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size)
|
torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size)
|
||||||
@@ -691,3 +701,36 @@ class Blip2ForConditionalGeneration(
|
|||||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
|
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
|
||||||
loader = AutoWeightsLoader(self)
|
loader = AutoWeightsLoader(self)
|
||||||
return loader.load_weights(weights)
|
return loader.load_weights(weights)
|
||||||
|
|
||||||
|
def get_mm_mapping(self) -> MultiModelKeys:
    """Return the module-name prefixes for each part of the model.

    The prefixes are handed to ``MultiModelKeys.from_string_field``:
    ``language_model`` for the language model, ``qformer`` and
    ``language_projection`` for the connector, and ``vision_model`` for
    the tower model.
    """
    module_prefixes = {
        "language_model": "language_model",
        "connector": ["qformer", "language_projection"],
        "tower_model": "vision_model",
    }
    return MultiModelKeys.from_string_field(**module_prefixes)
|
||||||
|
|
||||||
|
def get_num_mm_encoder_tokens(
    self,
    num_image_tokens: int,
) -> int:
    """Convert a count of image (query) tokens to vision-encoder tokens.

    Each image contributes ``self.config.num_query_tokens`` image tokens
    and ``self._vision_tokens_per_image`` vision-encoder tokens.

    Args:
        num_image_tokens: Total number of image tokens; expected to be a
            multiple of ``self.config.num_query_tokens``.

    Returns:
        The matching number of vision-encoder tokens as an ``int``, or
        ``0`` when ``num_image_tokens`` is not positive.

    Raises:
        AssertionError: If ``num_image_tokens`` is not a multiple of the
            number of query tokens.
    """
    if num_image_tokens <= 0:
        return 0

    num_query_tokens = self.config.num_query_tokens
    assert num_image_tokens % num_query_tokens == 0, (
        "The number of image tokens must be a multiple of "
        "the number of query tokens."
    )
    # Floor division keeps the result an int; true division (`/`) would
    # return a float and violate the declared return type.
    num_images = num_image_tokens // num_query_tokens
    return num_images * self._vision_tokens_per_image
|
||||||
|
|
||||||
|
def get_num_mm_connector_tokens(
    self,
    num_vision_tokens: int,
) -> int:
    """Convert a count of vision-encoder tokens to connector output tokens.

    Each image contributes ``self._vision_tokens_per_image`` vision-encoder
    tokens and ``self.config.num_query_tokens`` connector output tokens.

    Args:
        num_vision_tokens: Total number of vision-encoder tokens; expected
            to be a multiple of ``self._vision_tokens_per_image``.

    Returns:
        The matching number of connector tokens as an ``int``, or ``0``
        when ``num_vision_tokens`` is not positive.

    Raises:
        AssertionError: If ``num_vision_tokens`` is not a multiple of the
            number of tokens per image.
    """
    if num_vision_tokens <= 0:
        return 0

    tokens_per_image = self._vision_tokens_per_image
    assert num_vision_tokens % tokens_per_image == 0, (
        "The number of vision tokens must be a multiple of "
        "the number of tokens per image."
    )
    # Floor division keeps the result an int; true division (`/`) would
    # return a float and violate the declared return type.
    num_images = num_vision_tokens // tokens_per_image
    return num_images * self.config.num_query_tokens
|
||||||
|
|||||||
Reference in New Issue
Block a user