[Model] Add LFM2-ColBERT-350M support (#37528)
Signed-off-by: Ilya Boytsov <ilyaboytsov1805@gmail.com>
This commit is contained in:
@@ -11,6 +11,7 @@ vLLM supports ColBERT models with multiple encoder backbones:
|
|||||||
| `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` |
|
| `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` |
|
||||||
| `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` |
|
| `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` |
|
||||||
| `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` |
|
| `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` |
|
||||||
|
| `ColBERTLfm2Model` | LFM2 | `LiquidAI/LFM2-ColBERT-350M` |
|
||||||
|
|
||||||
**BERT-based ColBERT** models work out of the box:
|
**BERT-based ColBERT** models work out of the box:
|
||||||
|
|
||||||
@@ -29,6 +30,10 @@ vllm serve lightonai/GTE-ModernColBERT-v1 \
|
|||||||
vllm serve jinaai/jina-colbert-v2 \
|
vllm serve jinaai/jina-colbert-v2 \
|
||||||
--hf-overrides '{"architectures": ["ColBERTJinaRobertaModel"]}' \
|
--hf-overrides '{"architectures": ["ColBERTJinaRobertaModel"]}' \
|
||||||
--trust-remote-code
|
--trust-remote-code
|
||||||
|
|
||||||
|
# LFM2 backbone
|
||||||
|
vllm serve LiquidAI/LFM2-ColBERT-350M \
|
||||||
|
--hf-overrides '{"architectures": ["ColBERTLfm2Model"]}'
|
||||||
```
|
```
|
||||||
|
|
||||||
Then you can use the rerank API:
|
Then you can use the rerank API:
|
||||||
|
|||||||
@@ -39,6 +39,7 @@ Models of any architecture can be converted into embedding models using `--conve
|
|||||||
|
|
||||||
| Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
|
| Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
|
||||||
| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
|
| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
|
||||||
|
| `ColBERTLfm2Model` | LFM2 | `LiquidAI/LFM2-ColBERT-350M` | | |
|
||||||
| `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` | | |
|
| `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` | | |
|
||||||
| `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` | | |
|
| `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` | | |
|
||||||
| `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` | | |
|
| `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` | | |
|
||||||
|
|||||||
@@ -59,6 +59,22 @@ COLBERT_MODELS = {
|
|||||||
"model_cls": "AutoModel",
|
"model_cls": "AutoModel",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
"lfm2": {
|
||||||
|
"model": "LiquidAI/LFM2-ColBERT-350M",
|
||||||
|
"colbert_dim": 128,
|
||||||
|
"max_model_len": 511,
|
||||||
|
"extra_kwargs": {
|
||||||
|
"hf_overrides": {
|
||||||
|
"architectures": ["ColBERTLfm2Model"],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"hf_comparison": {
|
||||||
|
"weights_file": "1_Dense/model.safetensors",
|
||||||
|
"weights_key": "linear.weight",
|
||||||
|
"trust_remote_code": False,
|
||||||
|
"model_cls": "AutoModel",
|
||||||
|
},
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -628,6 +628,11 @@ _LATE_INTERACTION_EXAMPLE_MODELS = {
|
|||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
hf_overrides={"architectures": ["ColBERTJinaRobertaModel"]},
|
hf_overrides={"architectures": ["ColBERTJinaRobertaModel"]},
|
||||||
),
|
),
|
||||||
|
"ColBERTLfm2Model": _HfExamplesInfo(
|
||||||
|
"LiquidAI/LFM2-ColBERT-350M",
|
||||||
|
trust_remote_code=True,
|
||||||
|
hf_overrides={"architectures": ["ColBERTLfm2Model"]},
|
||||||
|
),
|
||||||
# [Multimodal]
|
# [Multimodal]
|
||||||
"ColModernVBertForRetrieval": _HfExamplesInfo(
|
"ColModernVBertForRetrieval": _HfExamplesInfo(
|
||||||
"ModernVBERT/colmodernvbert-merged",
|
"ModernVBERT/colmodernvbert-merged",
|
||||||
|
|||||||
@@ -27,8 +27,9 @@ from vllm.model_executor.layers.pooler import Pooler
|
|||||||
from vllm.model_executor.layers.pooler.tokwise import pooler_for_token_embed
|
from vllm.model_executor.layers.pooler.tokwise import pooler_for_token_embed
|
||||||
|
|
||||||
from .bert import BertEmbeddingModel, BertModel
|
from .bert import BertEmbeddingModel, BertModel
|
||||||
from .interfaces import SupportsLateInteraction
|
from .interfaces import HasInnerState, IsHybrid, SupportsLateInteraction
|
||||||
from .interfaces_base import default_pooling_type
|
from .interfaces_base import default_pooling_type
|
||||||
|
from .lfm2 import Lfm2ForCausalLM, Lfm2Model
|
||||||
|
|
||||||
|
|
||||||
class ColBERTMixin(nn.Module, SupportsLateInteraction):
|
class ColBERTMixin(nn.Module, SupportsLateInteraction):
|
||||||
@@ -414,3 +415,98 @@ class ColBERTJinaRobertaModel(ColBERTMixin, nn.Module):
|
|||||||
loaded.update(colbert_loaded)
|
loaded.update(colbert_loaded)
|
||||||
|
|
||||||
return loaded
|
return loaded
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------
|
||||||
|
# Concrete model: ColBERT + LFM2 backbone
|
||||||
|
# -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@default_pooling_type(seq_pooling_type="CLS", tok_pooling_type="ALL")
class ColBERTLfm2Model(ColBERTMixin, nn.Module, HasInnerState, IsHybrid):
    """ColBERT late-interaction model built on an LFM2 backbone.

    Targets ``LiquidAI/LFM2-ColBERT-350M`` and similar checkpoints.

    When the main checkpoint does not carry the projection, it is
    auto-loaded from the sentence-transformers ``1_Dense/`` directory.
    """

    is_pooling_model = True
    # LFM2 is a hybrid model (attention + SSM layers); these flags ensure
    # HybridAttentionMambaModelConfig.verify_and_update_config runs so that
    # mamba_block_size and related cache settings are correctly initialised.
    is_hybrid = True
    has_inner_state = True

    @classmethod
    def get_mamba_state_shape_from_config(cls, vllm_config: VllmConfig):
        """Delegate the mamba state shape lookup to ``Lfm2ForCausalLM``."""
        return Lfm2ForCausalLM.get_mamba_state_shape_from_config(vllm_config)

    @classmethod
    def get_mamba_state_dtype_from_config(cls, vllm_config: VllmConfig):
        """Delegate the mamba state dtype lookup to ``Lfm2ForCausalLM``."""
        return Lfm2ForCausalLM.get_mamba_state_dtype_from_config(vllm_config)

    @classmethod
    def get_mamba_state_copy_func(cls):
        """Delegate the mamba state copy function to ``Lfm2ForCausalLM``."""
        return Lfm2ForCausalLM.get_mamba_state_copy_func()

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the ColBERT components, the LFM2 backbone and the pooler.

        Args:
            vllm_config: Engine configuration; its ``model_config`` supplies
                the HF config, the head dtype and the pooler config.
            prefix: Prefix forwarded unchanged to ``Lfm2Model``.
        """
        super().__init__()
        model_config = vllm_config.model_config
        hf_config = model_config.hf_config

        # ColBERT components are set up before the backbone is created;
        # keep this order so module registration order is unchanged.
        self._init_colbert_components(
            hidden_size=hf_config.hidden_size,
            colbert_dim=self.get_colbert_dim_from_config(hf_config),
            head_dtype=model_config.head_dtype,
        )

        self.model = Lfm2Model(vllm_config=vllm_config, prefix=prefix)

        pooler_config = model_config.pooler_config
        assert pooler_config is not None
        self.pooler = self._build_colbert_pooler(pooler_config)

    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Return the backbone's embeddings for ``input_ids``."""
        return self.model.embed_input_ids(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors=None,
        inputs_embeds: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Run the LFM2 backbone and return its output unchanged."""
        backbone_kwargs = {
            "input_ids": input_ids,
            "positions": positions,
            "inputs_embeds": inputs_embeds,
            "intermediate_tensors": intermediate_tensors,
        }
        return self.model(**backbone_kwargs)

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
        """Load ColBERT and backbone weights; return the loaded names.

        ColBERT-specific weights are consumed first by
        ``_load_colbert_weights``; whatever remains is handed to the
        backbone. Backbone names are reported with a ``model.`` prefix.
        """
        remaining, colbert_loaded = self._load_colbert_weights(weights)

        # Strip the "model." prefix added by the embedding adapter.
        adapter_prefix = "model."
        backbone_weights = []
        for weight_name, tensor in remaining:
            if weight_name.startswith(adapter_prefix):
                weight_name = weight_name[len(adapter_prefix) :]
            backbone_weights.append((weight_name, tensor))

        backbone_loaded = self.model.load_weights(backbone_weights)
        loaded = {f"model.{name}" for name in backbone_loaded} | colbert_loaded

        # If the ST projector was auto-loaded during init rather than from
        # the main checkpoint, report its parameters as loaded so the
        # weight validator doesn't complain.
        head = getattr(self.pooler, "head", None)
        projector = getattr(head, "projector", None)
        if isinstance(projector, nn.Module):
            loaded.update(
                f"pooler.head.projector.{param_name}"
                for param_name, _ in projector.named_parameters()
            )

        return loaded
|
||||||
|
|||||||
@@ -269,6 +269,7 @@ _LATE_INTERACTION_MODELS = {
|
|||||||
"HF_ColBERT": ("colbert", "ColBERTModel"),
|
"HF_ColBERT": ("colbert", "ColBERTModel"),
|
||||||
"ColBERTModernBertModel": ("colbert", "ColBERTModernBertModel"),
|
"ColBERTModernBertModel": ("colbert", "ColBERTModernBertModel"),
|
||||||
"ColBERTJinaRobertaModel": ("colbert", "ColBERTJinaRobertaModel"),
|
"ColBERTJinaRobertaModel": ("colbert", "ColBERTJinaRobertaModel"),
|
||||||
|
"ColBERTLfm2Model": ("colbert", "ColBERTLfm2Model"),
|
||||||
# [Multimodal]
|
# [Multimodal]
|
||||||
"ColModernVBertForRetrieval": ("colmodernvbert", "ColModernVBertForRetrieval"),
|
"ColModernVBertForRetrieval": ("colmodernvbert", "ColModernVBertForRetrieval"),
|
||||||
"ColPaliForRetrieval": ("colpali", "ColPaliModel"),
|
"ColPaliForRetrieval": ("colpali", "ColPaliModel"),
|
||||||
|
|||||||
Reference in New Issue
Block a user