[Model] Add LFM2-ColBERT-350M support (#37528)
Signed-off-by: Ilya Boytsov <ilyaboytsov1805@gmail.com>
This commit is contained in:
@@ -11,6 +11,7 @@ vLLM supports ColBERT models with multiple encoder backbones:

| Architecture | Backbone | Example HF Models |
|--------------|----------|-------------------|
| `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` |
| `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` |
| `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` |
| `ColBERTLfm2Model` | LFM2 | `LiquidAI/LFM2-ColBERT-350M` |

**BERT-based ColBERT** models work out of the box:
||||
@@ -29,6 +30,10 @@ vllm serve lightonai/GTE-ModernColBERT-v1 \

vllm serve jinaai/jina-colbert-v2 \
    --hf-overrides '{"architectures": ["ColBERTJinaRobertaModel"]}' \
    --trust-remote-code

# LFM2 backbone
vllm serve LiquidAI/LFM2-ColBERT-350M \
    --hf-overrides '{"architectures": ["ColBERTLfm2Model"]}'
```

Then you can use the rerank API:
||||
@@ -39,6 +39,7 @@ Models of any architecture can be converted into embedding models using `--convert`

| Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
| `ColBERTLfm2Model` | LFM2 | `LiquidAI/LFM2-ColBERT-350M` | | |
| `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` | | |
| `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` | | |
| `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` | | |
||||
@@ -59,6 +59,22 @@ COLBERT_MODELS = {
|
||||
"model_cls": "AutoModel",
|
||||
},
|
||||
},
|
||||
"lfm2": {
|
||||
"model": "LiquidAI/LFM2-ColBERT-350M",
|
||||
"colbert_dim": 128,
|
||||
"max_model_len": 511,
|
||||
"extra_kwargs": {
|
||||
"hf_overrides": {
|
||||
"architectures": ["ColBERTLfm2Model"],
|
||||
},
|
||||
},
|
||||
"hf_comparison": {
|
||||
"weights_file": "1_Dense/model.safetensors",
|
||||
"weights_key": "linear.weight",
|
||||
"trust_remote_code": False,
|
||||
"model_cls": "AutoModel",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -628,6 +628,11 @@ _LATE_INTERACTION_EXAMPLE_MODELS = {
|
||||
trust_remote_code=True,
|
||||
hf_overrides={"architectures": ["ColBERTJinaRobertaModel"]},
|
||||
),
|
||||
"ColBERTLfm2Model": _HfExamplesInfo(
|
||||
"LiquidAI/LFM2-ColBERT-350M",
|
||||
trust_remote_code=True,
|
||||
hf_overrides={"architectures": ["ColBERTLfm2Model"]},
|
||||
),
|
||||
# [Multimodal]
|
||||
"ColModernVBertForRetrieval": _HfExamplesInfo(
|
||||
"ModernVBERT/colmodernvbert-merged",
|
||||
|
||||
@@ -27,8 +27,9 @@ from vllm.model_executor.layers.pooler import Pooler
|
||||
from vllm.model_executor.layers.pooler.tokwise import pooler_for_token_embed
|
||||
|
||||
from .bert import BertEmbeddingModel, BertModel
|
||||
from .interfaces import SupportsLateInteraction
|
||||
from .interfaces import HasInnerState, IsHybrid, SupportsLateInteraction
|
||||
from .interfaces_base import default_pooling_type
|
||||
from .lfm2 import Lfm2ForCausalLM, Lfm2Model
|
||||
|
||||
|
||||
class ColBERTMixin(nn.Module, SupportsLateInteraction):
|
||||
@@ -414,3 +415,98 @@ class ColBERTJinaRobertaModel(ColBERTMixin, nn.Module):
|
||||
loaded.update(colbert_loaded)
|
||||
|
||||
return loaded
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# Concrete model: ColBERT + LFM2 backbone
|
||||
# -----------------------------------------------------------------------
|
||||
|
||||
|
||||
@default_pooling_type(seq_pooling_type="CLS", tok_pooling_type="ALL")
class ColBERTLfm2Model(ColBERTMixin, nn.Module, HasInnerState, IsHybrid):
    """Late-interaction (ColBERT) pooling model on top of an LFM2 backbone.

    Targets ``LiquidAI/LFM2-ColBERT-350M`` and compatible checkpoints.
    When the projection weights are absent from the main checkpoint, they
    are auto-loaded from the sentence-transformers ``1_Dense/`` directory.
    """

    is_pooling_model = True
    # The LFM2 backbone mixes attention with SSM layers. Declaring the
    # model hybrid/stateful makes
    # HybridAttentionMambaModelConfig.verify_and_update_config run, which
    # initialises mamba_block_size and the related cache settings.
    is_hybrid = True
    has_inner_state = True

    @classmethod
    def get_mamba_state_shape_from_config(cls, vllm_config: VllmConfig):
        # The mamba cache layout matches the causal-LM variant exactly,
        # so delegate all three cache queries to Lfm2ForCausalLM.
        return Lfm2ForCausalLM.get_mamba_state_shape_from_config(vllm_config)

    @classmethod
    def get_mamba_state_dtype_from_config(cls, vllm_config: VllmConfig):
        return Lfm2ForCausalLM.get_mamba_state_dtype_from_config(vllm_config)

    @classmethod
    def get_mamba_state_copy_func(cls):
        return Lfm2ForCausalLM.get_mamba_state_copy_func()

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        model_config = vllm_config.model_config
        hf_config = model_config.hf_config

        # ColBERT projection head: hidden_size -> colbert_dim.
        self._init_colbert_components(
            hidden_size=hf_config.hidden_size,
            colbert_dim=self.get_colbert_dim_from_config(hf_config),
            head_dtype=model_config.head_dtype,
        )

        # LFM2 backbone producing per-token hidden states.
        self.model = Lfm2Model(vllm_config=vllm_config, prefix=prefix)

        pooler_config = model_config.pooler_config
        assert pooler_config is not None
        self.pooler = self._build_colbert_pooler(pooler_config)

    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Delegate token embedding lookup to the LFM2 backbone."""
        return self.model.embed_input_ids(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors=None,
        inputs_embeds: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Run the backbone; pooling/projection happens in ``self.pooler``."""
        return self.model(
            input_ids=input_ids,
            positions=positions,
            inputs_embeds=inputs_embeds,
            intermediate_tensors=intermediate_tensors,
        )

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
        """Load checkpoint weights; return the set of loaded param names."""
        remaining, colbert_loaded = self._load_colbert_weights(weights)

        # The embedding adapter prefixes backbone weights with "model.";
        # strip it before handing them to the backbone loader.
        backbone_weights = (
            (name.removeprefix("model."), tensor) for name, tensor in remaining
        )
        backbone_loaded = self.model.load_weights(backbone_weights)
        loaded = {f"model.{name}" for name in backbone_loaded} | colbert_loaded

        # When the ST projector was auto-loaded during init rather than
        # from the main checkpoint, register its parameters as loaded so
        # the weight validator does not complain.
        head = getattr(self.pooler, "head", None)
        projector = getattr(head, "projector", None)
        if isinstance(projector, nn.Module):
            loaded.update(
                f"pooler.head.projector.{name}"
                for name, _ in projector.named_parameters()
            )

        return loaded
|
||||
|
||||
@@ -269,6 +269,7 @@ _LATE_INTERACTION_MODELS = {
|
||||
"HF_ColBERT": ("colbert", "ColBERTModel"),
|
||||
"ColBERTModernBertModel": ("colbert", "ColBERTModernBertModel"),
|
||||
"ColBERTJinaRobertaModel": ("colbert", "ColBERTJinaRobertaModel"),
|
||||
"ColBERTLfm2Model": ("colbert", "ColBERTLfm2Model"),
|
||||
# [Multimodal]
|
||||
"ColModernVBertForRetrieval": ("colmodernvbert", "ColModernVBertForRetrieval"),
|
||||
"ColPaliForRetrieval": ("colpali", "ColPaliModel"),
|
||||
|
||||
Reference in New Issue
Block a user