[Model] Add LFM2-ColBERT-350M support (#37528)

Signed-off-by: Ilya Boytsov <ilyaboytsov1805@gmail.com>
This commit is contained in:
Ilya Boytsov
2026-03-20 15:57:57 +01:00
committed by GitHub
parent 9f6d9dd371
commit 8b6c6b9505
6 changed files with 125 additions and 1 deletions

View File

@@ -11,6 +11,7 @@ vLLM supports ColBERT models with multiple encoder backbones:
| `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` |
| `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` |
| `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` |
| `ColBERTLfm2Model` | LFM2 | `LiquidAI/LFM2-ColBERT-350M` |
**BERT-based ColBERT** models work out of the box:
@@ -29,6 +30,10 @@ vllm serve lightonai/GTE-ModernColBERT-v1 \
vllm serve jinaai/jina-colbert-v2 \
--hf-overrides '{"architectures": ["ColBERTJinaRobertaModel"]}' \
--trust-remote-code
# LFM2 backbone
vllm serve LiquidAI/LFM2-ColBERT-350M \
--hf-overrides '{"architectures": ["ColBERTLfm2Model"]}'
```
Then you can use the rerank API:

View File

@@ -39,6 +39,7 @@ Models of any architecture can be converted into embedding models using `--conve
| Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
| `ColBERTLfm2Model` | LFM2 | `LiquidAI/LFM2-ColBERT-350M` | | |
| `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` | | |
| `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` | | |
| `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` | | |

View File

@@ -59,6 +59,22 @@ COLBERT_MODELS = {
"model_cls": "AutoModel",
},
},
"lfm2": {
"model": "LiquidAI/LFM2-ColBERT-350M",
"colbert_dim": 128,
"max_model_len": 511,
"extra_kwargs": {
"hf_overrides": {
"architectures": ["ColBERTLfm2Model"],
},
},
"hf_comparison": {
"weights_file": "1_Dense/model.safetensors",
"weights_key": "linear.weight",
"trust_remote_code": False,
"model_cls": "AutoModel",
},
},
}

View File

@@ -628,6 +628,11 @@ _LATE_INTERACTION_EXAMPLE_MODELS = {
trust_remote_code=True,
hf_overrides={"architectures": ["ColBERTJinaRobertaModel"]},
),
"ColBERTLfm2Model": _HfExamplesInfo(
"LiquidAI/LFM2-ColBERT-350M",
trust_remote_code=True,
hf_overrides={"architectures": ["ColBERTLfm2Model"]},
),
# [Multimodal]
"ColModernVBertForRetrieval": _HfExamplesInfo(
"ModernVBERT/colmodernvbert-merged",

View File

@@ -27,8 +27,9 @@ from vllm.model_executor.layers.pooler import Pooler
from vllm.model_executor.layers.pooler.tokwise import pooler_for_token_embed
from .bert import BertEmbeddingModel, BertModel
from .interfaces import SupportsLateInteraction
from .interfaces import HasInnerState, IsHybrid, SupportsLateInteraction
from .interfaces_base import default_pooling_type
from .lfm2 import Lfm2ForCausalLM, Lfm2Model
class ColBERTMixin(nn.Module, SupportsLateInteraction):
@@ -414,3 +415,98 @@ class ColBERTJinaRobertaModel(ColBERTMixin, nn.Module):
loaded.update(colbert_loaded)
return loaded
# -----------------------------------------------------------------------
# Concrete model: ColBERT + LFM2 backbone
# -----------------------------------------------------------------------
@default_pooling_type(seq_pooling_type="CLS", tok_pooling_type="ALL")
class ColBERTLfm2Model(ColBERTMixin, nn.Module, HasInnerState, IsHybrid):
    """ColBERT late interaction model with LFM2 backbone.

    For ``LiquidAI/LFM2-ColBERT-350M`` and similar models.

    The projection is auto-loaded from sentence-transformers ``1_Dense/``
    when not present in the main checkpoint.
    """

    # Marks this architecture as a pooling/embedding model (no LM head).
    is_pooling_model = True

    # LFM2 is a hybrid model (attention + SSM layers); these flags ensure
    # HybridAttentionMambaModelConfig.verify_and_update_config runs so that
    # mamba_block_size and related cache settings are correctly initialised.
    is_hybrid = True
    has_inner_state = True

    @classmethod
    def get_mamba_state_shape_from_config(cls, vllm_config: VllmConfig):
        # Delegate to the causal-LM variant: the SSM state layout is a
        # property of the LFM2 backbone, not of the ColBERT head.
        return Lfm2ForCausalLM.get_mamba_state_shape_from_config(vllm_config)

    @classmethod
    def get_mamba_state_dtype_from_config(cls, vllm_config: VllmConfig):
        # Same delegation rationale as get_mamba_state_shape_from_config.
        return Lfm2ForCausalLM.get_mamba_state_dtype_from_config(vllm_config)

    @classmethod
    def get_mamba_state_copy_func(cls):
        # Reuse the backbone's state-copy implementation unchanged.
        return Lfm2ForCausalLM.get_mamba_state_copy_func()

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the LFM2 backbone plus the ColBERT projection/pooler.

        Args:
            vllm_config: Engine-wide config; the HF config, head dtype and
                pooler config are read from ``vllm_config.model_config``.
            prefix: Weight-name prefix passed through to the backbone.
        """
        super().__init__()
        config = vllm_config.model_config.hf_config
        # Projection output dim (e.g. 128) resolved from the HF config by
        # the ColBERTMixin helper.
        colbert_dim = self.get_colbert_dim_from_config(config)
        self._init_colbert_components(
            hidden_size=config.hidden_size,
            colbert_dim=colbert_dim,
            head_dtype=vllm_config.model_config.head_dtype,
        )
        self.model = Lfm2Model(
            vllm_config=vllm_config,
            prefix=prefix,
        )
        pooler_config = vllm_config.model_config.pooler_config
        assert pooler_config is not None
        self.pooler = self._build_colbert_pooler(pooler_config)

    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
        # Thin passthrough to the backbone's token-embedding lookup.
        return self.model.embed_input_ids(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors=None,
        inputs_embeds: torch.Tensor | None = None,
    ) -> torch.Tensor:
        # Forward straight through the backbone; the ColBERT projection is
        # applied later by the pooler, not here.
        return self.model(
            input_ids=input_ids,
            positions=positions,
            inputs_embeds=inputs_embeds,
            intermediate_tensors=intermediate_tensors,
        )

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
        """Load checkpoint weights, splitting ColBERT head vs. backbone.

        Returns the set of fully-qualified parameter names that were
        loaded, so the caller's weight validator can verify coverage.
        """
        # ColBERT-specific tensors are consumed first by the mixin.
        other_weights, colbert_loaded = self._load_colbert_weights(weights)
        # Strip "model." prefix added by the embedding adapter
        model_weights = [
            (n[len("model.") :] if n.startswith("model.") else n, w)
            for n, w in other_weights
        ]
        loaded_model = self.model.load_weights(model_weights)
        # Re-qualify backbone names under "model." to match this module's
        # parameter namespace before merging with the ColBERT set.
        loaded = {f"model.{name}" for name in loaded_model} | colbert_loaded

        # When the ST projector was auto-loaded during init
        # (not from the main checkpoint), mark its params as loaded
        # so the weight validator doesn't complain.
        if hasattr(self.pooler, "head"):
            head = self.pooler.head
            projector = getattr(head, "projector", None)
            if projector is not None and isinstance(projector, nn.Module):
                for name, _ in projector.named_parameters():
                    loaded.add(f"pooler.head.projector.{name}")
        return loaded

View File

@@ -269,6 +269,7 @@ _LATE_INTERACTION_MODELS = {
"HF_ColBERT": ("colbert", "ColBERTModel"),
"ColBERTModernBertModel": ("colbert", "ColBERTModernBertModel"),
"ColBERTJinaRobertaModel": ("colbert", "ColBERTJinaRobertaModel"),
"ColBERTLfm2Model": ("colbert", "ColBERTLfm2Model"),
# [Multimodal]
"ColModernVBertForRetrieval": ("colmodernvbert", "ColModernVBertForRetrieval"),
"ColPaliForRetrieval": ("colpali", "ColPaliModel"),