[Model] Add LFM2-ColBERT-350M support (#37528)
Signed-off-by: Ilya Boytsov <ilyaboytsov1805@gmail.com>
This commit is contained in:
@@ -11,6 +11,7 @@ vLLM supports ColBERT models with multiple encoder backbones:

| Architecture | Backbone | Example HF Models |
|--------------|----------|-------------------|
| `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` |
| `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` |
| `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` |
| `ColBERTLfm2Model` | LFM2 | `LiquidAI/LFM2-ColBERT-350M` |

**BERT-based ColBERT** models work out of the box:
||||
@@ -29,6 +30,10 @@ vllm serve lightonai/GTE-ModernColBERT-v1 \

vllm serve jinaai/jina-colbert-v2 \
    --hf-overrides '{"architectures": ["ColBERTJinaRobertaModel"]}' \
    --trust-remote-code

# LFM2 backbone
vllm serve LiquidAI/LFM2-ColBERT-350M \
    --hf-overrides '{"architectures": ["ColBERTLfm2Model"]}'
```

Then you can use the rerank API:
||||
@@ -39,6 +39,7 @@ Models of any architecture can be converted into embedding models using `--convert`

| Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
| `ColBERTLfm2Model` | LFM2 | `LiquidAI/LFM2-ColBERT-350M` | | |
| `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` | | |
| `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` | | |
| `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` | | |
||||
@@ -59,6 +59,22 @@ COLBERT_MODELS = {
|
||||
"model_cls": "AutoModel",
|
||||
},
|
||||
},
|
||||
"lfm2": {
|
||||
"model": "LiquidAI/LFM2-ColBERT-350M",
|
||||
"colbert_dim": 128,
|
||||
"max_model_len": 511,
|
||||
"extra_kwargs": {
|
||||
"hf_overrides": {
|
||||
"architectures": ["ColBERTLfm2Model"],
|
||||
},
|
||||
},
|
||||
"hf_comparison": {
|
||||
"weights_file": "1_Dense/model.safetensors",
|
||||
"weights_key": "linear.weight",
|
||||
"trust_remote_code": False,
|
||||
"model_cls": "AutoModel",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -628,6 +628,11 @@ _LATE_INTERACTION_EXAMPLE_MODELS = {
|
||||
trust_remote_code=True,
|
||||
hf_overrides={"architectures": ["ColBERTJinaRobertaModel"]},
|
||||
),
|
||||
"ColBERTLfm2Model": _HfExamplesInfo(
|
||||
"LiquidAI/LFM2-ColBERT-350M",
|
||||
trust_remote_code=True,
|
||||
hf_overrides={"architectures": ["ColBERTLfm2Model"]},
|
||||
),
|
||||
# [Multimodal]
|
||||
"ColModernVBertForRetrieval": _HfExamplesInfo(
|
||||
"ModernVBERT/colmodernvbert-merged",
|
||||
|
||||
@@ -27,8 +27,9 @@ from vllm.model_executor.layers.pooler import Pooler
|
||||
from vllm.model_executor.layers.pooler.tokwise import pooler_for_token_embed
|
||||
|
||||
from .bert import BertEmbeddingModel, BertModel
|
||||
from .interfaces import SupportsLateInteraction
|
||||
from .interfaces import HasInnerState, IsHybrid, SupportsLateInteraction
|
||||
from .interfaces_base import default_pooling_type
|
||||
from .lfm2 import Lfm2ForCausalLM, Lfm2Model
|
||||
|
||||
|
||||
class ColBERTMixin(nn.Module, SupportsLateInteraction):
|
||||
@@ -414,3 +415,98 @@ class ColBERTJinaRobertaModel(ColBERTMixin, nn.Module):
|
||||
loaded.update(colbert_loaded)
|
||||
|
||||
return loaded
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# Concrete model: ColBERT + LFM2 backbone
|
||||
# -----------------------------------------------------------------------
|
||||
|
||||
|
||||
@default_pooling_type(seq_pooling_type="CLS", tok_pooling_type="ALL")
class ColBERTLfm2Model(ColBERTMixin, nn.Module, HasInnerState, IsHybrid):
    """Late-interaction (ColBERT) pooling model on top of an LFM2 backbone.

    Targets ``LiquidAI/LFM2-ColBERT-350M`` and compatible checkpoints.
    When the projection weights are absent from the main checkpoint, they
    are auto-loaded from the sentence-transformers ``1_Dense/`` directory.
    """

    is_pooling_model = True
    # The LFM2 backbone mixes attention with SSM layers. Declaring the
    # model hybrid/stateful makes
    # HybridAttentionMambaModelConfig.verify_and_update_config run, which
    # initialises mamba_block_size and the related cache settings.
    is_hybrid = True
    has_inner_state = True

    @classmethod
    def get_mamba_state_shape_from_config(cls, vllm_config: VllmConfig):
        # The mamba cache layout matches the causal-LM variant exactly,
        # so delegate all three cache queries to Lfm2ForCausalLM.
        return Lfm2ForCausalLM.get_mamba_state_shape_from_config(vllm_config)

    @classmethod
    def get_mamba_state_dtype_from_config(cls, vllm_config: VllmConfig):
        return Lfm2ForCausalLM.get_mamba_state_dtype_from_config(vllm_config)

    @classmethod
    def get_mamba_state_copy_func(cls):
        return Lfm2ForCausalLM.get_mamba_state_copy_func()

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        model_config = vllm_config.model_config
        hf_config = model_config.hf_config

        # ColBERT projection head: hidden_size -> colbert_dim.
        self._init_colbert_components(
            hidden_size=hf_config.hidden_size,
            colbert_dim=self.get_colbert_dim_from_config(hf_config),
            head_dtype=model_config.head_dtype,
        )

        # LFM2 backbone producing per-token hidden states.
        self.model = Lfm2Model(vllm_config=vllm_config, prefix=prefix)

        pooler_config = model_config.pooler_config
        assert pooler_config is not None
        self.pooler = self._build_colbert_pooler(pooler_config)

    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Delegate token embedding lookup to the LFM2 backbone."""
        return self.model.embed_input_ids(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors=None,
        inputs_embeds: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Run the backbone; pooling/projection happens in ``self.pooler``."""
        return self.model(
            input_ids=input_ids,
            positions=positions,
            inputs_embeds=inputs_embeds,
            intermediate_tensors=intermediate_tensors,
        )

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
        """Load checkpoint weights; return the set of loaded param names."""
        remaining, colbert_loaded = self._load_colbert_weights(weights)

        # The embedding adapter prefixes backbone weights with "model.";
        # strip it before handing them to the backbone loader.
        backbone_weights = (
            (name.removeprefix("model."), tensor) for name, tensor in remaining
        )
        backbone_loaded = self.model.load_weights(backbone_weights)
        loaded = {f"model.{name}" for name in backbone_loaded} | colbert_loaded

        # When the ST projector was auto-loaded during init rather than
        # from the main checkpoint, register its parameters as loaded so
        # the weight validator does not complain.
        head = getattr(self.pooler, "head", None)
        projector = getattr(head, "projector", None)
        if isinstance(projector, nn.Module):
            loaded.update(
                f"pooler.head.projector.{name}"
                for name, _ in projector.named_parameters()
            )

        return loaded
|
||||
|
||||
@@ -269,6 +269,7 @@ _LATE_INTERACTION_MODELS = {
|
||||
"HF_ColBERT": ("colbert", "ColBERTModel"),
|
||||
"ColBERTModernBertModel": ("colbert", "ColBERTModernBertModel"),
|
||||
"ColBERTJinaRobertaModel": ("colbert", "ColBERTJinaRobertaModel"),
|
||||
"ColBERTLfm2Model": ("colbert", "ColBERTLfm2Model"),
|
||||
# [Multimodal]
|
||||
"ColModernVBertForRetrieval": ("colmodernvbert", "ColModernVBertForRetrieval"),
|
||||
"ColPaliForRetrieval": ("colpali", "ColPaliModel"),
|
||||
|
||||
Reference in New Issue
Block a user