[Model] Add LFM2-ColBERT-350M support (#37528)

Signed-off-by: Ilya Boytsov <ilyaboytsov1805@gmail.com>
2026-03-20 15:57:57 +01:00
parent 9f6d9dd371
commit 8b6c6b9505
6 changed files with 125 additions and 1 deletions
--- a/docs/models/pooling_models/specific_models.md
+++ b/docs/models/pooling_models/specific_models.md
@@ -11,6 +11,7 @@ vLLM supports ColBERT models with multiple encoder backbones:
 | `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` |
 | `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` |
 | `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` |
 | `ColBERTLfm2Model` | LFM2 | `LiquidAI/LFM2-ColBERT-350M` |
 **BERT-based ColBERT** models work out of the box:
@@ -29,6 +30,10 @@ vllm serve lightonai/GTE-ModernColBERT-v1 \
 vllm serve jinaai/jina-colbert-v2 \
    --hf-overrides '{"architectures": ["ColBERTJinaRobertaModel"]}' \
    --trust-remote-code
 # LFM2 backbone
 vllm serve LiquidAI/LFM2-ColBERT-350M \
    --hf-overrides '{"architectures": ["ColBERTLfm2Model"]}'
 ```
 Then you can use the rerank API:
--- a/docs/models/pooling_models/token_embed.md
+++ b/docs/models/pooling_models/token_embed.md
@@ -39,6 +39,7 @@ Models of any architecture can be converted into embedding models using `--conve
 | Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
 | ------------ | ------ | ----------------- | -------------------- | ------------------------- |
 | `ColBERTLfm2Model` | LFM2 | `LiquidAI/LFM2-ColBERT-350M` | | |
 | `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` | | |
 | `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` | | |
 | `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` | | |
--- a/tests/models/language/pooling/test_colbert.py
+++ b/tests/models/language/pooling/test_colbert.py
@@ -59,6 +59,22 @@ COLBERT_MODELS = {
            "model_cls": "AutoModel",
        },
    },
    "lfm2": {
        "model": "LiquidAI/LFM2-ColBERT-350M",
        "colbert_dim": 128,
        "max_model_len": 511,
        "extra_kwargs": {
            "hf_overrides": {
                "architectures": ["ColBERTLfm2Model"],
            },
        },
        "hf_comparison": {
            "weights_file": "1_Dense/model.safetensors",
            "weights_key": "linear.weight",
            "trust_remote_code": False,
            "model_cls": "AutoModel",
        },
    },
 }
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -628,6 +628,11 @@ _LATE_INTERACTION_EXAMPLE_MODELS = {
        trust_remote_code=True,
        hf_overrides={"architectures": ["ColBERTJinaRobertaModel"]},
    ),
    "ColBERTLfm2Model": _HfExamplesInfo(
        "LiquidAI/LFM2-ColBERT-350M",
        trust_remote_code=True,
        hf_overrides={"architectures": ["ColBERTLfm2Model"]},
    ),
    # [Multimodal]
    "ColModernVBertForRetrieval": _HfExamplesInfo(
        "ModernVBERT/colmodernvbert-merged",
--- a/vllm/model_executor/models/colbert.py
+++ b/vllm/model_executor/models/colbert.py
@@ -27,8 +27,9 @@ from vllm.model_executor.layers.pooler import Pooler
 from vllm.model_executor.layers.pooler.tokwise import pooler_for_token_embed
 from .bert import BertEmbeddingModel, BertModel
-from .interfaces import SupportsLateInteraction
+from .interfaces import HasInnerState, IsHybrid, SupportsLateInteraction
 from .interfaces_base import default_pooling_type
 from .lfm2 import Lfm2ForCausalLM, Lfm2Model
 class ColBERTMixin(nn.Module, SupportsLateInteraction):
@@ -414,3 +415,98 @@ class ColBERTJinaRobertaModel(ColBERTMixin, nn.Module):
            loaded.update(colbert_loaded)
        return loaded
 # -----------------------------------------------------------------------
 # Concrete model: ColBERT + LFM2 backbone
 # -----------------------------------------------------------------------
@default_pooling_type(seq_pooling_type="CLS", tok_pooling_type="ALL")
class ColBERTLfm2Model(ColBERTMixin, nn.Module, HasInnerState, IsHybrid):
    """ColBERT late-interaction model built on an LFM2 backbone.

    Targets ``LiquidAI/LFM2-ColBERT-350M`` and similar checkpoints.

    When the main checkpoint does not contain the projection, it is
    auto-loaded from the sentence-transformers ``1_Dense/`` directory.
    """

    is_pooling_model = True

    # LFM2 mixes attention and SSM layers. Setting both flags makes
    # HybridAttentionMambaModelConfig.verify_and_update_config run, which
    # initialises mamba_block_size and the related cache settings.
    is_hybrid = True
    has_inner_state = True

    # The three mamba-state hooks below simply forward to Lfm2ForCausalLM.

    @classmethod
    def get_mamba_state_shape_from_config(cls, vllm_config: VllmConfig):
        """Return the mamba state shape reported by ``Lfm2ForCausalLM``."""
        return Lfm2ForCausalLM.get_mamba_state_shape_from_config(vllm_config)

    @classmethod
    def get_mamba_state_dtype_from_config(cls, vllm_config: VllmConfig):
        """Return the mamba state dtype reported by ``Lfm2ForCausalLM``."""
        return Lfm2ForCausalLM.get_mamba_state_dtype_from_config(vllm_config)

    @classmethod
    def get_mamba_state_copy_func(cls):
        """Return the mamba state copy function of ``Lfm2ForCausalLM``."""
        return Lfm2ForCausalLM.get_mamba_state_copy_func()

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the ColBERT components, the LFM2 backbone and the pooler.

        Args:
            vllm_config: Engine configuration; the HF config, head dtype and
                pooler config are read from ``vllm_config.model_config``.
            prefix: Module-name prefix forwarded to ``Lfm2Model``.
        """
        super().__init__()

        model_config = vllm_config.model_config
        hf_config = model_config.hf_config

        # ColBERT pieces are set up before the backbone is created.
        self._init_colbert_components(
            hidden_size=hf_config.hidden_size,
            colbert_dim=self.get_colbert_dim_from_config(hf_config),
            head_dtype=model_config.head_dtype,
        )

        self.model = Lfm2Model(vllm_config=vllm_config, prefix=prefix)

        pooler_config = model_config.pooler_config
        assert pooler_config is not None
        self.pooler = self._build_colbert_pooler(pooler_config)

    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Embed ``input_ids`` with the backbone's embedding method."""
        return self.model.embed_input_ids(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors=None,
        inputs_embeds: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Run the LFM2 backbone and return its output unchanged."""
        backbone_kwargs = {
            "input_ids": input_ids,
            "positions": positions,
            "inputs_embeds": inputs_embeds,
            "intermediate_tensors": intermediate_tensors,
        }
        return self.model(**backbone_kwargs)

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
        """Load ColBERT and backbone weights.

        Args:
            weights: ``(name, tensor)`` pairs from the checkpoint.

        Returns:
            The set of parameter names considered loaded.
        """
        backbone_weights, colbert_loaded = self._load_colbert_weights(weights)

        # Drop the "model." prefix added by the embedding adapter before
        # handing the weights to the backbone loader.
        renamed = [
            (name.removeprefix("model."), tensor)
            for name, tensor in backbone_weights
        ]
        backbone_loaded = self.model.load_weights(renamed)

        # Backbone names are reported relative to this module again.
        loaded = {f"model.{name}" for name in backbone_loaded} | colbert_loaded

        # If the ST projector was auto-loaded during init rather than from
        # the main checkpoint, report its parameters as loaded so the weight
        # validator doesn't complain.
        head = getattr(self.pooler, "head", None)
        projector = getattr(head, "projector", None)
        if isinstance(projector, nn.Module):
            loaded.update(
                f"pooler.head.projector.{param_name}"
                for param_name, _ in projector.named_parameters()
            )
        return loaded
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -269,6 +269,7 @@ _LATE_INTERACTION_MODELS = {
    "HF_ColBERT": ("colbert", "ColBERTModel"),
    "ColBERTModernBertModel": ("colbert", "ColBERTModernBertModel"),
    "ColBERTJinaRobertaModel": ("colbert", "ColBERTJinaRobertaModel"),
    "ColBERTLfm2Model": ("colbert", "ColBERTLfm2Model"),
    # [Multimodal]
    "ColModernVBertForRetrieval": ("colmodernvbert", "ColModernVBertForRetrieval"),
    "ColPaliForRetrieval": ("colpali", "ColPaliModel"),