diff --git a/docs/models/pooling_models/specific_models.md b/docs/models/pooling_models/specific_models.md index 4b0027a3d..0d908c1aa 100644 --- a/docs/models/pooling_models/specific_models.md +++ b/docs/models/pooling_models/specific_models.md @@ -11,6 +11,7 @@ vLLM supports ColBERT models with multiple encoder backbones: | `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` | | `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` | | `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` | +| `ColBERTLfm2Model` | LFM2 | `LiquidAI/LFM2-ColBERT-350M` | **BERT-based ColBERT** models work out of the box: @@ -29,6 +30,10 @@ vllm serve lightonai/GTE-ModernColBERT-v1 \ vllm serve jinaai/jina-colbert-v2 \ --hf-overrides '{"architectures": ["ColBERTJinaRobertaModel"]}' \ --trust-remote-code + +# LFM2 backbone +vllm serve LiquidAI/LFM2-ColBERT-350M \ + --hf-overrides '{"architectures": ["ColBERTLfm2Model"]}' ``` Then you can use the rerank API: diff --git a/docs/models/pooling_models/token_embed.md b/docs/models/pooling_models/token_embed.md index c950d2e99..e847fb09b 100644 --- a/docs/models/pooling_models/token_embed.md +++ b/docs/models/pooling_models/token_embed.md @@ -39,6 +39,7 @@ Models of any architecture can be converted into embedding models using `--conve | Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) | | ------------ | ------ | ----------------- | -------------------- | ------------------------- | +| `ColBERTLfm2Model` | LFM2 | `LiquidAI/LFM2-ColBERT-350M` | | | | `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` | | | | `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` | | | | `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` | | | diff --git a/tests/models/language/pooling/test_colbert.py 
b/tests/models/language/pooling/test_colbert.py index 6edd9c28c..a245f879b 100644 --- a/tests/models/language/pooling/test_colbert.py +++ b/tests/models/language/pooling/test_colbert.py @@ -59,6 +59,22 @@ COLBERT_MODELS = { "model_cls": "AutoModel", }, }, + "lfm2": { + "model": "LiquidAI/LFM2-ColBERT-350M", + "colbert_dim": 128, + "max_model_len": 511, + "extra_kwargs": { + "hf_overrides": { + "architectures": ["ColBERTLfm2Model"], + }, + }, + "hf_comparison": { + "weights_file": "1_Dense/model.safetensors", + "weights_key": "linear.weight", + "trust_remote_code": False, + "model_cls": "AutoModel", + }, + }, } diff --git a/tests/models/registry.py b/tests/models/registry.py index aac707a90..ff997706c 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -628,6 +628,11 @@ _LATE_INTERACTION_EXAMPLE_MODELS = { trust_remote_code=True, hf_overrides={"architectures": ["ColBERTJinaRobertaModel"]}, ), + "ColBERTLfm2Model": _HfExamplesInfo( + "LiquidAI/LFM2-ColBERT-350M", + trust_remote_code=True, + hf_overrides={"architectures": ["ColBERTLfm2Model"]}, + ), # [Multimodal] "ColModernVBertForRetrieval": _HfExamplesInfo( "ModernVBERT/colmodernvbert-merged", diff --git a/vllm/model_executor/models/colbert.py b/vllm/model_executor/models/colbert.py index 66def505f..7b6889899 100644 --- a/vllm/model_executor/models/colbert.py +++ b/vllm/model_executor/models/colbert.py @@ -27,8 +27,9 @@ from vllm.model_executor.layers.pooler import Pooler from vllm.model_executor.layers.pooler.tokwise import pooler_for_token_embed from .bert import BertEmbeddingModel, BertModel -from .interfaces import SupportsLateInteraction +from .interfaces import HasInnerState, IsHybrid, SupportsLateInteraction from .interfaces_base import default_pooling_type +from .lfm2 import Lfm2ForCausalLM, Lfm2Model class ColBERTMixin(nn.Module, SupportsLateInteraction): @@ -414,3 +415,98 @@ class ColBERTJinaRobertaModel(ColBERTMixin, nn.Module): loaded.update(colbert_loaded) return loaded + + +# 
# -----------------------------------------------------------------------
# Concrete model: ColBERT + LFM2 backbone
# -----------------------------------------------------------------------


@default_pooling_type(seq_pooling_type="CLS", tok_pooling_type="ALL")
class ColBERTLfm2Model(ColBERTMixin, nn.Module, HasInnerState, IsHybrid):
    """ColBERT late-interaction model built on an LFM2 backbone.

    Intended for ``LiquidAI/LFM2-ColBERT-350M`` and similar checkpoints.
    The architecture is registered in ``_LATE_INTERACTION_MODELS`` as
    ``"ColBERTLfm2Model": ("colbert", "ColBERTLfm2Model")`` and is selected
    with ``--hf-overrides '{"architectures": ["ColBERTLfm2Model"]}'``.

    When the main checkpoint carries no projection weights, the projection
    is auto-loaded from the sentence-transformers ``1_Dense/`` directory.
    """

    is_pooling_model = True
    # LFM2 mixes attention and SSM layers. Advertising the model as hybrid
    # with inner state makes
    # HybridAttentionMambaModelConfig.verify_and_update_config run, which
    # initialises mamba_block_size and the related cache settings.
    is_hybrid = True
    has_inner_state = True

    @classmethod
    def get_mamba_state_shape_from_config(cls, vllm_config: VllmConfig):
        """Return the mamba state shape, as computed by ``Lfm2ForCausalLM``."""
        return Lfm2ForCausalLM.get_mamba_state_shape_from_config(vllm_config)

    @classmethod
    def get_mamba_state_dtype_from_config(cls, vllm_config: VllmConfig):
        """Return the mamba state dtype, as computed by ``Lfm2ForCausalLM``."""
        return Lfm2ForCausalLM.get_mamba_state_dtype_from_config(vllm_config)

    @classmethod
    def get_mamba_state_copy_func(cls):
        """Return the mamba state copy function used by ``Lfm2ForCausalLM``."""
        return Lfm2ForCausalLM.get_mamba_state_copy_func()

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the ColBERT components, the LFM2 backbone and the pooler.

        Args:
            vllm_config: Engine configuration; its ``model_config`` supplies
                the HF config, the head dtype and the pooler config.
            prefix: Module-name prefix forwarded unchanged to ``Lfm2Model``.
        """
        super().__init__()
        model_config = vllm_config.model_config
        hf_config = model_config.hf_config

        # ColBERT pieces are set up before the backbone is instantiated.
        self._init_colbert_components(
            hidden_size=hf_config.hidden_size,
            colbert_dim=self.get_colbert_dim_from_config(hf_config),
            head_dtype=model_config.head_dtype,
        )

        self.model = Lfm2Model(vllm_config=vllm_config, prefix=prefix)

        pooler_config = model_config.pooler_config
        assert pooler_config is not None
        self.pooler = self._build_colbert_pooler(pooler_config)

    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Embed ``input_ids`` with the backbone's embedding routine."""
        return self.model.embed_input_ids(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors=None,
        inputs_embeds: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Run the LFM2 backbone and return its output unchanged."""
        backbone_output = self.model(
            input_ids=input_ids,
            positions=positions,
            intermediate_tensors=intermediate_tensors,
            inputs_embeds=inputs_embeds,
        )
        return backbone_output

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
        """Load ColBERT and backbone weights.

        Args:
            weights: ``(name, tensor)`` pairs from the checkpoint.

        Returns:
            The set of parameter names reported as loaded: backbone names
            re-qualified with ``model.``, the names returned by
            ``_load_colbert_weights``, and every parameter of the pooler
            head's projector when such a projector module exists.
        """
        backbone_weights, colbert_loaded = self._load_colbert_weights(weights)

        # Strip "model." prefix added by the embedding adapter
        unprefixed = [
            (name.removeprefix("model."), tensor)
            for name, tensor in backbone_weights
        ]
        backbone_loaded = self.model.load_weights(unprefixed)
        loaded = {f"model.{name}" for name in backbone_loaded} | colbert_loaded

        # If the sentence-transformers projector was auto-loaded during
        # init rather than read from the main checkpoint, its parameters
        # are reported as loaded so the weight validator does not complain.
        pooler_head = getattr(self.pooler, "head", None)
        projector = getattr(pooler_head, "projector", None)
        if isinstance(projector, nn.Module):
            loaded.update(
                f"pooler.head.projector.{param_name}"
                for param_name, _ in projector.named_parameters()
            )

        return loaded