[Frontend] Support multimodal inputs for late-interaction scoring (ColQwen3) + NewModel: nvidia/nemotron-colembed (#34574)

Signed-off-by: craftsangjae <craftsangjae@gmail.com>
This commit is contained in:
Kata Coder
2026-02-21 13:01:40 +09:00
committed by GitHub
parent 11be2c74dc
commit 5719a4e4e6
10 changed files with 532 additions and 66 deletions

View File

@@ -16,6 +16,7 @@ Based on: Qwen3-VL backbone with custom text projection
 Target models:
 - TomoroAI/tomoro-colqwen3-embed-8b
 - OpenSearch-AI/Ops-Colqwen3-4B
+- nvidia/nemotron-colembed-vl-4b-v2
 """
 from collections.abc import Iterable, Mapping
@@ -229,13 +230,14 @@ class ColQwen3Model(
         if not isinstance(hidden_states, torch.Tensor):
             return hidden_states  # type: ignore
-        proj_dtype = self.custom_text_proj.weight.dtype  # type: ignore
-        if hidden_states.dtype != proj_dtype:
-            hidden_states = hidden_states.to(proj_dtype)
+        if self.custom_text_proj is not None:
+            proj_dtype = self.custom_text_proj.weight.dtype
+            if hidden_states.dtype != proj_dtype:
+                hidden_states = hidden_states.to(proj_dtype)
+            hidden_states = self.custom_text_proj(hidden_states)
-        # Project to embedding dimension and L2 normalize
-        proj = self.custom_text_proj(hidden_states)  # type: ignore
-        return torch.nn.functional.normalize(proj, p=2, dim=-1)
+        # L2 normalize
+        return torch.nn.functional.normalize(hidden_states, p=2, dim=-1)
 # Names used for the projection layer across different ColQwen3 variants
 _PROJ_LAYER_NAMES = {

View File

@@ -256,6 +256,7 @@ _EMBEDDING_MODELS = {
     "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),  # noqa: E501
     "ColQwen3": ("colqwen3", "ColQwen3Model"),
     "OpsColQwen3Model": ("colqwen3", "ColQwen3Model"),
+    "Qwen3VLNemotronEmbedModel": ("colqwen3", "ColQwen3Model"),
     "SiglipModel": ("siglip", "SiglipEmbeddingModel"),
     # Technically Terratorch models work on images, both in
     # input and output. I am adding it here because it piggy-backs on embedding