[Frontend] Support multimodal inputs for late-interaction scoring (ColQwen3) + NewModel: nvidia/nemotron-colembed (#34574)

Signed-off-by: craftsangjae <craftsangjae@gmail.com>
This commit is contained in:
Kata Coder
2026-02-21 13:01:40 +09:00
committed by GitHub
parent 11be2c74dc
commit 5719a4e4e6
10 changed files with 532 additions and 66 deletions

View File

@@ -16,6 +16,7 @@ Based on: Qwen3-VL backbone with custom text projection
 Target models:
 - TomoroAI/tomoro-colqwen3-embed-8b
 - OpenSearch-AI/Ops-Colqwen3-4B
+- nvidia/nemotron-colembed-vl-4b-v2
 """
 from collections.abc import Iterable, Mapping
@@ -229,13 +230,14 @@ class ColQwen3Model(
         if not isinstance(hidden_states, torch.Tensor):
             return hidden_states  # type: ignore
-        proj_dtype = self.custom_text_proj.weight.dtype  # type: ignore
-        if hidden_states.dtype != proj_dtype:
-            hidden_states = hidden_states.to(proj_dtype)
+        if self.custom_text_proj is not None:
+            proj_dtype = self.custom_text_proj.weight.dtype
+            if hidden_states.dtype != proj_dtype:
+                hidden_states = hidden_states.to(proj_dtype)
+            hidden_states = self.custom_text_proj(hidden_states)
-        # Project to embedding dimension and L2 normalize
-        proj = self.custom_text_proj(hidden_states)  # type: ignore
-        return torch.nn.functional.normalize(proj, p=2, dim=-1)
+        # L2 normalize
+        return torch.nn.functional.normalize(hidden_states, p=2, dim=-1)
 # Names used for the projection layer across different ColQwen3 variants
 _PROJ_LAYER_NAMES = {

View File

@@ -256,6 +256,7 @@ _EMBEDDING_MODELS = {
     "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),  # noqa: E501
     "ColQwen3": ("colqwen3", "ColQwen3Model"),
     "OpsColQwen3Model": ("colqwen3", "ColQwen3Model"),
+    "Qwen3VLNemotronEmbedModel": ("colqwen3", "ColQwen3Model"),
     "SiglipModel": ("siglip", "SiglipEmbeddingModel"),
     # Technically Terratorch models work on images, both in
     # input and output. I am adding it here because it piggy-backs on embedding