[Model] Consolidate score logic by introducing score_type (#36479)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
2026-03-10 21:32:25 +08:00
parent 409c4e632d
commit a3189a08b0
14 changed files with 213 additions and 194 deletions
--- a/vllm/model_executor/models/colqwen3.py
+++ b/vllm/model_executor/models/colqwen3.py
@@ -20,7 +20,6 @@ Target models:
 """

 from collections.abc import Iterable, Mapping
-from typing import ClassVar, Literal

 import torch
 import torch.nn as nn
@@ -31,6 +30,7 @@ from vllm.model_executor.layers.pooler.tokwise import pooler_for_token_embed
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal import MULTIMODAL_REGISTRY

+from .interfaces import SupportsLateInteraction
 from .interfaces_base import default_pooling_type
 from .qwen2_vl import Qwen2VLMultiModalDataParser
 from .qwen3_vl import (
@@ -113,9 +113,7 @@ class ColQwen3ProcessingInfo(Qwen3VLProcessingInfo):
    info=ColQwen3ProcessingInfo,
    dummy_inputs=Qwen3VLDummyInputsBuilder,
 )
-class ColQwen3Model(
-    Qwen3VLForConditionalGeneration,
-):
+class ColQwen3Model(Qwen3VLForConditionalGeneration, SupportsLateInteraction):
    """ColQwen3 late interaction model for multi-modal retrieval/reranking.

    This model extends Qwen3VLForConditionalGeneration with a ColBERT-style
@@ -132,16 +130,11 @@ class ColQwen3Model(

    Attributes:
        custom_text_proj: Linear projection from hidden_size to embed_dim
-        supports_late_interaction: Flag indicating this model uses late
-            interaction scoring
    """

    # Mark this as a pooling model so vLLM routes to pooler path
    is_pooling_model = True

-    # Mark this model as supporting late interaction scoring
-    supports_late_interaction: ClassVar[Literal[True]] = True
-
    # Override hf_to_vllm_mapper to handle ColQwen3 weight naming.
    # NOTE: WeightsMapper applies ALL matching prefix rules sequentially
    # (no early exit), so more-specific prefixes must come first.