# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the ColQwen3.5 late interaction model for multi-modal retrieval.

ColQwen3.5 is a multi-vector retrieval model based on the Qwen3.5 backbone
with ColBERT-style late interaction scoring (MaxSim). It produces per-token
embeddings for both text and image inputs.

A small reference MaxSim sketch is included at the end of this module for
illustration.
"""

import pytest
import torch

from ....conftest import VllmRunner

MODELS = [
    "athrael-soju/colqwen3.5-4.5B-v3",
]

EMBED_DIMS = {
    "athrael-soju/colqwen3.5-4.5B-v3": 320,
}

TEXT_QUERIES = [
    "What is the capital of France?",
    "Describe the contents of the document.",
]

TEXT_DOCUMENTS = [
    "The capital of France is Paris.",
    "This document contains important financial data.",
]

DTYPE = "half"


def _run_token_embed_test(
    vllm_runner: type[VllmRunner],
    model: str,
    *,
    dtype: str,
) -> None:
    """Verify per-token embedding shape and L2 normalization."""
    with vllm_runner(
        model,
        runner="pooling",
        dtype=dtype,
        max_model_len=4096,
        enforce_eager=True,
    ) as vllm_model:
        outputs = vllm_model.token_embed([TEXT_QUERIES[0]])

        assert len(outputs) == 1
        emb = torch.tensor(outputs[0])

        # Token embeddings should be 2D: [num_tokens, embed_dim]
        assert emb.dim() == 2
        assert emb.shape[1] == EMBED_DIMS[model]
        assert emb.shape[0] > 1

        # Verify L2 normalization
        norms = torch.norm(emb, p=2, dim=-1)
        torch.testing.assert_close(
            norms,
            torch.ones_like(norms),
            rtol=1e-2,
            atol=1e-2,
        )


def _run_late_interaction_test(
    vllm_runner: type[VllmRunner],
    model: str,
    *,
    dtype: str,
) -> None:
    """Verify MaxSim scoring matches manual computation."""
    from vllm.entrypoints.pooling.score.utils import compute_maxsim_score

    with vllm_runner(
        model,
        runner="pooling",
        dtype=dtype,
        max_model_len=4096,
        enforce_eager=True,
    ) as vllm_model:
        q_outputs = vllm_model.token_embed([TEXT_QUERIES[0]])
        d_outputs = vllm_model.token_embed([TEXT_DOCUMENTS[0]])

        q_emb = torch.tensor(q_outputs[0])
        d_emb = torch.tensor(d_outputs[0])
        manual_score = compute_maxsim_score(q_emb, d_emb).item()

        vllm_scores = vllm_model.score(TEXT_QUERIES[0], TEXT_DOCUMENTS[0])

        assert len(vllm_scores) == 1
        assert vllm_scores[0] == pytest.approx(manual_score, rel=0.01)


def _run_relevance_test(
    vllm_runner: type[VllmRunner],
    model: str,
    *,
    dtype: str,
) -> None:
    """Verify that relevant documents score higher than irrelevant ones."""
    query = "What is machine learning?"
    documents = [
        "Machine learning is a subset of artificial intelligence.",
        "The weather forecast shows rain tomorrow.",
        "Deep learning uses neural networks for complex tasks.",
    ]

    with vllm_runner(
        model,
        runner="pooling",
        dtype=dtype,
        max_model_len=4096,
        enforce_eager=True,
    ) as vllm_model:
        scores = vllm_model.score(query, documents)

        assert len(scores) == 3
        assert scores[0] > scores[1], "ML doc should score higher than weather doc"
        assert scores[2] > scores[1], "DL doc should score higher than weather doc"


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [DTYPE])
def test_colqwen3_5_token_embed(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    _run_token_embed_test(vllm_runner, model, dtype=dtype)


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [DTYPE])
def test_colqwen3_5_late_interaction_scoring(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    _run_late_interaction_test(vllm_runner, model, dtype=dtype)


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [DTYPE])
def test_colqwen3_5_relevance_ordering(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    _run_relevance_test(vllm_runner, model, dtype=dtype)
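

# The helper below is a minimal reference sketch of ColBERT-style MaxSim
# scoring, kept for illustration only; the tests above rely on vLLM's own
# compute_maxsim_score. The helper name and its use are assumptions of this
# sketch, not part of the vLLM API. It assumes per-token embeddings that are
# already L2-normalized, as checked in _run_token_embed_test.
def _reference_maxsim(q_emb: torch.Tensor, d_emb: torch.Tensor) -> torch.Tensor:
    """Sum over query tokens of the max similarity against document tokens."""
    # q_emb: [num_query_tokens, dim], d_emb: [num_doc_tokens, dim]
    sim = q_emb @ d_emb.T  # pairwise similarities (cosine, given unit-norm rows)
    return sim.max(dim=-1).values.sum()  # max over doc tokens, sum over query tokens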