[Bugfix] Replace PoolingParams.normalize with use_activation (#32243)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-13 18:45:42 +08:00
parent 11b6af5280
commit 0aa8c40552
21 changed files with 68 additions and 70 deletions
--- a/examples/pooling/embed/openai_embedding_long_text/README.md
+++ b/examples/pooling/embed/openai_embedding_long_text/README.md
@@ -47,7 +47,7 @@ The key parameters for chunked processing are in the `--pooler-config`:
 ```json
 {
  "pooling_type": "auto",
-  "normalize": true,
+  "use_activation": true,
  "enable_chunked_processing": true,
  "max_embed_len": 3072000
 }
--- a/examples/pooling/embed/openai_embedding_long_text/client.py
+++ b/examples/pooling/embed/openai_embedding_long_text/client.py
@@ -14,7 +14,7 @@ Prerequisites:
   # MEAN pooling (processes all chunks, recommended for complete coverage)
   vllm serve intfloat/multilingual-e5-large \
     --pooler-config \
-      '{"pooling_type": "MEAN", "normalize": true, ' \
+      '{"pooling_type": "MEAN", "use_activation": true, ' \
      '"enable_chunked_processing": true, "max_embed_len": 3072000}' \
     --served-model-name multilingual-e5-large \
     --trust-remote-code \
@@ -24,7 +24,7 @@ Prerequisites:
   # OR CLS pooling (native CLS within chunks, MEAN aggregation across chunks)
   vllm serve BAAI/bge-large-en-v1.5 \
     --pooler-config \
-      '{"pooling_type": "CLS", "normalize": true, ' \
+      '{"pooling_type": "CLS", "use_activation": true, ' \
      '"enable_chunked_processing": true, "max_embed_len": 1048576}' \
     --served-model-name bge-large-en-v1.5 \
     --trust-remote-code \
--- a/examples/pooling/embed/openai_embedding_long_text/service.sh
+++ b/examples/pooling/embed/openai_embedding_long_text/service.sh
@@ -96,7 +96,7 @@ echo ""
 echo "🔧 Starting server with enhanced chunked processing configuration..."

 # Build pooler config JSON
-POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"normalize\": true, \"enable_chunked_processing\": ${VLLM_ENABLE_CHUNKED_PROCESSING}, \"max_embed_len\": ${MAX_EMBED_LEN}}"
+POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"use_activation\": true, \"enable_chunked_processing\": ${VLLM_ENABLE_CHUNKED_PROCESSING}, \"max_embed_len\": ${MAX_EMBED_LEN}}"

 # Start vLLM server with enhanced chunked processing
 vllm serve "$MODEL_NAME" \
--- a/tests/entrypoints/pooling/embed/test_offline.py
+++ b/tests/entrypoints/pooling/embed/test_offline.py
@@ -53,7 +53,9 @@ def test_token_embed(llm: LLM):
 def test_pooling_params(llm: LLM):
    def get_outputs(normalize):
        outputs = llm.embed(
-            prompts, pooling_params=PoolingParams(normalize=normalize), use_tqdm=False
+            prompts,
+            pooling_params=PoolingParams(use_activation=normalize),
+            use_tqdm=False,
        )
        return torch.tensor([x.outputs.embedding for x in outputs])

--- a/tests/entrypoints/pooling/embed/test_online_long_text.py
+++ b/tests/entrypoints/pooling/embed/test_online_long_text.py
@@ -216,7 +216,7 @@ def server_with_chunked_processing():
        "512",  # Set smaller max_model_len to trigger chunking mechanism
        "--pooler-config",
        (
-            '{"pooling_type": "MEAN", "normalize": true, '
+            '{"pooling_type": "MEAN", "use_activation": true, '
            '"enable_chunked_processing": true, "max_embed_len": 10000}'
        ),
        "--gpu-memory-utilization",
--- a/tests/entrypoints/pooling/score/test_online_score.py
+++ b/tests/entrypoints/pooling/score/test_online_score.py
@@ -236,17 +236,14 @@ class TestModel:
                    "use_activation": use_activation,
                },
            )
-            if response.status_code != 200:
-                return response
-
            outputs = response.json()
            return torch.tensor([x["score"] for x in outputs["data"]])

-        if model["is_cross_encoder"]:
-            default = get_outputs(use_activation=None)
-            w_activation = get_outputs(use_activation=True)
-            wo_activation = get_outputs(use_activation=False)
+        default = get_outputs(use_activation=None)
+        w_activation = get_outputs(use_activation=True)
+        wo_activation = get_outputs(use_activation=False)

+        if model["is_cross_encoder"]:
            assert torch.allclose(default, w_activation, atol=1e-2), (
                "Default should use activation."
            )
@@ -256,9 +253,3 @@ class TestModel:
            assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), (
                "w_activation should be close to activation(wo_activation)."
            )
-        else:
-            get_outputs(use_activation=None)
-
-            # The activation parameter only works for the is_cross_encoder model
-            response = get_outputs(use_activation=True)
-            assert response.status_code == 400
--- a/tests/model_executor/test_model_load_with_params.py
+++ b/tests/model_executor/test_model_load_with_params.py
@@ -48,7 +48,7 @@ def test_model_loading_with_params(vllm_runner, monkeypatch):
        # asserts on the pooling config files
        assert model_config.pooler_config.seq_pooling_type == "CLS"
        assert model_config.pooler_config.tok_pooling_type == "ALL"
-        assert model_config.pooler_config.normalize
+        assert model_config.pooler_config.use_activation

        # asserts on the tokenizer loaded
        assert model_config.tokenizer == "BAAI/bge-base-en-v1.5"
@@ -93,7 +93,7 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
        # asserts on the pooling config files
        assert model_config.pooler_config.seq_pooling_type == "MEAN"
        assert model_config.pooler_config.tok_pooling_type == "ALL"
-        assert model_config.pooler_config.normalize
+        assert model_config.pooler_config.use_activation

        # asserts on the tokenizer loaded
        assert model_config.tokenizer == "intfloat/multilingual-e5-base"
--- a/tests/models/language/pooling/test_pooler_config_init_behaviour.py
+++ b/tests/models/language/pooling/test_pooler_config_init_behaviour.py
@@ -66,7 +66,7 @@ def test_embed_models_using_normalize(
        model,
        max_model_len=512,
        dtype=dtype,
-        pooler_config=PoolerConfig(normalize=False),
+        pooler_config=PoolerConfig(use_activation=False),
    ) as vllm_model:
        wo_normalize = torch.tensor(vllm_model.embed(example_prompts))

@@ -74,7 +74,7 @@ def test_embed_models_using_normalize(
        model,
        max_model_len=512,
        dtype=dtype,
-        pooler_config=PoolerConfig(normalize=True),
+        pooler_config=PoolerConfig(use_activation=True),
    ) as vllm_model:
        w_normalize = torch.tensor(vllm_model.embed(example_prompts))

@@ -146,7 +146,7 @@ def test_multi_vector_retrieval_models_using_normalize(
        model,
        max_model_len=512,
        dtype=dtype,
-        pooler_config=PoolerConfig(normalize=False),
+        pooler_config=PoolerConfig(use_activation=False),
    ) as vllm_model:
        wo_normalize = vllm_model.token_embed(example_prompts)

@@ -154,7 +154,7 @@ def test_multi_vector_retrieval_models_using_normalize(
        model,
        max_model_len=512,
        dtype=dtype,
-        pooler_config=PoolerConfig(normalize=True),
+        pooler_config=PoolerConfig(use_activation=True),
    ) as vllm_model:
        w_normalize = vllm_model.token_embed(example_prompts)

--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -160,7 +160,7 @@ def test_get_pooling_config():
    model_config = ModelConfig(model_id)

    assert model_config.pooler_config is not None
-    assert model_config.pooler_config.normalize
+    assert model_config.pooler_config.use_activation
    assert model_config.pooler_config.seq_pooling_type == "MEAN"
    assert model_config.pooler_config.tok_pooling_type == "ALL"

--- a/tests/test_pooling_params.py
+++ b/tests/test_pooling_params.py
@@ -18,7 +18,7 @@ EMBEDDING_MODELS = [
 ]

 classify_parameters = ["use_activation"]
-embed_parameters = ["dimensions", "normalize"]
+embed_parameters = ["dimensions", "use_activation"]
 step_pooling_parameters = ["step_tag_id", "returned_token_ids"]


@@ -42,17 +42,17 @@ def test_embed():
    task = "embed"
    model_config = MockModelConfig(pooler_config=PoolerConfig(seq_pooling_type="CLS"))

-    pooling_params = PoolingParams(normalize=None)
+    pooling_params = PoolingParams(use_activation=None)
    pooling_params.verify(task=task, model_config=model_config)

-    pooling_params = PoolingParams(normalize=True)
+    pooling_params = PoolingParams(use_activation=True)
    pooling_params.verify(task=task, model_config=model_config)

-    pooling_params = PoolingParams(normalize=False)
+    pooling_params = PoolingParams(use_activation=False)
    pooling_params.verify(task=task, model_config=model_config)

    invalid_parameters = classify_parameters + step_pooling_parameters
-    for p in invalid_parameters:
+    for p in set(invalid_parameters) - set(embed_parameters):
        with pytest.raises(ValueError):
            pooling_params = PoolingParams(**{p: True})
            pooling_params.verify(task=task, model_config=model_config)
@@ -98,7 +98,7 @@ def test_classify(task):
    pooling_params.verify(task=task, model_config=model_config)

    invalid_parameters = embed_parameters + step_pooling_parameters
-    for p in invalid_parameters:
+    for p in set(invalid_parameters) - set(classify_parameters):
        with pytest.raises(ValueError):
            pooling_params = PoolingParams(**{p: True})
            pooling_params.verify(task=task, model_config=model_config)
@@ -111,20 +111,20 @@ def test_token_embed(pooling_type: str):
        pooler_config=PoolerConfig(tok_pooling_type=pooling_type)
    )

-    pooling_params = PoolingParams(normalize=None)
+    pooling_params = PoolingParams(use_activation=None)
    pooling_params.verify(task=task, model_config=model_config)

-    pooling_params = PoolingParams(normalize=True)
+    pooling_params = PoolingParams(use_activation=True)
    pooling_params.verify(task=task, model_config=model_config)

-    pooling_params = PoolingParams(normalize=False)
+    pooling_params = PoolingParams(use_activation=False)
    pooling_params.verify(task=task, model_config=model_config)

    invalid_parameters = classify_parameters
    if pooling_type != "STEP":
        invalid_parameters = classify_parameters + step_pooling_parameters

-    for p in invalid_parameters:
+    for p in set(invalid_parameters) - set(embed_parameters):
        with pytest.raises(ValueError):
            pooling_params = PoolingParams(**{p: True})
            pooling_params.verify(task=task, model_config=model_config)
@@ -150,7 +150,7 @@ def test_token_classify(pooling_type: str):
    if pooling_type != "STEP":
        invalid_parameters = embed_parameters + step_pooling_parameters

-    for p in invalid_parameters:
+    for p in set(invalid_parameters) - set(classify_parameters):
        with pytest.raises(ValueError):
            pooling_params = PoolingParams(**{p: True})
            pooling_params.verify(task=task, model_config=model_config)
--- a/vllm/config/pooler.py
+++ b/vllm/config/pooler.py
@@ -48,7 +48,7 @@ class PoolerConfig:
    ## for embeddings models
    normalize: bool | None = None
    """
-    Whether to normalize the embeddings outputs. Defaults to True.
+    DEPRECATED: please use `use_activation` instead.
    """
    dimensions: int | None = None
    """
@@ -75,11 +75,11 @@ class PoolerConfig:
    ## for classification models
    softmax: float | None = None
    """
-    softmax will be deprecated, please use use_activation instead.
+    DEPRECATED: please use `use_activation` instead.
    """
    activation: float | None = None
    """
-    activation will be deprecated, please use use_activation instead.
+    DEPRECATED: please use `use_activation` instead.
    """
    use_activation: bool | None = None
    """
@@ -164,17 +164,24 @@ class PoolerConfig:


 def get_use_activation(o: object):
-    if softmax := getattr(o, "softmax", None) is not None:
+    if (normalize := getattr(o, "normalize", None)) is not None:
        logger.warning_once(
-            "softmax will be deprecated and will be removed in v0.15. "
-            "Please use use_activation instead."
+            "`normalize` is deprecated and will be removed in v0.15. "
+            "Please use `use_activation` instead."
+        )
+        return normalize
+
+    if (softmax := getattr(o, "softmax", None)) is not None:
+        logger.warning_once(
+            "`softmax` is deprecated and will be removed in v0.15. "
+            "Please use `use_activation` instead."
        )
        return softmax

-    if activation := getattr(o, "activation", None) is not None:
+    if (activation := getattr(o, "activation", None)) is not None:
        logger.warning_once(
-            "activation will be deprecated and will be removed in v0.15. "
-            "Please use use_activation instead."
+            "`activation` is deprecated and will be removed in v0.15. "
+            "Please use `use_activation` instead."
        )
        return activation

--- a/vllm/entrypoints/pooling/embed/protocol.py
+++ b/vllm/entrypoints/pooling/embed/protocol.py
@@ -75,7 +75,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            dimensions=self.dimensions,
-            normalize=self.normalize,
+            use_activation=self.normalize,
        )


@@ -189,7 +189,7 @@ class EmbeddingChatRequest(OpenAIBaseModel):
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            dimensions=self.dimensions,
-            normalize=self.normalize,
+            use_activation=self.normalize,
        )


--- a/vllm/entrypoints/pooling/pooling/protocol.py
+++ b/vllm/entrypoints/pooling/pooling/protocol.py
@@ -40,7 +40,6 @@ class PoolingCompletionRequest(EmbeddingCompletionRequest):
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            dimensions=self.dimensions,
-            normalize=self.normalize,
            use_activation=get_use_activation(self),
        )

@@ -66,7 +65,6 @@ class PoolingChatRequest(EmbeddingChatRequest):
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            dimensions=self.dimensions,
-            normalize=self.normalize,
            use_activation=get_use_activation(self),
        )

--- a/vllm/model_executor/layers/pooler/seqwise/heads.py
+++ b/vllm/model_executor/layers/pooler/seqwise/heads.py
@@ -83,7 +83,7 @@ class EmbeddingPoolerHead(SequencePoolerHead):

        # for normalize
        if self.activation is not None:
-            flags = [p.normalize for p in pooling_params]
+            flags = [p.use_activation for p in pooling_params]
            if len(set(flags)) == 1:
                if flags[0]:
                    pooled_data = self.activation(pooled_data)
--- a/vllm/model_executor/layers/pooler/seqwise/poolers.py
+++ b/vllm/model_executor/layers/pooler/seqwise/poolers.py
@@ -95,8 +95,8 @@ def pooler_for_embed(pooler_config: PoolerConfig):
    vllm_config = get_current_vllm_config()
    model_config = vllm_config.model_config
    head = EmbeddingPoolerHead(
-        projector=_load_st_projector(model_config),
        head_dtype=model_config.head_dtype,
+        projector=_load_st_projector(model_config),
        activation=PoolerNormalize(),
    )

@@ -116,9 +116,9 @@ def pooler_for_classify(
    vllm_config = get_current_vllm_config()
    model_config = vllm_config.model_config
    head = ClassifierPoolerHead(
+        head_dtype=model_config.head_dtype,
        classifier=classifier,
        logit_bias=model_config.pooler_config.logit_bias,
-        head_dtype=model_config.head_dtype,
        activation=resolve_classifier_act_fn(
            model_config, static_num_labels=True, act_fn=act_fn
        ),
--- a/vllm/model_executor/layers/pooler/tokwise/heads.py
+++ b/vllm/model_executor/layers/pooler/tokwise/heads.py
@@ -44,14 +44,14 @@ class TokenPoolerHead(nn.Module, ABC):
 class TokenEmbeddingPoolerHead(TokenPoolerHead):
    def __init__(
        self,
-        projector: ProjectorFn | None = None,
        head_dtype: torch.dtype | str | None = None,
+        projector: ProjectorFn | None = None,
        activation: ActivationFn | None = None,
    ) -> None:
        super().__init__()

-        self.projector = projector
        self.head_dtype = head_dtype
+        self.projector = projector
        self.activation = activation

    def get_supported_tasks(self) -> Set[PoolingTask]:
@@ -79,7 +79,7 @@ class TokenEmbeddingPoolerHead(TokenPoolerHead):
        pooled_data = pooled_data[..., : pooling_param.dimensions]

        # for normalize
-        if self.activation is not None and pooling_param.normalize:
+        if self.activation is not None and pooling_param.use_activation:
            pooled_data = self.activation(pooled_data)

        # pooled_data shape: [n_tokens, embedding_dimension]
--- a/vllm/model_executor/layers/pooler/tokwise/poolers.py
+++ b/vllm/model_executor/layers/pooler/tokwise/poolers.py
@@ -95,8 +95,8 @@ def pooler_for_token_embed(pooler_config: PoolerConfig):
    vllm_config = get_current_vllm_config()
    model_config = vllm_config.model_config
    head = TokenEmbeddingPoolerHead(
-        projector=_load_st_projector(model_config),
        head_dtype=model_config.head_dtype,
+        projector=_load_st_projector(model_config),
        activation=PoolerNormalize(),
    )

@@ -116,9 +116,9 @@ def pooler_for_token_classify(
    vllm_config = get_current_vllm_config()
    model_config = vllm_config.model_config
    head = TokenClassifierPoolerHead(
+        head_dtype=model_config.head_dtype,
        classifier=classifier,
        logit_bias=model_config.pooler_config.logit_bias,
-        head_dtype=model_config.head_dtype,
        activation=resolve_classifier_act_fn(
            model_config, static_num_labels=False, act_fn=act_fn
        ),
--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -116,8 +116,8 @@ class BertPooler(SequencePooler):

        # Use lambdas so that weights are not registered under `self.head`
        self.head = EmbeddingPoolerHead(
-            projector=lambda x: self.dense(x),
            head_dtype=head_dtype,
+            projector=lambda x: self.dense(x),
            activation=LambdaPoolerActivation(self.act_fn),
        )

--- a/vllm/model_executor/models/modernbert.py
+++ b/vllm/model_executor/models/modernbert.py
@@ -309,12 +309,13 @@ class ModernBertPooler(SequencePooler):
            config.hidden_size,
            eps=config.norm_eps,
            bias=config.norm_bias,
+            dtype=head_dtype,
        )

        # Use lambdas so that weights are not registered under `self.head`
        self.head = EmbeddingPoolerHead(
-            projector=lambda x: self.dense(x),
            head_dtype=head_dtype,
+            projector=lambda x: self.dense(x),
            activation=LambdaPoolerActivation(lambda x: self.norm(self.act(x))),
        )

--- a/vllm/pooling_params.py
+++ b/vllm/pooling_params.py
@@ -26,9 +26,9 @@ class PoolingParams(
            Set to None to disable truncation.
        dimensions: Reduce the dimensions of embeddings
            if model support matryoshka representation.
-        normalize: Whether to normalize the embeddings outputs.
-        softmax: softmax will be deprecated, please use use_activation instead.
-        activation: activation will be deprecated, please use use_activation instead.
+        normalize: Deprecated, please use use_activation instead.
+        softmax: Deprecated, please use use_activation instead.
+        activation: Deprecated, please use use_activation instead.
        use_activation: Whether to apply activation function to
            the classification outputs.
    """
@@ -63,15 +63,15 @@ class PoolingParams(

    @property
    def all_parameters(self) -> list[str]:
-        return ["dimensions", "normalize", "use_activation"]
+        return ["dimensions", "use_activation"]

    @property
    def valid_parameters(self):
        return {
-            "embed": ["dimensions", "normalize"],
+            "embed": ["dimensions", "use_activation"],
            "classify": ["use_activation"],
            "score": ["use_activation"],
-            "token_embed": ["dimensions", "normalize"],
+            "token_embed": ["dimensions", "use_activation"],
            "token_classify": ["use_activation"],
        }

@@ -162,8 +162,8 @@ class PoolingParams(

    def _set_default_parameters(self, model_config: Optional["ModelConfig"]):
        if self.task in ["embed", "token_embed"]:
-            if self.normalize is None:
-                self.normalize = True
+            if self.use_activation is None:
+                self.use_activation = True

            if self.dimensions is not None and model_config is not None:
                if not model_config.is_matryoshka:
@@ -213,7 +213,6 @@ class PoolingParams(
        return (
            f"PoolingParams("
            f"task={self.task}, "
-            f"normalize={self.normalize}, "
            f"dimensions={self.dimensions}, "
            f"use_activation={self.use_activation}, "
            f"step_tag_id={self.step_tag_id}, "
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -801,7 +801,7 @@ def get_pooling_config(

        logger.info("Found pooling configuration.")

-        config: dict[str, Any] = {"normalize": normalize}
+        config: dict[str, Any] = {"use_activation": normalize}
        for key, val in pooling_dict.items():
            if val is True:
                pooling_type = parse_pooling_type(key)