diff --git a/examples/pooling/embed/openai_embedding_long_text/README.md b/examples/pooling/embed/openai_embedding_long_text/README.md index 00d3ded3e..0eda60810 100644 --- a/examples/pooling/embed/openai_embedding_long_text/README.md +++ b/examples/pooling/embed/openai_embedding_long_text/README.md @@ -47,7 +47,7 @@ The key parameters for chunked processing are in the `--pooler-config`: ```json { "pooling_type": "auto", - "normalize": true, + "use_activation": true, "enable_chunked_processing": true, "max_embed_len": 3072000 } diff --git a/examples/pooling/embed/openai_embedding_long_text/client.py b/examples/pooling/embed/openai_embedding_long_text/client.py index 4a3674bb3..7cc33b1f2 100644 --- a/examples/pooling/embed/openai_embedding_long_text/client.py +++ b/examples/pooling/embed/openai_embedding_long_text/client.py @@ -14,7 +14,7 @@ Prerequisites: # MEAN pooling (processes all chunks, recommended for complete coverage) vllm serve intfloat/multilingual-e5-large \ --pooler-config \ - '{"pooling_type": "MEAN", "normalize": true, ' \ + '{"pooling_type": "MEAN", "use_activation": true, ' \ '"enable_chunked_processing": true, "max_embed_len": 3072000}' \ --served-model-name multilingual-e5-large \ --trust-remote-code \ @@ -24,7 +24,7 @@ Prerequisites: # OR CLS pooling (native CLS within chunks, MEAN aggregation across chunks) vllm serve BAAI/bge-large-en-v1.5 \ --pooler-config \ - '{"pooling_type": "CLS", "normalize": true, ' \ + '{"pooling_type": "CLS", "use_activation": true, ' \ '"enable_chunked_processing": true, "max_embed_len": 1048576}' \ --served-model-name bge-large-en-v1.5 \ --trust-remote-code \ diff --git a/examples/pooling/embed/openai_embedding_long_text/service.sh b/examples/pooling/embed/openai_embedding_long_text/service.sh index b5c927494..0353b8f5a 100644 --- a/examples/pooling/embed/openai_embedding_long_text/service.sh +++ b/examples/pooling/embed/openai_embedding_long_text/service.sh @@ -96,7 +96,7 @@ echo "" echo "🔧 Starting server with enhanced chunked processing configuration..." # Build pooler config JSON -POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"normalize\": true, \"enable_chunked_processing\": ${VLLM_ENABLE_CHUNKED_PROCESSING}, \"max_embed_len\": ${MAX_EMBED_LEN}}" +POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"use_activation\": true, \"enable_chunked_processing\": ${VLLM_ENABLE_CHUNKED_PROCESSING}, \"max_embed_len\": ${MAX_EMBED_LEN}}" # Start vLLM server with enhanced chunked processing vllm serve "$MODEL_NAME" \ diff --git a/tests/entrypoints/pooling/embed/test_offline.py b/tests/entrypoints/pooling/embed/test_offline.py index ea46b7401..44328343f 100644 --- a/tests/entrypoints/pooling/embed/test_offline.py +++ b/tests/entrypoints/pooling/embed/test_offline.py @@ -53,7 +53,9 @@ def test_token_embed(llm: LLM): def test_pooling_params(llm: LLM): def get_outputs(normalize): outputs = llm.embed( - prompts, pooling_params=PoolingParams(normalize=normalize), use_tqdm=False + prompts, + pooling_params=PoolingParams(use_activation=normalize), + use_tqdm=False, ) return torch.tensor([x.outputs.embedding for x in outputs]) diff --git a/tests/entrypoints/pooling/embed/test_online_long_text.py b/tests/entrypoints/pooling/embed/test_online_long_text.py index 316a85264..eaefbc023 100644 --- a/tests/entrypoints/pooling/embed/test_online_long_text.py +++ b/tests/entrypoints/pooling/embed/test_online_long_text.py @@ -216,7 +216,7 @@ def server_with_chunked_processing(): "512", # Set smaller max_model_len to trigger chunking mechanism "--pooler-config", ( - '{"pooling_type": "MEAN", "normalize": true, ' + '{"pooling_type": "MEAN", "use_activation": true, ' '"enable_chunked_processing": true, "max_embed_len": 10000}' ), "--gpu-memory-utilization", diff --git a/tests/entrypoints/pooling/score/test_online_score.py b/tests/entrypoints/pooling/score/test_online_score.py index 6c08027ee..053a836f6 100644 --- a/tests/entrypoints/pooling/score/test_online_score.py +++ b/tests/entrypoints/pooling/score/test_online_score.py @@ -236,17 +236,14 @@ class TestModel: "use_activation": use_activation, }, ) - if response.status_code != 200: - return response - outputs = response.json() return torch.tensor([x["score"] for x in outputs["data"]]) - if model["is_cross_encoder"]: - default = get_outputs(use_activation=None) - w_activation = get_outputs(use_activation=True) - wo_activation = get_outputs(use_activation=False) + default = get_outputs(use_activation=None) + w_activation = get_outputs(use_activation=True) + wo_activation = get_outputs(use_activation=False) + if model["is_cross_encoder"]: assert torch.allclose(default, w_activation, atol=1e-2), ( "Default should use activation." ) @@ -256,9 +253,3 @@ class TestModel: assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), ( "w_activation should be close to activation(wo_activation)." ) - else: - get_outputs(use_activation=None) - - # The activation parameter only works for the is_cross_encoder model - response = get_outputs(use_activation=True) - assert response.status_code == 400 diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 0a923cc22..785335e99 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -48,7 +48,7 @@ def test_model_loading_with_params(vllm_runner, monkeypatch): # asserts on the pooling config files assert model_config.pooler_config.seq_pooling_type == "CLS" assert model_config.pooler_config.tok_pooling_type == "ALL" - assert model_config.pooler_config.normalize + assert model_config.pooler_config.use_activation # asserts on the tokenizer loaded assert model_config.tokenizer == "BAAI/bge-base-en-v1.5" @@ -93,7 +93,7 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch): # asserts on the pooling config files assert model_config.pooler_config.seq_pooling_type == "MEAN" assert model_config.pooler_config.tok_pooling_type == "ALL" - assert model_config.pooler_config.normalize + assert model_config.pooler_config.use_activation # asserts on the tokenizer loaded assert model_config.tokenizer == "intfloat/multilingual-e5-base" diff --git a/tests/models/language/pooling/test_pooler_config_init_behaviour.py b/tests/models/language/pooling/test_pooler_config_init_behaviour.py index deb5de984..a5a0c07e0 100644 --- a/tests/models/language/pooling/test_pooler_config_init_behaviour.py +++ b/tests/models/language/pooling/test_pooler_config_init_behaviour.py @@ -66,7 +66,7 @@ def test_embed_models_using_normalize( model, max_model_len=512, dtype=dtype, - pooler_config=PoolerConfig(normalize=False), + pooler_config=PoolerConfig(use_activation=False), ) as vllm_model: wo_normalize = torch.tensor(vllm_model.embed(example_prompts)) @@ -74,7 +74,7 @@ def test_embed_models_using_normalize( model, max_model_len=512, dtype=dtype, - pooler_config=PoolerConfig(normalize=True), + pooler_config=PoolerConfig(use_activation=True), ) as vllm_model: w_normalize = torch.tensor(vllm_model.embed(example_prompts)) @@ -146,7 +146,7 @@ def test_multi_vector_retrieval_models_using_normalize( model, max_model_len=512, dtype=dtype, - pooler_config=PoolerConfig(normalize=False), + pooler_config=PoolerConfig(use_activation=False), ) as vllm_model: wo_normalize = vllm_model.token_embed(example_prompts) @@ -154,7 +154,7 @@ def test_multi_vector_retrieval_models_using_normalize( model, max_model_len=512, dtype=dtype, - pooler_config=PoolerConfig(normalize=True), + pooler_config=PoolerConfig(use_activation=True), ) as vllm_model: w_normalize = vllm_model.token_embed(example_prompts) diff --git a/tests/test_config.py b/tests/test_config.py index 358283b4b..8c1bf6c40 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -162,7 +162,7 @@ def test_get_pooling_config(): model_config = ModelConfig(model_id) assert model_config.pooler_config is not None - assert model_config.pooler_config.normalize + assert model_config.pooler_config.use_activation assert model_config.pooler_config.seq_pooling_type == "MEAN" assert model_config.pooler_config.tok_pooling_type == "ALL" diff --git a/tests/test_pooling_params.py b/tests/test_pooling_params.py index 2c77c6b72..28dedc10e 100644 --- a/tests/test_pooling_params.py +++ b/tests/test_pooling_params.py @@ -18,7 +18,7 @@ EMBEDDING_MODELS = [ ] classify_parameters = ["use_activation"] -embed_parameters = ["dimensions", "normalize"] +embed_parameters = ["dimensions", "use_activation"] step_pooling_parameters = ["step_tag_id", "returned_token_ids"] @@ -42,17 +42,17 @@ def test_embed(): task = "embed" model_config = MockModelConfig(pooler_config=PoolerConfig(seq_pooling_type="CLS")) - pooling_params = PoolingParams(normalize=None) + pooling_params = PoolingParams(use_activation=None) pooling_params.verify(task=task, model_config=model_config) - pooling_params = PoolingParams(normalize=True) + pooling_params = PoolingParams(use_activation=True) pooling_params.verify(task=task, model_config=model_config) - pooling_params = PoolingParams(normalize=False) + pooling_params = PoolingParams(use_activation=False) pooling_params.verify(task=task, model_config=model_config) invalid_parameters = classify_parameters + step_pooling_parameters - for p in invalid_parameters: + for p in set(invalid_parameters) - set(embed_parameters): with pytest.raises(ValueError): pooling_params = PoolingParams(**{p: True}) pooling_params.verify(task=task, model_config=model_config) @@ -98,7 +98,7 @@ def test_classify(task): pooling_params.verify(task=task, model_config=model_config) invalid_parameters = embed_parameters + step_pooling_parameters - for p in invalid_parameters: + for p in set(invalid_parameters) - set(classify_parameters): with pytest.raises(ValueError): pooling_params = PoolingParams(**{p: True}) pooling_params.verify(task=task, model_config=model_config) @@ -111,20 +111,20 @@ def test_token_embed(pooling_type: str): pooler_config=PoolerConfig(tok_pooling_type=pooling_type) ) - pooling_params = PoolingParams(normalize=None) + pooling_params = PoolingParams(use_activation=None) pooling_params.verify(task=task, model_config=model_config) - pooling_params = PoolingParams(normalize=True) + pooling_params = PoolingParams(use_activation=True) pooling_params.verify(task=task, model_config=model_config) - pooling_params = PoolingParams(normalize=False) + pooling_params = PoolingParams(use_activation=False) pooling_params.verify(task=task, model_config=model_config) invalid_parameters = classify_parameters if pooling_type != "STEP": invalid_parameters = classify_parameters + step_pooling_parameters - for p in invalid_parameters: + for p in set(invalid_parameters) - set(embed_parameters): with pytest.raises(ValueError): pooling_params = PoolingParams(**{p: True}) pooling_params.verify(task=task, model_config=model_config) @@ -150,7 +150,7 @@ def test_token_classify(pooling_type: str): if pooling_type != "STEP": invalid_parameters = embed_parameters + step_pooling_parameters - for p in invalid_parameters: + for p in set(invalid_parameters) - set(classify_parameters): with pytest.raises(ValueError): pooling_params = PoolingParams(**{p: True}) pooling_params.verify(task=task, model_config=model_config) diff --git a/vllm/config/pooler.py b/vllm/config/pooler.py index afcc697bb..a3b1f1cbe 100644 --- a/vllm/config/pooler.py +++ b/vllm/config/pooler.py @@ -48,7 +48,7 @@ class PoolerConfig: ## for embeddings models normalize: bool | None = None """ - Whether to normalize the embeddings outputs. Defaults to True. + DEPRECATED: please use `use_activation` instead. """ dimensions: int | None = None """ @@ -75,11 +75,11 @@ class PoolerConfig: ## for classification models softmax: float | None = None """ - softmax will be deprecated, please use use_activation instead. + DEPRECATED: please use `use_activation` instead. """ activation: float | None = None """ - activation will be deprecated, please use use_activation instead. + DEPRECATED: please use `use_activation` instead. """ use_activation: bool | None = None """ @@ -164,17 +164,24 @@ class PoolerConfig: def get_use_activation(o: object): - if softmax := getattr(o, "softmax", None) is not None: + if (normalize := getattr(o, "normalize", None)) is not None: logger.warning_once( - "softmax will be deprecated and will be removed in v0.15. " - "Please use use_activation instead." + "`normalize` is deprecated and will be removed in v0.15. " + "Please use `use_activation` instead." + ) + return normalize + + if (softmax := getattr(o, "softmax", None)) is not None: + logger.warning_once( + "`softmax` is deprecated and will be removed in v0.15. " + "Please use `use_activation` instead." ) return softmax - if activation := getattr(o, "activation", None) is not None: + if (activation := getattr(o, "activation", None)) is not None: logger.warning_once( - "activation will be deprecated and will be removed in v0.15. " - "Please use use_activation instead." + "`activation` is deprecated and will be removed in v0.15. " + "Please use `use_activation` instead." ) return activation diff --git a/vllm/entrypoints/pooling/embed/protocol.py b/vllm/entrypoints/pooling/embed/protocol.py index 3829a1a6a..d8b85ec2b 100644 --- a/vllm/entrypoints/pooling/embed/protocol.py +++ b/vllm/entrypoints/pooling/embed/protocol.py @@ -75,7 +75,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel): return PoolingParams( truncate_prompt_tokens=self.truncate_prompt_tokens, dimensions=self.dimensions, - normalize=self.normalize, + use_activation=self.normalize, ) @@ -189,7 +189,7 @@ class EmbeddingChatRequest(OpenAIBaseModel): return PoolingParams( truncate_prompt_tokens=self.truncate_prompt_tokens, dimensions=self.dimensions, - normalize=self.normalize, + use_activation=self.normalize, ) diff --git a/vllm/entrypoints/pooling/pooling/protocol.py b/vllm/entrypoints/pooling/pooling/protocol.py index 76b361b49..83dafc2ee 100644 --- a/vllm/entrypoints/pooling/pooling/protocol.py +++ b/vllm/entrypoints/pooling/pooling/protocol.py @@ -40,7 +40,6 @@ class PoolingCompletionRequest(EmbeddingCompletionRequest): return PoolingParams( truncate_prompt_tokens=self.truncate_prompt_tokens, dimensions=self.dimensions, - normalize=self.normalize, use_activation=get_use_activation(self), ) @@ -66,7 +65,6 @@ class PoolingChatRequest(EmbeddingChatRequest): return PoolingParams( truncate_prompt_tokens=self.truncate_prompt_tokens, dimensions=self.dimensions, - normalize=self.normalize, use_activation=get_use_activation(self), ) diff --git a/vllm/model_executor/layers/pooler/seqwise/heads.py b/vllm/model_executor/layers/pooler/seqwise/heads.py index 21a94a89e..42059284e 100644 --- a/vllm/model_executor/layers/pooler/seqwise/heads.py +++ b/vllm/model_executor/layers/pooler/seqwise/heads.py @@ -83,7 +83,7 @@ class EmbeddingPoolerHead(SequencePoolerHead): # for normalize if self.activation is not None: - flags = [p.normalize for p in pooling_params] + flags = [p.use_activation for p in pooling_params] if len(set(flags)) == 1: if flags[0]: pooled_data = self.activation(pooled_data) diff --git a/vllm/model_executor/layers/pooler/seqwise/poolers.py b/vllm/model_executor/layers/pooler/seqwise/poolers.py index eb058849e..8bf3e25e6 100644 --- a/vllm/model_executor/layers/pooler/seqwise/poolers.py +++ b/vllm/model_executor/layers/pooler/seqwise/poolers.py @@ -95,8 +95,8 @@ def pooler_for_embed(pooler_config: PoolerConfig): vllm_config = get_current_vllm_config() model_config = vllm_config.model_config head = EmbeddingPoolerHead( - projector=_load_st_projector(model_config), head_dtype=model_config.head_dtype, + projector=_load_st_projector(model_config), activation=PoolerNormalize(), ) @@ -116,9 +116,9 @@ def pooler_for_classify( vllm_config = get_current_vllm_config() model_config = vllm_config.model_config head = ClassifierPoolerHead( + head_dtype=model_config.head_dtype, classifier=classifier, logit_bias=model_config.pooler_config.logit_bias, - head_dtype=model_config.head_dtype, activation=resolve_classifier_act_fn( model_config, static_num_labels=True, act_fn=act_fn ), diff --git a/vllm/model_executor/layers/pooler/tokwise/heads.py b/vllm/model_executor/layers/pooler/tokwise/heads.py index 923b9b977..4183f5b1b 100644 --- a/vllm/model_executor/layers/pooler/tokwise/heads.py +++ b/vllm/model_executor/layers/pooler/tokwise/heads.py @@ -44,14 +44,14 @@ class TokenPoolerHead(nn.Module, ABC): class TokenEmbeddingPoolerHead(TokenPoolerHead): def __init__( self, - projector: ProjectorFn | None = None, head_dtype: torch.dtype | str | None = None, + projector: ProjectorFn | None = None, activation: ActivationFn | None = None, ) -> None: super().__init__() - self.projector = projector self.head_dtype = head_dtype + self.projector = projector self.activation = activation def get_supported_tasks(self) -> Set[PoolingTask]: @@ -79,7 +79,7 @@ class TokenEmbeddingPoolerHead(TokenPoolerHead): pooled_data = pooled_data[..., : pooling_param.dimensions] # for normalize - if self.activation is not None and pooling_param.normalize: + if self.activation is not None and pooling_param.use_activation: pooled_data = self.activation(pooled_data) # pooled_data shape: [n_tokens, embedding_dimension] diff --git a/vllm/model_executor/layers/pooler/tokwise/poolers.py b/vllm/model_executor/layers/pooler/tokwise/poolers.py index 8b4fe5568..20790eff6 100644 --- a/vllm/model_executor/layers/pooler/tokwise/poolers.py +++ b/vllm/model_executor/layers/pooler/tokwise/poolers.py @@ -95,8 +95,8 @@ def pooler_for_token_embed(pooler_config: PoolerConfig): vllm_config = get_current_vllm_config() model_config = vllm_config.model_config head = TokenEmbeddingPoolerHead( - projector=_load_st_projector(model_config), head_dtype=model_config.head_dtype, + projector=_load_st_projector(model_config), activation=PoolerNormalize(), ) @@ -116,9 +116,9 @@ def pooler_for_token_classify( vllm_config = get_current_vllm_config() model_config = vllm_config.model_config head = TokenClassifierPoolerHead( + head_dtype=model_config.head_dtype, classifier=classifier, logit_bias=model_config.pooler_config.logit_bias, - head_dtype=model_config.head_dtype, activation=resolve_classifier_act_fn( model_config, static_num_labels=False, act_fn=act_fn ), diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 10952bcd9..532175e72 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -116,8 +116,8 @@ class BertPooler(SequencePooler): # Use lambdas so that weights are not registered under `self.head` self.head = EmbeddingPoolerHead( - projector=lambda x: self.dense(x), head_dtype=head_dtype, + projector=lambda x: self.dense(x), activation=LambdaPoolerActivation(self.act_fn), ) diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index f0d9ecfa9..f3fec2bfb 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -309,12 +309,13 @@ class ModernBertPooler(SequencePooler): config.hidden_size, eps=config.norm_eps, bias=config.norm_bias, + dtype=head_dtype, ) # Use lambdas so that weights are not registered under `self.head` self.head = EmbeddingPoolerHead( - projector=lambda x: self.dense(x), head_dtype=head_dtype, + projector=lambda x: self.dense(x), activation=LambdaPoolerActivation(lambda x: self.norm(self.act(x))), ) diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 09fd8d0bd..84101e1ae 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -26,9 +26,9 @@ class PoolingParams( Set to None to disable truncation. dimensions: Reduce the dimensions of embeddings if model support matryoshka representation. - normalize: Whether to normalize the embeddings outputs. - softmax: softmax will be deprecated, please use use_activation instead. - activation: activation will be deprecated, please use use_activation instead. + normalize: Deprecated, please use use_activation instead. + softmax: Deprecated, please use use_activation instead. + activation: Deprecated, please use use_activation instead. use_activation: Whether to apply activation function to the classification outputs. """ @@ -63,15 +63,15 @@ class PoolingParams( @property def all_parameters(self) -> list[str]: - return ["dimensions", "normalize", "use_activation"] + return ["dimensions", "use_activation"] @property def valid_parameters(self): return { - "embed": ["dimensions", "normalize"], + "embed": ["dimensions", "use_activation"], "classify": ["use_activation"], "score": ["use_activation"], - "token_embed": ["dimensions", "normalize"], + "token_embed": ["dimensions", "use_activation"], "token_classify": ["use_activation"], } @@ -162,8 +162,8 @@ class PoolingParams( def _set_default_parameters(self, model_config: Optional["ModelConfig"]): if self.task in ["embed", "token_embed"]: - if self.normalize is None: - self.normalize = True + if self.use_activation is None: + self.use_activation = True if self.dimensions is not None and model_config is not None: if not model_config.is_matryoshka: @@ -213,7 +213,6 @@ class PoolingParams( return ( f"PoolingParams(" f"task={self.task}, " - f"normalize={self.normalize}, " f"dimensions={self.dimensions}, " f"use_activation={self.use_activation}, " f"step_tag_id={self.step_tag_id}, " diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 02ffc37d9..0a7cfffd4 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -801,7 +801,7 @@ def get_pooling_config( logger.info("Found pooling configuration.") - config: dict[str, Any] = {"normalize": normalize} + config: dict[str, Any] = {"use_activation": normalize} for key, val in pooling_dict.items(): if val is True: pooling_type = parse_pooling_type(key)