diff --git a/vllm/config/model.py b/vllm/config/model.py
index 883e0b17e..b6e5ca74f 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -1339,10 +1339,9 @@ class ModelConfig:
         Returns:
             A dictionary containing the non-default sampling parameters.
         """
-        if self.generation_config == "vllm":
-            config = {}
-        else:
-            config = self.try_get_generation_config()
+        src = self.generation_config
+
+        config = {} if src == "vllm" else self.try_get_generation_config()
 
         # Overriding with given generation config
         config.update(self.override_generation_config)
@@ -1368,13 +1367,16 @@ class ModelConfig:
         else:
             diff_sampling_param = {}
 
-        if diff_sampling_param:
+        if diff_sampling_param and src != "vllm":
             logger.warning_once(
-                "Default sampling parameters have been overridden by the "
-                "model's Hugging Face generation config recommended from the "
-                "model creator. If this is not intended, please relaunch "
-                "vLLM instance with `--generation-config vllm`."
+                "Default vLLM sampling parameters have been overridden by %s: `%s`. "
+                "If this is not intended, please relaunch vLLM instance "
+                "with `--generation-config vllm`.",
+                "the model's `generation_config.json`" if src == "auto" else src,
+                str(diff_sampling_param),
+                scope="local",
             )
+
         return diff_sampling_param
 
     @property
diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py
index a15c99c24..265cee554 100644
--- a/vllm/entrypoints/openai/chat_completion/serving.py
+++ b/vllm/entrypoints/openai/chat_completion/serving.py
@@ -143,14 +143,6 @@ class OpenAIServingChat(OpenAIServing):
         self.enable_prompt_tokens_details = enable_prompt_tokens_details
         self.enable_force_include_usage = enable_force_include_usage
         self.default_sampling_params = self.model_config.get_diff_sampling_param()
-        if self.default_sampling_params:
-            source = self.model_config.generation_config
-            source = "model" if source == "auto" else source
-            logger.info(
-                "Using default chat sampling params from %s: %s",
-                source,
-                self.default_sampling_params,
-            )
         if self.model_config.hf_config.model_type == "kimi_k2":
             self.tool_call_id_type = "kimi_k2"
         else:
diff --git a/vllm/entrypoints/openai/completion/serving.py b/vllm/entrypoints/openai/completion/serving.py
index fb14a2307..92156c7f2 100644
--- a/vllm/entrypoints/openai/completion/serving.py
+++ b/vllm/entrypoints/openai/completion/serving.py
@@ -72,16 +72,9 @@ class OpenAIServingCompletion(OpenAIServing):
         self.logits_processors = self.model_config.logits_processors
 
         self.enable_prompt_tokens_details = enable_prompt_tokens_details
-        self.default_sampling_params = self.model_config.get_diff_sampling_param()
         self.enable_force_include_usage = enable_force_include_usage
-        if self.default_sampling_params:
-            source = self.model_config.generation_config
-            source = "model" if source == "auto" else source
-            logger.info(
-                "Using default completion sampling params from %s: %s",
-                source,
-                self.default_sampling_params,
-            )
+
+        self.default_sampling_params = self.model_config.get_diff_sampling_param()
 
     async def render_completion_request(
         self,
diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py
index cb0317f9f..9fa748f87 100644
--- a/vllm/entrypoints/openai/responses/serving.py
+++ b/vllm/entrypoints/openai/responses/serving.py
@@ -221,15 +221,8 @@ class OpenAIServingResponses(OpenAIServing):
         )
         self.enable_prompt_tokens_details = enable_prompt_tokens_details
         self.enable_force_include_usage = enable_force_include_usage
+
         self.default_sampling_params = self.model_config.get_diff_sampling_param()
-        if self.default_sampling_params:
-            source = self.model_config.generation_config
-            source = "model" if source == "auto" else source
-            logger.info(
-                "Using default chat sampling params from %s: %s",
-                source,
-                self.default_sampling_params,
-            )
 
         # If False (default), the "store" option is (silently) ignored and the
         # response is not stored.
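
For context, a minimal standalone sketch of the behavior this patch converges on: the override warning is emitted once from `ModelConfig.get_diff_sampling_param()` itself, keyed on where the defaults came from, so the chat/completion/responses serving classes no longer log it separately. The free function and `SAMPLING_KEYS` below are hypothetical stand-ins for illustration, not the actual vLLM API.

```python
# Hypothetical, self-contained sketch of the consolidated logic; the names
# SAMPLING_KEYS and get_diff_sampling_param are illustrative stand-ins only.
import logging

logger = logging.getLogger(__name__)

SAMPLING_KEYS = ("temperature", "top_p", "top_k", "repetition_penalty")


def get_diff_sampling_param(
    generation_config: str,
    hf_generation_config: dict | None,
    override_generation_config: dict,
) -> dict:
    src = generation_config  # "vllm", "auto", or a path/model name

    # `--generation-config vllm` discards the model's shipped defaults entirely.
    config = {} if src == "vllm" else dict(hf_generation_config or {})

    # Explicit overrides always win over whatever the model ships.
    config.update(override_generation_config)

    diff_sampling_param = {k: v for k, v in config.items() if k in SAMPLING_KEYS}

    # Warn only when non-default values actually came from outside vLLM
    # (vLLM uses a warn-once logger here; plain warning() keeps the sketch simple).
    if diff_sampling_param and src != "vllm":
        source = "the model's `generation_config.json`" if src == "auto" else src
        logger.warning(
            "Default vLLM sampling parameters have been overridden by %s: `%s`. "
            "If this is not intended, please relaunch vLLM instance "
            "with `--generation-config vllm`.",
            source,
            diff_sampling_param,
        )
    return diff_sampling_param


if __name__ == "__main__":
    # Warns: defaults come from the model's generation config.
    print(get_diff_sampling_param("auto", {"temperature": 0.6, "top_p": 0.95}, {}))
    # Silent: the model's defaults are ignored under --generation-config vllm.
    print(get_diff_sampling_param("vllm", {"temperature": 0.6}, {}))
```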