[Feature] Add load generation config from model (#11164)
Signed-off-by: liuyanyi <wolfsonliu@163.com>
Signed-off-by: Yanyi Liu <wolfsonliu@163.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
@@ -27,7 +27,8 @@ from vllm.tracing import is_otel_available, otel_import_error_traceback
 from vllm.transformers_utils.config import (
     ConfigFormat, get_config, get_hf_image_processor_config,
     get_hf_text_config, get_pooling_config,
-    get_sentence_transformer_tokenizer_config, is_encoder_decoder, uses_mrope)
+    get_sentence_transformer_tokenizer_config, is_encoder_decoder,
+    try_get_generation_config, uses_mrope)
 from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless,
                         get_cpu_memory, print_warning_once, random_uuid,
                         resolve_obj_by_qualname)
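For context, `try_get_generation_config` (newly imported above) is a helper in `vllm.transformers_utils.config` that loads a model's `generation_config.json` and returns `None` when no such file exists. A minimal sketch of what such a helper could look like, assuming the `transformers` `GenerationConfig` API; the actual vLLM implementation may differ:

```python
from typing import Optional

from transformers import GenerationConfig


def try_get_generation_config(
    model: str,
    trust_remote_code: bool = False,  # mirrored from the call sites below; unused in this sketch
    revision: Optional[str] = None,
) -> Optional[GenerationConfig]:
    """Load generation_config.json for `model`, or return None if absent."""
    try:
        return GenerationConfig.from_pretrained(model, revision=revision)
    except OSError:
        # The model repo or local path ships no generation_config.json.
        return None
```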
@@ -160,6 +161,7 @@ class ModelConfig:
             logits processor qualified names that can be passed with the
             `logits_processors` extra completion argument. Defaults to None,
             which allows no processors.
+        generation_config: Configuration parameter file for generation.
     """
 
     def compute_hash(self) -> str:
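Per the new docstring entry, `generation_config` names a file (or `"auto"`) from which generation defaults are read. As `try_get_generation_config` further below shows, `"auto"` loads `generation_config.json` from the model itself, a string path loads it from that location, and `None` leaves the defaults unapplied. A quick way to inspect what `"auto"` would pick up, using the `transformers` API (the model name and printed values are illustrative):

```python
from transformers import GenerationConfig

# to_diff_dict() keeps only the fields that differ from transformers'
# defaults, the same view ModelConfig.try_get_generation_config returns below.
gc = GenerationConfig.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
print(gc.to_diff_dict())  # e.g. {"temperature": 0.7, "top_p": 0.8, ...}
```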
@@ -218,7 +220,8 @@ class ModelConfig:
                  disable_mm_preprocessor_cache: bool = False,
                  override_neuron_config: Optional[Dict[str, Any]] = None,
                  override_pooler_config: Optional["PoolerConfig"] = None,
-                 logits_processor_pattern: Optional[str] = None) -> None:
+                 logits_processor_pattern: Optional[str] = None,
+                 generation_config: Optional[str] = None) -> None:
         self.model = model
         self.tokenizer = tokenizer
         self.tokenizer_mode = tokenizer_mode
@@ -348,6 +351,8 @@ class ModelConfig:
         self.pooler_config = self._init_pooler_config(override_pooler_config)
         self.logits_processor_pattern = logits_processor_pattern
 
+        self.generation_config = generation_config
+
         self._verify_quantization()
         self._verify_cuda_graph()
         self._verify_bnb_config()
@@ -813,6 +818,56 @@ class ModelConfig:
 
         return self.multimodal_config
 
+    def try_get_generation_config(self) -> Dict[str, Any]:
+        if self.generation_config is None or self.generation_config == "auto":
+            config = try_get_generation_config(
+                self.model,
+                trust_remote_code=self.trust_remote_code,
+                revision=self.revision,
+            )
+        else:
+            config = try_get_generation_config(
+                self.generation_config,
+                trust_remote_code=self.trust_remote_code,
+            )
+
+        if config is None:
+            return {}
+
+        return config.to_diff_dict()
+
+    def get_diff_sampling_param(self) -> Dict[str, Any]:
+        """
+        This method returns a dictionary containing the parameters
+        that differ from the default sampling parameters, but only
+        if `generation_config` is set. If `generation_config` is not
+        set, an empty dictionary is returned.
+
+        Returns:
+            Dict[str, Any]: A dictionary with the differing sampling
+            parameters if `generation_config` is set, otherwise an
+            empty dictionary.
+        """
+        if self.generation_config is None:
+            # When generation_config is not set
+            return {}
+        config = self.try_get_generation_config()
+        available_params = [
+            "repetition_penalty",
+            "temperature",
+            "top_k",
+            "top_p",
+            "min_p",
+        ]
+        if any(p in config for p in available_params):
+            diff_sampling_param = {
+                p: config.get(p)
+                for p in available_params if config.get(p) is not None
+            }
+        else:
+            diff_sampling_param = {}
+        return diff_sampling_param
+
     @property
     def is_encoder_decoder(self) -> bool:
         """Extract the HF encoder/decoder model flag."""
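To make the filtering in `get_diff_sampling_param` concrete, here is a standalone reproduction of its core logic with a made-up generation config; only the five listed sampling knobs survive, and callers can then merge them into their default sampling parameters:

```python
# Standalone sketch of the filtering performed by get_diff_sampling_param.
config = {
    "temperature": 0.6,
    "top_p": 0.9,
    # Fields like eos_token_id also appear in generation_config.json but are
    # not listed in available_params, so they are dropped.
    "eos_token_id": 2,
}
available_params = [
    "repetition_penalty",
    "temperature",
    "top_k",
    "top_p",
    "min_p",
]
diff_sampling_param = {
    p: config.get(p)
    for p in available_params if config.get(p) is not None
}
assert diff_sampling_param == {"temperature": 0.6, "top_p": 0.9}
```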