[Model] Add user-configurable task for models that support both generation and embedding (#9424)

Authored by Cyrus Leung on 2024-10-19 02:31:58 +08:00; committed by GitHub
parent 7dbe738d65
commit 051eaf6db3
33 changed files with 451 additions and 201 deletions
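At a high level, this change lets a single checkpoint (e.g. Phi-3-Vision and its VLM2Vec embedding fine-tune, which share the `Phi3VForCausalLM` architecture) be loaded either for text generation or for embedding, selected per vLLM instance instead of being inferred from the model name. A minimal usage sketch, assuming the new `task` argument is exposed through the `LLM` entrypoint as this PR intends (model names are illustrative):

```python
from vllm import LLM

# Each vLLM instance serves exactly one task. "auto" works when the model
# supports only a single task; otherwise the task must be given explicitly.
embedder = LLM(
    model="TIGER-Lab/VLM2Vec-Full",  # Phi3VForCausalLM used as an embedding model
    task="embedding",
    trust_remote_code=True,
    max_model_len=4096,
)

generator = LLM(
    model="microsoft/Phi-3-vision-128k-instruct",  # same architecture, generation task
    task="generate",
    trust_remote_code=True,
    max_model_len=4096,
)
```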


@@ -1,8 +1,8 @@
 import enum
 import json
 from dataclasses import dataclass, field, fields
-from typing import (TYPE_CHECKING, Any, ClassVar, Dict, List, Mapping,
-                    Optional, Tuple, Type, Union)
+from typing import (TYPE_CHECKING, Any, ClassVar, Dict, Final, List, Literal,
+                    Mapping, Optional, Set, Tuple, Type, Union)

 import torch
 from transformers import PretrainedConfig
@@ -33,6 +33,9 @@ logger = init_logger(__name__)
 _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
 _MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120

+Task = Literal["generate", "embedding"]
+TaskOption = Literal["auto", Task]
+

 class ModelConfig:
     """Configuration for the model.
@@ -40,7 +43,11 @@ class ModelConfig:
     Args:
         model: Name or path of the huggingface model to use.
            It is also used as the content for `model_name` tag in metrics
-           output when `served_model_name` is not specified.
+           output when `served_model_name` is not specified.
+        task: The task to use the model for. Each vLLM instance only supports
+            one task, even if the same model can be used for multiple tasks.
+            When the model only supports one task, "auto" can be used to select
+            it; otherwise, you must specify explicitly which task to use.
         tokenizer: Name or path of the huggingface tokenizer to use.
         tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
             available, "slow" will always use the slow tokenizer, and
@@ -108,6 +115,7 @@ class ModelConfig:

     def __init__(self,
                  model: str,
+                 task: TaskOption,
                  tokenizer: str,
                  tokenizer_mode: str,
                  trust_remote_code: bool,
@@ -207,7 +215,11 @@ class ModelConfig:
         self.override_neuron_config = override_neuron_config if is_neuron(
         ) else None
-        self._verify_embedding_mode()
+
+        supported_tasks, task = self._resolve_task(task, self.hf_config)
+        self.supported_tasks = supported_tasks
+        self.task: Final = task
+
         self._verify_quantization()
         self._verify_cuda_graph()
         self._verify_bnb_config()
@@ -241,18 +253,41 @@
                 "either 'auto', 'slow' or 'mistral'.")
         self.tokenizer_mode = tokenizer_mode

-    def _verify_embedding_mode(self) -> None:
-        architectures = getattr(self.hf_config, "architectures", [])
+    def _resolve_task(
+        self,
+        task_option: TaskOption,
+        hf_config: PretrainedConfig,
+    ) -> Tuple[Set[Task], Task]:
+        architectures = getattr(hf_config, "architectures", [])

-        # TODO: Allow the same model architecture to be specified as either
-        # generation or embedding model
-        if "Phi3VForCausalLM" in architectures:
-            # Match both remote and local names
-            embedding_mode = "/VLM2Vec" in self.model
+        task_support: Dict[Task, bool] = {
+            # NOTE: Listed from highest to lowest priority,
+            # in case the model supports multiple of them
+            "generate": ModelRegistry.is_text_generation_model(architectures),
+            "embedding": ModelRegistry.is_embedding_model(architectures),
+        }
+        supported_tasks_lst: List[Task] = [
+            task for task, is_supported in task_support.items() if is_supported
+        ]
+        supported_tasks = set(supported_tasks_lst)
+
+        if task_option == "auto":
+            selected_task = next(iter(supported_tasks_lst))
+
+            if len(supported_tasks) > 1:
+                logger.info(
+                    "This model supports multiple tasks: %s. "
+                    "Defaulting to '%s'.", supported_tasks, selected_task)
         else:
-            embedding_mode = ModelRegistry.is_embedding_model(architectures)
+            if task_option not in supported_tasks:
+                msg = (
+                    f"This model does not support the '{task_option}' task. "
+                    f"Supported tasks: {supported_tasks}")
+                raise ValueError(msg)

-        self.embedding_mode = embedding_mode
+            selected_task = task_option
+
+        return supported_tasks, selected_task

     def _parse_quant_hf_config(self):
         quant_cfg = getattr(self.hf_config, "quantization_config", None)
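Taken on its own, `_resolve_task` implements a small rule: collect the supported tasks in priority order (generation before embedding), pick the first one for "auto", and validate an explicit request otherwise. A self-contained sketch of that rule, with the `ModelRegistry` lookups replaced by hypothetical boolean flags:

```python
from typing import Set, Tuple

def resolve_task(task_option: str, supports_generate: bool,
                 supports_embedding: bool) -> Tuple[Set[str], str]:
    # Priority order mirrors the dict in the hunk above: generation first.
    task_support = {
        "generate": supports_generate,
        "embedding": supports_embedding,
    }
    supported = [task for task, ok in task_support.items() if ok]
    if task_option == "auto":
        # First supported task wins, so multi-task models default to "generate".
        return set(supported), supported[0]
    if task_option not in supported:
        raise ValueError(f"This model does not support the '{task_option}' "
                         f"task. Supported tasks: {set(supported)}")
    return set(supported), task_option

assert resolve_task("auto", True, True) == ({"generate", "embedding"}, "generate")
assert resolve_task("embedding", True, True)[1] == "embedding"
```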
@@ -401,7 +436,7 @@ class ModelConfig:
         # Async postprocessor is not necessary with embedding mode
         # since there is no token generation
-        if self.embedding_mode:
+        if self.task == "embedding":
             self.use_async_output_proc = False

         # Reminder: Please update docs/source/serving/compatibility_matrix.rst
@@ -582,11 +617,6 @@ class ModelConfig:
                 (hasattr(self.hf_config, "text_config") and getattr(
                     self.hf_config.text_config, "is_encoder_decoder", False)))

-    @property
-    def is_embedding_model(self) -> bool:
-        """Extract the embedding model flag."""
-        return self.embedding_mode
-
     @property
     def is_multimodal_model(self) -> bool:
         return self.multimodal_config is not None
@@ -943,6 +973,7 @@ class SchedulerConfig:
     """Scheduler configuration.

     Args:
+        task: The task to use the model for.
         max_num_batched_tokens: Maximum number of tokens to be processed in
             a single iteration.
         max_num_seqs: Maximum number of sequences to be processed in a single
@@ -957,7 +988,6 @@ class SchedulerConfig:
             prompt latency) before scheduling next prompt.
         enable_chunked_prefill: If True, prefill requests can be chunked based
             on the remaining max_num_batched_tokens.
-        embedding_mode: Whether the running model is for embedding.
         preemption_mode: Whether to perform preemption by swapping or
             recomputation. If not specified, we determine the mode as follows:
             We use recomputation by default since it incurs lower overhead than
@@ -972,13 +1002,13 @@ class SchedulerConfig:
     """

     def __init__(self,
+                 task: Task,
                  max_num_batched_tokens: Optional[int],
                  max_num_seqs: int,
                  max_model_len: int,
                  num_lookahead_slots: int = 0,
                  delay_factor: float = 0.0,
                  enable_chunked_prefill: bool = False,
-                 embedding_mode: bool = False,
                  is_multimodal_model: bool = False,
                  preemption_mode: Optional[str] = None,
                  num_scheduler_steps: int = 1,
@@ -1002,7 +1032,7 @@ class SchedulerConfig:
                 # for higher throughput.
                 max_num_batched_tokens = max(max_model_len, 2048)

-            if embedding_mode:
+            if task == "embedding":
                 # For embedding, choose specific value for higher throughput
                 max_num_batched_tokens = max(
                     max_num_batched_tokens,
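When no `max_num_batched_tokens` is given and chunked prefill is off, the default now depends on the task rather than on an `embedding_mode` flag: embedding models, which never decode tokens, get a much larger prompt-token budget. A small sketch of just this branch, using the constant defined near the top of the file (the helper name is hypothetical):

```python
_EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768  # same constant as above

def default_max_num_batched_tokens(task: str, max_model_len: int) -> int:
    # Non-chunked-prefill default: at least 2048 tokens per step.
    value = max(max_model_len, 2048)
    if task == "embedding":
        # Embedding has no decode phase, so allow a much larger prefill batch.
        value = max(value, _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS)
    return value

assert default_max_num_batched_tokens("generate", 4096) == 4096
assert default_max_num_batched_tokens("embedding", 4096) == 32768
```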
@@ -1022,12 +1052,12 @@ class SchedulerConfig:
                 "Chunked prefill is enabled with max_num_batched_tokens=%d.",
                 self.max_num_batched_tokens)

+        self.task: Final = task
         self.max_num_seqs = max_num_seqs
         self.max_model_len = max_model_len
         self.num_lookahead_slots = num_lookahead_slots
         self.delay_factor = delay_factor
         self.chunked_prefill_enabled = enable_chunked_prefill
-        self.embedding_mode = embedding_mode
         self.preemption_mode = preemption_mode
         self.num_scheduler_steps = num_scheduler_steps
         self.multi_step_stream_outputs = multi_step_stream_outputs
@@ -1239,6 +1269,7 @@ class SpeculativeConfig:
                 ngram_prompt_lookup_min = 0
             draft_model_config = ModelConfig(
                 model=speculative_model,
+                task=target_model_config.task,
                 tokenizer=target_model_config.tokenizer,
                 tokenizer_mode=target_model_config.tokenizer_mode,
                 trust_remote_code=target_model_config.trust_remote_code,