[Model] Support math-shepherd-mistral-7b-prm model (#9697)

Signed-off-by: Went-Liang <wenteng_liang@163.com>
2024-10-31 00:33:42 +08:00
parent cc98f1e079
commit 81f09cfd80
14 changed files with 312 additions and 62 deletions
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -184,6 +184,13 @@ class EngineArgs:
    mm_processor_kwargs: Optional[Dict[str, Any]] = None
    scheduling_policy: Literal["fcfs", "priority"] = "fcfs"

+    # Pooling configuration.
+    pooling_type: Optional[str] = None
+    pooling_norm: Optional[bool] = None
+    pooling_softmax: Optional[bool] = None
+    pooling_step_tag_id: Optional[int] = None
+    pooling_returned_token_ids: Optional[List[int]] = None
+
    def __post_init__(self):
        if not self.tokenizer:
            self.tokenizer = self.model
@@ -850,6 +857,58 @@ class EngineArgs:
            'priority (lower value means earlier handling) and time of '
            'arrival deciding any ties).')

+        parser.add_argument(
+            '--pooling-type',
+            choices=['LAST', 'ALL', 'CLS', 'STEP'],
+            default=None,
+            help='Used to configure the pooling method in the embedding model.'
+        )
+
+        parser.add_argument('--pooling-norm',
+                            default=None,
+                            action='store_true',
+                            help="Used to determine whether to normalize "
+                            "the pooled data in the embedding model.")
+
+        parser.add_argument('--no-pooling-norm',
+                            default=None,
+                            action='store_false',
+                            dest='pooling_norm',
+                            help="Used to determine whether to normalize "
+                            "the pooled data in the embedding model.")
+
+        parser.add_argument('--pooling-softmax',
+                            default=None,
+                            action='store_true',
+                            help="Used to determine whether to softmax "
+                            "the pooled data in the embedding model.")
+
+        parser.add_argument('--no-pooling-softmax',
+                            default=None,
+                            action='store_false',
+                            dest='pooling_softmax',
+                            help="Used to determine whether to softmax "
+                            "the pooled data in the embedding model.")
+
+        parser.add_argument(
+            '--pooling-step-tag-id',
+            type=int,
+            default=None,
+            help="When pooling-step-tag-id is not -1, it indicates "
+            "that the score corresponding to the step-tag-ids in the "
+            "generated sentence should be returned. Otherwise, it "
+            "returns the scores for all tokens.")
+
+        parser.add_argument(
+            '--pooling-returned-token-ids',
+            nargs='+',
+            type=int,
+            default=None,
+            help="pooling-returned-token-ids represents a list of "
+            "indices for the vocabulary dimensions to be extracted, "
+            "such as the token IDs of good_token and bad_token in "
+            "the math-shepherd-mistral-7b-prm model.")
+
        return parser

    @classmethod
@@ -891,6 +950,11 @@ class EngineArgs:
            override_neuron_config=self.override_neuron_config,
            config_format=self.config_format,
            mm_processor_kwargs=self.mm_processor_kwargs,
+            pooling_type=self.pooling_type,
+            pooling_norm=self.pooling_norm,
+            pooling_softmax=self.pooling_softmax,
+            pooling_step_tag_id=self.pooling_step_tag_id,
+            pooling_returned_token_ids=self.pooling_returned_token_ids,
        )

    def create_load_config(self) -> LoadConfig: