[V1] Revert the default max_num_seqs to V0 values for most hardware (#16158)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Author: Cyrus Leung
Date: 2025-04-08 01:54:36 +08:00
Committed by: GitHub
Parent: 027b204ff1
Commit: 66d433b94f
3 changed files with 6 additions and 10 deletions

vllm/engine/arg_utils.py

@@ -1666,12 +1666,14 @@ class EngineArgs:
                     UsageContext.LLM_CLASS: 16384,
                     UsageContext.OPENAI_API_SERVER: 8192,
                 }
+                default_max_num_seqs = 1024
             else:
                 # TODO(woosuk): Tune the default values for other hardware.
                 default_max_num_batched_tokens = {
                     UsageContext.LLM_CLASS: 8192,
                     UsageContext.OPENAI_API_SERVER: 2048,
                 }
+                default_max_num_seqs = 256

             use_context_value = usage_context.value if usage_context else None
             if (self.max_num_batched_tokens is None
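
For readers skimming the hunk: below is a minimal standalone sketch of the selection logic after this commit. The is_tpu() guard on the first branch is an assumption inferred from the commit message ("most hardware" reverts, one platform keeps the tuned value); the guard itself is not visible in this excerpt.

    from enum import Enum

    class UsageContext(Enum):
        LLM_CLASS = "LLM_CLASS"
        OPENAI_API_SERVER = "OPENAI_API_SERVER"

    def pick_defaults(is_tpu: bool, usage_context: UsageContext) -> tuple[int, int]:
        # Hypothetical standalone version of the branch above. Only the
        # TPU-style branch keeps the V1-tuned default_max_num_seqs (1024);
        # all other hardware reverts to the V0 default (256), which is
        # exactly what this commit changes.
        if is_tpu:  # assumption: the real guard is a platform check
            default_max_num_batched_tokens = {
                UsageContext.LLM_CLASS: 16384,
                UsageContext.OPENAI_API_SERVER: 8192,
            }
            default_max_num_seqs = 1024
        else:
            default_max_num_batched_tokens = {
                UsageContext.LLM_CLASS: 8192,
                UsageContext.OPENAI_API_SERVER: 2048,
            }
            default_max_num_seqs = 256
        return default_max_num_batched_tokens[usage_context], default_max_num_seqs

    print(pick_defaults(False, UsageContext.OPENAI_API_SERVER))  # -> (2048, 256)
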
@@ -1682,7 +1684,6 @@ class EngineArgs:
"Setting max_num_batched_tokens to %d for %s usage context.",
self.max_num_batched_tokens, use_context_value)
default_max_num_seqs = 1024
if self.max_num_seqs is None:
self.max_num_seqs = default_max_num_seqs
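
The second hunk moves the fallback into the branches: the flat default_max_num_seqs = 1024 assignment is deleted, so the per-hardware value chosen above is what feeds the None check. A short hedged illustration of that fallback (names reused from the diff; the standalone variables are hypothetical):

    # An explicit user value always wins; the per-hardware default applies
    # only when the argument was left unset.
    max_num_seqs = None          # user did not pass --max-num-seqs
    default_max_num_seqs = 256   # non-TPU hardware after this commit
    if max_num_seqs is None:
        max_num_seqs = default_max_num_seqs
    assert max_num_seqs == 256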