[Core/DBO][1/N] Add Dual-Batch Overlap mechanism to VLLM (#23693)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Signed-off-by: Sage Moore <sage@neuralmagic.com> Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> Signed-off-by: yewentao256 <zhyanwentao@126.com> Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com> Co-authored-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> Co-authored-by: yewentao256 <zhyanwentao@126.com> Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2025-09-16 09:21:48 -07:00
parent 08369289af
commit 567939953b
22 changed files with 1257 additions and 172 deletions
--- a/vllm/config/init.py
+++ b/vllm/config/init.py
@@ -2848,6 +2848,14 @@ class VllmConfig:
                    "when cudagraph_mode piecewise cudagraphs is used, "\
                    f"cudagraph_mode={self.compilation_config.cudagraph_mode}"

+        if self.parallel_config.enable_dbo:
+            a2a_backend = envs.VLLM_ALL2ALL_BACKEND
+            assert a2a_backend == "deepep_low_latency", \
+            "Microbatching currently only supports the deepep_low_latency "\
+            f"all2all backend. {a2a_backend} is not supported. To fix set "\
+            "the VLLM_ALL2ALL_BACKEND environment variable to "\
+            "deepep_low_latency and install the DeepEP kerenls."
+
        if not self.instance_id:
            self.instance_id = random_uuid()[:5]

--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -137,6 +137,14 @@ class ParallelConfig:
    disable_custom_all_reduce: bool = False
    """Disable the custom all-reduce kernel and fall back to NCCL."""

+    enable_dbo: bool = False
+    """Enable microbatching for the model executor."""
+
+    dbo_decode_token_threshold: int = 32
+    """The threshold for microbatching. If the number of tokens in the
+    request is greater than this threshold, microbatching will be used.
+    Otherwise, the request will be processed in a single batch."""
+
    ray_workers_use_nsight: bool = False
    """Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""