From c9d838fc338db9a5a23cb3906d17c47423c4c9e4 Mon Sep 17 00:00:00 2001
From: RonaldBXu <72748153+RonaldBXu@users.noreply.github.com>
Date: Wed, 18 Mar 2026 09:02:03 -0700
Subject: [PATCH] Adding deterministic lora benchmarking to vLLM Bench (#36057)

Signed-off-by: Ubuntu <ubuntu@ip-172-31-43-201.ap-northeast-1.compute.internal>
Signed-off-by: Ronald Xu <ronaldxu@amazon.com>
---
 vllm/benchmarks/datasets.py   | 88 +++++++++++++++++++++++++++++++++--
 vllm/benchmarks/serve.py      | 33 +++++++++++--
 vllm/benchmarks/throughput.py | 10 ++++
 3 files changed, 122 insertions(+), 9 deletions(-)

diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index edd84403f..1e0a63dd6 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -183,6 +183,68 @@ class BenchmarkDataset(ABC):
         )
         return lora_request
 
+    def get_round_robin_lora_request(
+        self,
+        index: int,
+        max_loras: int | None = None,
+        lora_path: str | None = None,
+    ) -> LoRARequest | None:
+        """
+        Optionally select a LoRA request using deterministic round-robin.
+
+        This method cycles through LoRA IDs in order based on the request
+        index, providing reproducible LoRA assignment.
+
+        Args:
+            index (int): The request index used for round-robin selection.
+            max_loras (Optional[int]): The maximum number of LoRAs available.
+                If `None`, LoRA is not used.
+            lora_path (Optional[str]): Path to the LoRA parameters on disk.
+                If `None`, LoRA is not used.
+
+        Returns:
+            A new [`LoRARequest`][vllm.lora.request.LoRARequest]
+            (or `None` if not applicable).
+        """
+        if max_loras is None or lora_path is None:
+            return None
+
+        # Deterministic round-robin: cycle through [1, max_loras]
+        lora_id = index % max_loras + 1
+        lora_request = LoRARequest(
+            lora_name=str(lora_id),
+            lora_int_id=lora_id,
+            lora_path=lora_path_on_disk(lora_path),
+        )
+        return lora_request
+
+    def get_lora_request(
+        self,
+        index: int,
+        max_loras: int | None = None,
+        lora_path: str | None = None,
+        lora_assignment: str = "random",
+    ) -> LoRARequest | None:
+        """
+        Select a LoRA request using the specified assignment strategy.
+
+        Args:
+            index (int): The request index (used for round-robin).
+            max_loras (Optional[int]): The maximum number of LoRAs available.
+            lora_path (Optional[str]): Path to the LoRA parameters on disk.
+            lora_assignment (str): Strategy for LoRA selection.
+                'random' (default) or 'round-robin'.
+
+        Returns:
+            A new [`LoRARequest`][vllm.lora.request.LoRARequest]
+            (or `None` if not applicable).
+        """
+        if lora_assignment == "round-robin":
+            return self.get_round_robin_lora_request(
+                index=index, max_loras=max_loras, lora_path=lora_path
+            )
+        return self.get_random_lora_request(max_loras=max_loras, lora_path=lora_path)
+
     @abstractmethod
     def sample(
         self,
@@ -478,6 +540,9 @@ class RandomDataset(BenchmarkDataset):
         input_len: int = DEFAULT_INPUT_LEN,
         output_len: int = DEFAULT_OUTPUT_LEN,
         batchsize: int = 1,
+        max_loras: int | None = None,
+        lora_path: str | None = None,
+        lora_assignment: str = "random",
         **kwargs,
     ) -> list[SampleRequest]:
         # validate total input tokens (prefix + sampled) is at least 1.
@@ -522,11 +587,18 @@ class RandomDataset(BenchmarkDataset):
                 allowed_tokens=allowed_tokens,
             )
             token_mismatch_total += token_mismatch
+            lora_req = self.get_lora_request(
+                index=i,
+                max_loras=max_loras,
+                lora_path=lora_path,
+                lora_assignment=lora_assignment,
+            )
             requests.append(
                 SampleRequest(
                     prompt=prompt,
                     prompt_len=total_input_len,
                     expected_output_len=int(output_lens[i]),
+                    lora_request=lora_req,
                     request_id=request_id_prefix + str(i),
                 )
             )
@@ -1263,6 +1335,7 @@ class ShareGPTDataset(BenchmarkDataset):
         enable_multimodal_chat: bool = False,
         request_id_prefix: str = "",
         no_oversample: bool = False,
+        lora_assignment: str = "random",
         **kwargs,
     ) -> list:
         samples: list = []
@@ -1275,8 +1348,11 @@ class ShareGPTDataset(BenchmarkDataset):
                 entry["conversations"][1]["value"],
             )
 
-            lora_request = self.get_random_lora_request(
-                max_loras=max_loras, lora_path=lora_path
+            lora_request = self.get_lora_request(
+                index=ind,
+                max_loras=max_loras,
+                lora_path=lora_path,
+                lora_assignment=lora_assignment,
             )
             prompt_ids = tokenizer(prompt).input_ids
             completion_ids = tokenizer(completion).input_ids
@@ -2413,6 +2489,7 @@ class BurstGPTDataset(BenchmarkDataset):
         lora_path: str | None = None,
         request_id_prefix: str = "",
         no_oversample: bool = False,
+        lora_assignment: str = "random",
         **kwargs,
     ) -> list[SampleRequest]:
         samples = []
@@ -2420,8 +2497,11 @@ class BurstGPTDataset(BenchmarkDataset):
         for i in range(num_requests):
             input_len = int(data[i][2])
             output_len = int(data[i][3])
-            lora_req = self.get_random_lora_request(
-                max_loras=max_loras, lora_path=lora_path
+            lora_req = self.get_lora_request(
+                index=i,
+                max_loras=max_loras,
+                lora_path=lora_path,
+                lora_assignment=lora_assignment,
             )
             vocab_size = tokenizer.vocab_size
             # Generate a synthetic prompt: a list of token IDs computed as (i +
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index fca01e17e..53ae6ca6a 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -624,6 +624,7 @@ async def benchmark(
     lora_modules: Iterable[str] | None,
     extra_headers: dict | None,
     extra_body: dict | None,
+    lora_assignment: Literal["random", "round-robin"] = "random",
     ramp_up_strategy: Literal["linear", "exponential"] | None = None,
     ramp_up_start_rps: int | None = None,
     ramp_up_end_rps: int | None = None,
@@ -731,10 +732,20 @@ async def benchmark(
     print("Starting main benchmark run...")
 
     if lora_modules:
-        # For each input request, choose a LoRA module at random.
-        lora_modules = iter(
-            [random.choice(lora_modules) for _ in range(len(input_requests))]
-        )
+        lora_modules_list = list(lora_modules)
+        if lora_assignment == "round-robin":
+            # Deterministic round-robin assignment across requests.
+            lora_modules = iter(
+                [
+                    lora_modules_list[i % len(lora_modules_list)]
+                    for i in range(len(input_requests))
+                ]
+            )
+        else:
+            # For each input request, choose a LoRA module at random.
+            lora_modules = iter(
+                [random.choice(lora_modules_list) for _ in range(len(input_requests))]
+            )
 
     if profile:
         print("Starting profiler...")
@@ -1523,7 +1534,18 @@ def add_cli_args(parser: argparse.ArgumentParser):
         default=None,
         help="A subset of LoRA module names passed in when "
         "launching the server. For each request, the "
-        "script chooses a LoRA module at random.",
+        "script chooses a LoRA module at random by default. "
+        "Use --lora-assignment to control selection strategy.",
+    )
+
+    parser.add_argument(
+        "--lora-assignment",
+        type=str,
+        default="random",
+        choices=["random", "round-robin"],
+        help="Strategy for assigning LoRA modules to requests. "
+        "'random' (default) selects a LoRA at random for each request. "
+        "'round-robin' cycles through LoRA modules deterministically.",
     )
 
     parser.add_argument(
@@ -1788,6 +1810,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
         goodput_config_dict=goodput_config_dict,
         max_concurrency=args.max_concurrency,
         lora_modules=args.lora_modules,
+        lora_assignment=args.lora_assignment,
         extra_headers=headers,
         extra_body=extra_body,
         ramp_up_strategy=args.ramp_up_strategy,
diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py
index ad6f44404..1af8cf900 100644
--- a/vllm/benchmarks/throughput.py
+++ b/vllm/benchmarks/throughput.py
@@ -350,6 +350,7 @@ def get_requests(args, tokenizer):
         "tokenizer": tokenizer,
         "lora_path": args.lora_path,
         "max_loras": args.max_loras,
+        "lora_assignment": getattr(args, "lora_assignment", "random"),
         "num_requests": args.num_prompts,
     }
 
@@ -778,6 +779,15 @@ def add_cli_args(parser: argparse.ArgumentParser):
         help="Path to the lora adapters to use. This can be an absolute path, "
         "a relative path, or a Hugging Face model identifier.",
     )
+    parser.add_argument(
+        "--lora-assignment",
+        type=str,
+        default="random",
+        choices=["random", "round-robin"],
+        help="Strategy for assigning LoRA adapters to requests. "
+        "'random' (default) selects a LoRA at random for each request. "
+        "'round-robin' cycles through LoRAs deterministically.",
+    )
     parser.add_argument(
         "--prefix-len",
         type=int,