From c9d838fc338db9a5a23cb3906d17c47423c4c9e4 Mon Sep 17 00:00:00 2001
From: RonaldBXu <72748153+RonaldBXu@users.noreply.github.com>
Date: Wed, 18 Mar 2026 09:02:03 -0700
Subject: [PATCH] Adding deterministic lora benchmarking to vLLM Bench (#36057)

Signed-off-by: Ubuntu
Signed-off-by: Ronald Xu
---
Review notes (free-text area; ignored by `git am`, not part of the commit):

* The line structure of this patch was reconstructed from a copy whose
  newlines and indentation had been collapsed. Added/removed/context line
  counts match every @@ header and the diffstat below, but leading
  whitespace is inferred from Python structure. Run `git apply --check`
  (add `--ignore-whitespace` if a context line fails to match) before use.
* get_lora_request() only compares against "round-robin"; every other
  lora_assignment value falls through to get_random_lora_request().
* Round-robin IDs are `index % max_loras + 1`. max_loras == 0 would raise
  ZeroDivisionError; nothing in this patch validates it.
* benchmark() in serve.py gains `lora_assignment` ahead of
  `ramp_up_strategy`, so the positional order of the defaulted parameters
  after `extra_body` shifts by one. main_async() passes it by keyword.
* throughput.py reads the option with
  getattr(args, "lora_assignment", "random"), so an args namespace without
  the attribute keeps the random behaviour.

 vllm/benchmarks/datasets.py   | 88 +++++++++++++++++++++++++++++++++--
 vllm/benchmarks/serve.py      | 33 +++++++++++--
 vllm/benchmarks/throughput.py | 10 ++++
 3 files changed, 122 insertions(+), 9 deletions(-)

diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index edd84403f..1e0a63dd6 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -183,6 +183,68 @@ class BenchmarkDataset(ABC):
         )
         return lora_request
 
+    def get_round_robin_lora_request(
+        self,
+        index: int,
+        max_loras: int | None = None,
+        lora_path: str | None = None,
+    ) -> LoRARequest | None:
+        """
+        Optionally select a LoRA request using deterministic round-robin.
+
+        This method cycles through LoRA IDs in order based on the request
+        index, providing reproducible LoRA assignment.
+
+        Args:
+            index (int): The request index used for round-robin selection.
+            max_loras (Optional[int]): The maximum number of LoRAs available.
+                If `None`, LoRA is not used.
+            lora_path (Optional[str]): Path to the LoRA parameters on disk.
+                If `None`, LoRA is not used.
+
+        Returns:
+            A new [`LoRARequest`][vllm.lora.request.LoRARequest]
+            (or `None` if not applicable).
+        """
+        if max_loras is None or lora_path is None:
+            return None
+
+        # Deterministic round-robin: cycle through [1, max_loras]
+        lora_id = index % max_loras + 1
+        lora_request = LoRARequest(
+            lora_name=str(lora_id),
+            lora_int_id=lora_id,
+            lora_path=lora_path_on_disk(lora_path),
+        )
+        return lora_request
+
+    def get_lora_request(
+        self,
+        index: int,
+        max_loras: int | None = None,
+        lora_path: str | None = None,
+        lora_assignment: str = "random",
+    ) -> LoRARequest | None:
+        """
+        Select a LoRA request using the specified assignment strategy.
+
+        Args:
+            index (int): The request index (used for round-robin).
+            max_loras (Optional[int]): The maximum number of LoRAs available.
+            lora_path (Optional[str]): Path to the LoRA parameters on disk.
+            lora_assignment (str): Strategy for LoRA selection.
+                'random' (default) or 'round-robin'.
+
+        Returns:
+            A new [`LoRARequest`][vllm.lora.request.LoRARequest]
+            (or `None` if not applicable).
+        """
+        if lora_assignment == "round-robin":
+            return self.get_round_robin_lora_request(
+                index=index, max_loras=max_loras, lora_path=lora_path
+            )
+        return self.get_random_lora_request(max_loras=max_loras, lora_path=lora_path)
+
     @abstractmethod
     def sample(
         self,
@@ -478,6 +540,9 @@ class RandomDataset(BenchmarkDataset):
         input_len: int = DEFAULT_INPUT_LEN,
         output_len: int = DEFAULT_OUTPUT_LEN,
         batchsize: int = 1,
+        max_loras: int | None = None,
+        lora_path: str | None = None,
+        lora_assignment: str = "random",
         **kwargs,
     ) -> list[SampleRequest]:
         # validate total input tokens (prefix + sampled) is at least 1.
@@ -522,11 +587,18 @@ class RandomDataset(BenchmarkDataset):
                 allowed_tokens=allowed_tokens,
             )
             token_mismatch_total += token_mismatch
+            lora_req = self.get_lora_request(
+                index=i,
+                max_loras=max_loras,
+                lora_path=lora_path,
+                lora_assignment=lora_assignment,
+            )
             requests.append(
                 SampleRequest(
                     prompt=prompt,
                     prompt_len=total_input_len,
                     expected_output_len=int(output_lens[i]),
+                    lora_request=lora_req,
                     request_id=request_id_prefix + str(i),
                 )
             )
@@ -1263,6 +1335,7 @@ class ShareGPTDataset(BenchmarkDataset):
         enable_multimodal_chat: bool = False,
         request_id_prefix: str = "",
         no_oversample: bool = False,
+        lora_assignment: str = "random",
         **kwargs,
     ) -> list:
         samples: list = []
@@ -1275,8 +1348,11 @@ class ShareGPTDataset(BenchmarkDataset):
                 entry["conversations"][1]["value"],
             )
 
-            lora_request = self.get_random_lora_request(
-                max_loras=max_loras, lora_path=lora_path
+            lora_request = self.get_lora_request(
+                index=ind,
+                max_loras=max_loras,
+                lora_path=lora_path,
+                lora_assignment=lora_assignment,
             )
             prompt_ids = tokenizer(prompt).input_ids
             completion_ids = tokenizer(completion).input_ids
@@ -2413,6 +2489,7 @@ class BurstGPTDataset(BenchmarkDataset):
         lora_path: str | None = None,
         request_id_prefix: str = "",
         no_oversample: bool = False,
+        lora_assignment: str = "random",
         **kwargs,
     ) -> list[SampleRequest]:
         samples = []
@@ -2420,8 +2497,11 @@ class BurstGPTDataset(BenchmarkDataset):
         for i in range(num_requests):
             input_len = int(data[i][2])
             output_len = int(data[i][3])
-            lora_req = self.get_random_lora_request(
-                max_loras=max_loras, lora_path=lora_path
+            lora_req = self.get_lora_request(
+                index=i,
+                max_loras=max_loras,
+                lora_path=lora_path,
+                lora_assignment=lora_assignment,
             )
             vocab_size = tokenizer.vocab_size
             # Generate a synthetic prompt: a list of token IDs computed as (i +
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index fca01e17e..53ae6ca6a 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -624,6 +624,7 @@ async def benchmark(
     lora_modules: Iterable[str] | None,
     extra_headers: dict | None,
     extra_body: dict | None,
+    lora_assignment: Literal["random", "round-robin"] = "random",
     ramp_up_strategy: Literal["linear", "exponential"] | None = None,
     ramp_up_start_rps: int | None = None,
     ramp_up_end_rps: int | None = None,
@@ -731,10 +732,20 @@ async def benchmark(
     print("Starting main benchmark run...")
 
     if lora_modules:
-        # For each input request, choose a LoRA module at random.
-        lora_modules = iter(
-            [random.choice(lora_modules) for _ in range(len(input_requests))]
-        )
+        lora_modules_list = list(lora_modules)
+        if lora_assignment == "round-robin":
+            # Deterministic round-robin assignment across requests.
+            lora_modules = iter(
+                [
+                    lora_modules_list[i % len(lora_modules_list)]
+                    for i in range(len(input_requests))
+                ]
+            )
+        else:
+            # For each input request, choose a LoRA module at random.
+            lora_modules = iter(
+                [random.choice(lora_modules_list) for _ in range(len(input_requests))]
+            )
 
     if profile:
         print("Starting profiler...")
@@ -1523,7 +1534,18 @@ def add_cli_args(parser: argparse.ArgumentParser):
         default=None,
         help="A subset of LoRA module names passed in when "
         "launching the server. For each request, the "
-        "script chooses a LoRA module at random.",
+        "script chooses a LoRA module at random by default. "
+        "Use --lora-assignment to control selection strategy.",
+    )
+
+    parser.add_argument(
+        "--lora-assignment",
+        type=str,
+        default="random",
+        choices=["random", "round-robin"],
+        help="Strategy for assigning LoRA modules to requests. "
+        "'random' (default) selects a LoRA at random for each request. "
+        "'round-robin' cycles through LoRA modules deterministically.",
     )
 
     parser.add_argument(
@@ -1788,6 +1810,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
         goodput_config_dict=goodput_config_dict,
         max_concurrency=args.max_concurrency,
         lora_modules=args.lora_modules,
+        lora_assignment=args.lora_assignment,
         extra_headers=headers,
         extra_body=extra_body,
         ramp_up_strategy=args.ramp_up_strategy,
diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py
index ad6f44404..1af8cf900 100644
--- a/vllm/benchmarks/throughput.py
+++ b/vllm/benchmarks/throughput.py
@@ -350,6 +350,7 @@ def get_requests(args, tokenizer):
         "tokenizer": tokenizer,
         "lora_path": args.lora_path,
         "max_loras": args.max_loras,
+        "lora_assignment": getattr(args, "lora_assignment", "random"),
         "num_requests": args.num_prompts,
     }
 
@@ -778,6 +779,15 @@ def add_cli_args(parser: argparse.ArgumentParser):
         help="Path to the lora adapters to use. This can be an absolute path, "
         "a relative path, or a Hugging Face model identifier.",
     )
+    parser.add_argument(
+        "--lora-assignment",
+        type=str,
+        default="random",
+        choices=["random", "round-robin"],
+        help="Strategy for assigning LoRA adapters to requests. "
+        "'random' (default) selects a LoRA at random for each request. "
+        "'round-robin' cycles through LoRAs deterministically.",
+    )
     parser.add_argument(
         "--prefix-len",
         type=int,