Adding deterministic lora benchmarking to vLLM Bench (#36057)

Signed-off-by: Ubuntu <ubuntu@ip-172-31-43-201.ap-northeast-1.compute.internal>
Signed-off-by: Ronald Xu <ronaldxu@amazon.com>
This commit is contained in:
RonaldBXu
2026-03-18 09:02:03 -07:00
committed by GitHub
parent b1169d7be8
commit c9d838fc33
3 changed files with 122 additions and 9 deletions

View File

@@ -183,6 +183,68 @@ class BenchmarkDataset(ABC):
)
return lora_request
def get_round_robin_lora_request(
    self,
    index: int,
    max_loras: int | None = None,
    lora_path: str | None = None,
) -> LoRARequest | None:
    """
    Optionally select a LoRA request using deterministic round-robin.

    This method cycles through LoRA IDs in order based on the request
    index, providing reproducible LoRA assignment: request `index` is
    mapped to ID `index % max_loras + 1`, so IDs fall in
    `[1, max_loras]` and repeat every `max_loras` requests.

    Args:
        index (int): The request index used for round-robin selection.
        max_loras (Optional[int]): The maximum number of LoRAs available.
            If `None`, LoRA is not used.
        lora_path (Optional[str]): Path to the LoRA parameters on disk.
            If `None`, LoRA is not used.

    Returns:
        A new [`LoRARequest`][vllm.lora.request.LoRARequest]
        (or `None` if not applicable).

    Raises:
        ValueError: If LoRA is enabled (both `max_loras` and `lora_path`
            are given) but `max_loras` is less than 1.
    """
    if max_loras is None or lora_path is None:
        return None

    # Reject a non-positive adapter count up front. Otherwise 0 surfaces as
    # an opaque ZeroDivisionError from the modulo below, and a negative
    # value produces IDs outside [1, max_loras].
    if max_loras < 1:
        raise ValueError(
            f"max_loras must be a positive integer, got {max_loras}."
        )

    # Deterministic round-robin: cycle through [1, max_loras]. Python's `%`
    # takes the sign of the (positive) divisor, so the ID stays in range
    # even for a negative index.
    lora_id = index % max_loras + 1
    return LoRARequest(
        lora_name=str(lora_id),
        lora_int_id=lora_id,
        lora_path=lora_path_on_disk(lora_path),
    )
def get_lora_request(
    self,
    index: int,
    max_loras: int | None = None,
    lora_path: str | None = None,
    lora_assignment: str = "random",
) -> LoRARequest | None:
    """
    Pick a LoRA request for one benchmark request.

    Dispatches to `get_round_robin_lora_request` when `lora_assignment`
    is exactly 'round-robin'; any other value (including the default
    'random') is delegated to `get_random_lora_request`.

    Args:
        index (int): The request index; only forwarded to the
            round-robin strategy.
        max_loras (Optional[int]): The maximum number of LoRAs available.
        lora_path (Optional[str]): Path to the LoRA parameters on disk.
        lora_assignment (str): Strategy for LoRA selection.
            'random' (default) or 'round-robin'.

    Returns:
        Whatever the selected strategy method returns: a
        [`LoRARequest`][vllm.lora.request.LoRARequest] or `None`.
    """
    # Everything that is not explicitly round-robin uses the random strategy.
    if lora_assignment != "round-robin":
        return self.get_random_lora_request(
            max_loras=max_loras,
            lora_path=lora_path,
        )

    return self.get_round_robin_lora_request(
        index=index,
        max_loras=max_loras,
        lora_path=lora_path,
    )
@abstractmethod
def sample(
self,
@@ -478,6 +540,9 @@ class RandomDataset(BenchmarkDataset):
input_len: int = DEFAULT_INPUT_LEN,
output_len: int = DEFAULT_OUTPUT_LEN,
batchsize: int = 1,
max_loras: int | None = None,
lora_path: str | None = None,
lora_assignment: str = "random",
**kwargs,
) -> list[SampleRequest]:
# validate total input tokens (prefix + sampled) is at least 1.
@@ -522,11 +587,18 @@ class RandomDataset(BenchmarkDataset):
allowed_tokens=allowed_tokens,
)
token_mismatch_total += token_mismatch
lora_req = self.get_lora_request(
index=i,
max_loras=max_loras,
lora_path=lora_path,
lora_assignment=lora_assignment,
)
requests.append(
SampleRequest(
prompt=prompt,
prompt_len=total_input_len,
expected_output_len=int(output_lens[i]),
lora_request=lora_req,
request_id=request_id_prefix + str(i),
)
)
@@ -1263,6 +1335,7 @@ class ShareGPTDataset(BenchmarkDataset):
enable_multimodal_chat: bool = False,
request_id_prefix: str = "",
no_oversample: bool = False,
lora_assignment: str = "random",
**kwargs,
) -> list:
samples: list = []
@@ -1275,8 +1348,11 @@ class ShareGPTDataset(BenchmarkDataset):
entry["conversations"][1]["value"],
)
lora_request = self.get_random_lora_request(
max_loras=max_loras, lora_path=lora_path
lora_request = self.get_lora_request(
index=ind,
max_loras=max_loras,
lora_path=lora_path,
lora_assignment=lora_assignment,
)
prompt_ids = tokenizer(prompt).input_ids
completion_ids = tokenizer(completion).input_ids
@@ -2413,6 +2489,7 @@ class BurstGPTDataset(BenchmarkDataset):
lora_path: str | None = None,
request_id_prefix: str = "",
no_oversample: bool = False,
lora_assignment: str = "random",
**kwargs,
) -> list[SampleRequest]:
samples = []
@@ -2420,8 +2497,11 @@ class BurstGPTDataset(BenchmarkDataset):
for i in range(num_requests):
input_len = int(data[i][2])
output_len = int(data[i][3])
lora_req = self.get_random_lora_request(
max_loras=max_loras, lora_path=lora_path
lora_req = self.get_lora_request(
index=i,
max_loras=max_loras,
lora_path=lora_path,
lora_assignment=lora_assignment,
)
vocab_size = tokenizer.vocab_size
# Generate a synthetic prompt: a list of token IDs computed as (i +

View File

@@ -624,6 +624,7 @@ async def benchmark(
lora_modules: Iterable[str] | None,
extra_headers: dict | None,
extra_body: dict | None,
lora_assignment: Literal["random", "round-robin"] = "random",
ramp_up_strategy: Literal["linear", "exponential"] | None = None,
ramp_up_start_rps: int | None = None,
ramp_up_end_rps: int | None = None,
@@ -731,10 +732,20 @@ async def benchmark(
print("Starting main benchmark run...")
if lora_modules:
# For each input request, choose a LoRA module at random.
lora_modules = iter(
[random.choice(lora_modules) for _ in range(len(input_requests))]
)
lora_modules_list = list(lora_modules)
if lora_assignment == "round-robin":
# Deterministic round-robin assignment across requests.
lora_modules = iter(
[
lora_modules_list[i % len(lora_modules_list)]
for i in range(len(input_requests))
]
)
else:
# For each input request, choose a LoRA module at random.
lora_modules = iter(
[random.choice(lora_modules_list) for _ in range(len(input_requests))]
)
if profile:
print("Starting profiler...")
@@ -1523,7 +1534,18 @@ def add_cli_args(parser: argparse.ArgumentParser):
default=None,
help="A subset of LoRA module names passed in when "
"launching the server. For each request, the "
"script chooses a LoRA module at random.",
"script chooses a LoRA module at random by default. "
"Use --lora-assignment to control selection strategy.",
)
parser.add_argument(
"--lora-assignment",
type=str,
default="random",
choices=["random", "round-robin"],
help="Strategy for assigning LoRA modules to requests. "
"'random' (default) selects a LoRA at random for each request. "
"'round-robin' cycles through LoRA modules deterministically.",
)
parser.add_argument(
@@ -1788,6 +1810,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
goodput_config_dict=goodput_config_dict,
max_concurrency=args.max_concurrency,
lora_modules=args.lora_modules,
lora_assignment=args.lora_assignment,
extra_headers=headers,
extra_body=extra_body,
ramp_up_strategy=args.ramp_up_strategy,

View File

@@ -350,6 +350,7 @@ def get_requests(args, tokenizer):
"tokenizer": tokenizer,
"lora_path": args.lora_path,
"max_loras": args.max_loras,
"lora_assignment": getattr(args, "lora_assignment", "random"),
"num_requests": args.num_prompts,
}
@@ -778,6 +779,15 @@ def add_cli_args(parser: argparse.ArgumentParser):
help="Path to the lora adapters to use. This can be an absolute path, "
"a relative path, or a Hugging Face model identifier.",
)
parser.add_argument(
"--lora-assignment",
type=str,
default="random",
choices=["random", "round-robin"],
help="Strategy for assigning LoRA adapters to requests. "
"'random' (default) selects a LoRA at random for each request. "
"'round-robin' cycles through LoRAs deterministically.",
)
parser.add_argument(
"--prefix-len",
type=int,