Adding deterministic lora benchmarking to vLLM Bench (#36057)
Signed-off-by: Ubuntu <ubuntu@ip-172-31-43-201.ap-northeast-1.compute.internal> Signed-off-by: Ronald Xu <ronaldxu@amazon.com>
This commit is contained in:
@@ -183,6 +183,68 @@ class BenchmarkDataset(ABC):
|
||||
)
|
||||
return lora_request
|
||||
|
||||
def get_round_robin_lora_request(
    self,
    index: int,
    max_loras: int | None = None,
    lora_path: str | None = None,
) -> LoRARequest | None:
    """
    Optionally select a LoRA request using deterministic round-robin.

    This method cycles through LoRA IDs in order based on the request
    index, providing reproducible LoRA assignment.

    Args:
        index (int): The request index used for round-robin selection.
        max_loras (Optional[int]): The maximum number of LoRAs available.
            If `None`, LoRA is not used. Otherwise it must be at least 1.
        lora_path (Optional[str]): Path to the LoRA parameters on disk.
            If `None`, LoRA is not used.

    Returns:
        A new [`LoRARequest`][vllm.lora.request.LoRARequest]
        (or `None` if not applicable).

    Raises:
        ValueError: If `max_loras` is provided but is smaller than 1.
    """
    if max_loras is None or lora_path is None:
        return None

    # Fail with a clear message: 0 would hit ZeroDivisionError in the modulo
    # below, and a negative value would produce IDs outside [1, max_loras].
    if max_loras < 1:
        raise ValueError(f"max_loras must be a positive integer, got {max_loras}.")

    # Deterministic round-robin: cycle through [1, max_loras]. Python's `%`
    # is non-negative for a positive modulus, so any int index stays in range.
    lora_id = index % max_loras + 1
    return LoRARequest(
        lora_name=str(lora_id),
        lora_int_id=lora_id,
        lora_path=lora_path_on_disk(lora_path),
    )
|
||||
|
||||
def get_lora_request(
    self,
    index: int,
    max_loras: int | None = None,
    lora_path: str | None = None,
    lora_assignment: str = "random",
) -> LoRARequest | None:
    """
    Build the LoRA request for one benchmark request.

    Dispatches to one of the two selection helpers on this object
    depending on `lora_assignment`.

    Args:
        index (int): The request index; only forwarded to the
            round-robin helper.
        max_loras (Optional[int]): The maximum number of LoRAs available.
        lora_path (Optional[str]): Path to the LoRA parameters on disk.
        lora_assignment (str): Selection strategy. 'round-robin' uses
            `get_round_robin_lora_request`; any other value, including
            the default 'random', uses `get_random_lora_request`.

    Returns:
        Whatever the chosen helper returns: a
        [`LoRARequest`][vllm.lora.request.LoRARequest] or `None`.
    """
    use_round_robin = lora_assignment == "round-robin"
    if not use_round_robin:
        # Every strategy value other than 'round-robin' takes the random path.
        return self.get_random_lora_request(
            max_loras=max_loras,
            lora_path=lora_path,
        )

    return self.get_round_robin_lora_request(
        index=index,
        max_loras=max_loras,
        lora_path=lora_path,
    )
|
||||
|
||||
@abstractmethod
|
||||
def sample(
|
||||
self,
|
||||
@@ -478,6 +540,9 @@ class RandomDataset(BenchmarkDataset):
|
||||
input_len: int = DEFAULT_INPUT_LEN,
|
||||
output_len: int = DEFAULT_OUTPUT_LEN,
|
||||
batchsize: int = 1,
|
||||
max_loras: int | None = None,
|
||||
lora_path: str | None = None,
|
||||
lora_assignment: str = "random",
|
||||
**kwargs,
|
||||
) -> list[SampleRequest]:
|
||||
# validate total input tokens (prefix + sampled) is at least 1.
|
||||
@@ -522,11 +587,18 @@ class RandomDataset(BenchmarkDataset):
|
||||
allowed_tokens=allowed_tokens,
|
||||
)
|
||||
token_mismatch_total += token_mismatch
|
||||
lora_req = self.get_lora_request(
|
||||
index=i,
|
||||
max_loras=max_loras,
|
||||
lora_path=lora_path,
|
||||
lora_assignment=lora_assignment,
|
||||
)
|
||||
requests.append(
|
||||
SampleRequest(
|
||||
prompt=prompt,
|
||||
prompt_len=total_input_len,
|
||||
expected_output_len=int(output_lens[i]),
|
||||
lora_request=lora_req,
|
||||
request_id=request_id_prefix + str(i),
|
||||
)
|
||||
)
|
||||
@@ -1263,6 +1335,7 @@ class ShareGPTDataset(BenchmarkDataset):
|
||||
enable_multimodal_chat: bool = False,
|
||||
request_id_prefix: str = "",
|
||||
no_oversample: bool = False,
|
||||
lora_assignment: str = "random",
|
||||
**kwargs,
|
||||
) -> list:
|
||||
samples: list = []
|
||||
@@ -1275,8 +1348,11 @@ class ShareGPTDataset(BenchmarkDataset):
|
||||
entry["conversations"][1]["value"],
|
||||
)
|
||||
|
||||
lora_request = self.get_random_lora_request(
|
||||
max_loras=max_loras, lora_path=lora_path
|
||||
lora_request = self.get_lora_request(
|
||||
index=ind,
|
||||
max_loras=max_loras,
|
||||
lora_path=lora_path,
|
||||
lora_assignment=lora_assignment,
|
||||
)
|
||||
prompt_ids = tokenizer(prompt).input_ids
|
||||
completion_ids = tokenizer(completion).input_ids
|
||||
@@ -2413,6 +2489,7 @@ class BurstGPTDataset(BenchmarkDataset):
|
||||
lora_path: str | None = None,
|
||||
request_id_prefix: str = "",
|
||||
no_oversample: bool = False,
|
||||
lora_assignment: str = "random",
|
||||
**kwargs,
|
||||
) -> list[SampleRequest]:
|
||||
samples = []
|
||||
@@ -2420,8 +2497,11 @@ class BurstGPTDataset(BenchmarkDataset):
|
||||
for i in range(num_requests):
|
||||
input_len = int(data[i][2])
|
||||
output_len = int(data[i][3])
|
||||
lora_req = self.get_random_lora_request(
|
||||
max_loras=max_loras, lora_path=lora_path
|
||||
lora_req = self.get_lora_request(
|
||||
index=i,
|
||||
max_loras=max_loras,
|
||||
lora_path=lora_path,
|
||||
lora_assignment=lora_assignment,
|
||||
)
|
||||
vocab_size = tokenizer.vocab_size
|
||||
# Generate a synthetic prompt: a list of token IDs computed as (i +
|
||||
|
||||
@@ -624,6 +624,7 @@ async def benchmark(
|
||||
lora_modules: Iterable[str] | None,
|
||||
extra_headers: dict | None,
|
||||
extra_body: dict | None,
|
||||
lora_assignment: Literal["random", "round-robin"] = "random",
|
||||
ramp_up_strategy: Literal["linear", "exponential"] | None = None,
|
||||
ramp_up_start_rps: int | None = None,
|
||||
ramp_up_end_rps: int | None = None,
|
||||
@@ -731,10 +732,20 @@ async def benchmark(
|
||||
print("Starting main benchmark run...")
|
||||
|
||||
if lora_modules:
|
||||
# For each input request, choose a LoRA module at random.
|
||||
lora_modules = iter(
|
||||
[random.choice(lora_modules) for _ in range(len(input_requests))]
|
||||
)
|
||||
lora_modules_list = list(lora_modules)
|
||||
if lora_assignment == "round-robin":
|
||||
# Deterministic round-robin assignment across requests.
|
||||
lora_modules = iter(
|
||||
[
|
||||
lora_modules_list[i % len(lora_modules_list)]
|
||||
for i in range(len(input_requests))
|
||||
]
|
||||
)
|
||||
else:
|
||||
# For each input request, choose a LoRA module at random.
|
||||
lora_modules = iter(
|
||||
[random.choice(lora_modules_list) for _ in range(len(input_requests))]
|
||||
)
|
||||
|
||||
if profile:
|
||||
print("Starting profiler...")
|
||||
@@ -1523,7 +1534,18 @@ def add_cli_args(parser: argparse.ArgumentParser):
|
||||
default=None,
|
||||
help="A subset of LoRA module names passed in when "
|
||||
"launching the server. For each request, the "
|
||||
"script chooses a LoRA module at random.",
|
||||
"script chooses a LoRA module at random by default. "
|
||||
"Use --lora-assignment to control selection strategy.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--lora-assignment",
|
||||
type=str,
|
||||
default="random",
|
||||
choices=["random", "round-robin"],
|
||||
help="Strategy for assigning LoRA modules to requests. "
|
||||
"'random' (default) selects a LoRA at random for each request. "
|
||||
"'round-robin' cycles through LoRA modules deterministically.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
@@ -1788,6 +1810,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
|
||||
goodput_config_dict=goodput_config_dict,
|
||||
max_concurrency=args.max_concurrency,
|
||||
lora_modules=args.lora_modules,
|
||||
lora_assignment=args.lora_assignment,
|
||||
extra_headers=headers,
|
||||
extra_body=extra_body,
|
||||
ramp_up_strategy=args.ramp_up_strategy,
|
||||
|
||||
@@ -350,6 +350,7 @@ def get_requests(args, tokenizer):
|
||||
"tokenizer": tokenizer,
|
||||
"lora_path": args.lora_path,
|
||||
"max_loras": args.max_loras,
|
||||
"lora_assignment": getattr(args, "lora_assignment", "random"),
|
||||
"num_requests": args.num_prompts,
|
||||
}
|
||||
|
||||
@@ -778,6 +779,15 @@ def add_cli_args(parser: argparse.ArgumentParser):
|
||||
help="Path to the lora adapters to use. This can be an absolute path, "
|
||||
"a relative path, or a Hugging Face model identifier.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--lora-assignment",
|
||||
type=str,
|
||||
default="random",
|
||||
choices=["random", "round-robin"],
|
||||
help="Strategy for assigning LoRA adapters to requests. "
|
||||
"'random' (default) selects a LoRA at random for each request. "
|
||||
"'round-robin' cycles through LoRAs deterministically.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--prefix-len",
|
||||
type=int,
|
||||
|
||||
Reference in New Issue
Block a user