[Kernels] LoRA - Retire SGMV and BGMV Kernels (#14685)

Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Author: Varun Sundar Rabindranath
Date: 2025-03-18 05:47:53 -04:00
Committed by: simon-mo
Parent: 16e9064f84
Commit: 9e8f089d08
15 changed files with 245 additions and 2092 deletions

benchmarks/kernels/benchmark_lora.py

@@ -17,13 +17,8 @@ from torch.utils.benchmark import Measurement as TMeasurement
 from utils import ArgPool, Bench, CudaGraphBenchParams
 from weight_shapes import WEIGHT_SHAPES

-from vllm.lora.ops.triton_ops.bgmv_expand import bgmv_expand
-from vllm.lora.ops.triton_ops.bgmv_expand_slice import bgmv_expand_slice
-from vllm.lora.ops.triton_ops.bgmv_shrink import bgmv_shrink
-from vllm.lora.ops.triton_ops.sgmv_expand import sgmv_expand
-from vllm.lora.ops.triton_ops.sgmv_shrink import sgmv_shrink
+from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
 from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
-from vllm.lora.ops.triton_ops.v1 import V1KernelMeta, v1_expand, v1_shrink
 from vllm.utils import FlexibleArgumentParser

 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
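
For scripts layered on top of these kernels, everything now comes from the single vllm.lora.ops.triton_ops namespace shown above. A minimal compatibility probe, illustrative only (the flag name below is hypothetical and not part of this commit):

try:
    from vllm.lora.ops.triton_ops import (  # noqa: F401
        LoRAKernelMeta, lora_expand, lora_shrink)
    HAS_CONSOLIDATED_LORA_KERNELS = True
except ImportError:
    # Older vLLM builds expose only the per-kernel sgmv/bgmv/v1 modules.
    HAS_CONSOLIDATED_LORA_KERNELS = False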
@@ -167,69 +162,25 @@ class OpType(Enum):
"""
LoRA Ops to benchmark and its properties.
"""
SGMV_SHRINK = auto()
BGMV_SHRINK = auto()
SGMV_EXPAND = auto()
BGMV_EXPAND = auto()
BGMV_EXPAND_SLICE = auto()
V1_SHRINK = auto()
V1_EXPAND = auto()
LORA_SHRINK = auto()
LORA_EXPAND = auto()
@staticmethod
def from_str(s: str) -> "OpType":
if s.lower() == 'sgmv_shrink':
return OpType.SGMV_SHRINK
if s.lower() == 'sgmv_expand':
return OpType.SGMV_EXPAND
if s.lower() == 'bgmv_shrink':
return OpType.BGMV_SHRINK
if s.lower() == 'bgmv_expand':
return OpType.BGMV_EXPAND
if s.lower() == "bgmv_expand_slice":
return OpType.BGMV_EXPAND_SLICE
if s.lower() == "v1_shrink":
return OpType.V1_SHRINK
if s.lower() == "v1_expand":
return OpType.V1_EXPAND
if s.lower() == "lora_shrink":
return OpType.LORA_SHRINK
if s.lower() == "lora_expand":
return OpType.LORA_EXPAND
raise ValueError(f"Unrecognized str {s} to convert to OpType")
def is_shrink_fn(self) -> bool:
return self in [
OpType.SGMV_SHRINK, OpType.BGMV_SHRINK, OpType.V1_SHRINK
]
return self in [OpType.LORA_SHRINK]
def is_expand_fn(self) -> bool:
return self in [
OpType.SGMV_EXPAND, OpType.BGMV_EXPAND, OpType.V1_EXPAND
]
def is_prefill_op(self) -> bool:
return self in [
OpType.SGMV_SHRINK, OpType.SGMV_EXPAND, OpType.V1_SHRINK,
OpType.V1_EXPAND
]
def is_decode_op(self) -> bool:
return self in [
OpType.BGMV_SHRINK, OpType.BGMV_EXPAND, OpType.BGMV_EXPAND_SLICE,
OpType.V1_SHRINK, OpType.V1_EXPAND
]
def is_expand_slice_fn(self) -> bool:
return self in [OpType.BGMV_EXPAND_SLICE]
return self in [OpType.LORA_EXPAND]
def num_slices(self) -> list[int]:
if self in [
OpType.SGMV_EXPAND, OpType.SGMV_SHRINK, OpType.V1_SHRINK,
OpType.V1_EXPAND
]:
# SGMV kernels and v1 kernels supports slices
return [1, 2, 3]
if self in [OpType.BGMV_SHRINK, OpType.BGMV_EXPAND]:
return [1]
if self in [OpType.BGMV_EXPAND_SLICE]:
return [2, 3]
raise ValueError(f"Unrecognized OpType {self}")
return [1, 2, 3]
def mkn(self, batch_size: int, seq_length: int, hidden_size: int,
lora_rank: int) -> tuple[int, int, int]:
@@ -239,7 +190,7 @@ class OpType(Enum):
             k = hidden_size
             n = lora_rank
         else:
-            assert self.is_expand_fn() or self.is_expand_slice_fn()
+            assert self.is_expand_fn()
             m = num_tokens
             k = lora_rank
             n = hidden_size
@@ -254,7 +205,7 @@ class OpType(Enum):
         if self.is_shrink_fn():
             return op_dtype, op_dtype, torch.float32
         else:
-            assert self.is_expand_fn() or self.is_expand_slice_fn()
+            assert self.is_expand_fn()
             return torch.float32, op_dtype, op_dtype

     def matmul_shapes(
@@ -268,43 +219,19 @@ class OpType(Enum):
         m, k, n = self.mkn(batch_size, seq_length, hidden_size, lora_rank)
         b_shape = (num_loras, n, k)  # col-major
-        if self in [OpType.SGMV_SHRINK, OpType.V1_SHRINK]:
-            # SGMV shrink and V1 shrink kernels support num_slices inherently
-            # in the kernel.
+        if self in [OpType.LORA_SHRINK]:
+            # LoRA shrink kernels support num_slices inherently in the kernel.
             return ((m, k), b_shape, (num_slices, m, n))
-        if self in [OpType.SGMV_EXPAND, OpType.V1_EXPAND]:
-            # SGMV expand and V1 expand kernels support num_slices inherently
-            # in the kernel
+        if self in [OpType.LORA_EXPAND]:
+            # LoRA expand kernels support num_slices inherently in the kernel
             return ((num_slices, m, k), b_shape, (m, n * num_slices))
-        if self == OpType.BGMV_SHRINK:
-            return ((m, k), b_shape, (m, n))
-        if self == OpType.BGMV_EXPAND:
-            return ((m, k), b_shape, (m, n))
-        if self == OpType.BGMV_EXPAND_SLICE:
-            return ((num_slices, m, k), b_shape, (m, n * num_slices))
         raise ValueError(f"Unrecognized op_type {self}")

     def bench_fn(self) -> Callable:
-        def emulate_bgmv_expand_slice(kwargs_list: list[dict[str, Any]]):
-            for x in kwargs_list:
-                bgmv_expand_slice(**x)
-        if self == OpType.SGMV_SHRINK:
-            return sgmv_shrink
-        if self == OpType.SGMV_EXPAND:
-            return sgmv_expand
-        if self == OpType.BGMV_SHRINK:
-            return bgmv_shrink
-        if self == OpType.BGMV_EXPAND:
-            return bgmv_expand
-        if self == OpType.BGMV_EXPAND_SLICE:
-            return emulate_bgmv_expand_slice
-        if self == OpType.V1_SHRINK:
-            return v1_shrink
-        if self == OpType.V1_EXPAND:
-            return v1_expand
+        if self == OpType.LORA_SHRINK:
+            return lora_shrink
+        if self == OpType.LORA_EXPAND:
+            return lora_expand
         raise ValueError(f"Unrecognized optype {self}")
@@ -318,34 +245,13 @@ class OpType(Enum):
"""
w_dtype = lora_weights[0].dtype
num_slices = len(lora_weights)
if self in [OpType.SGMV_SHRINK, OpType.V1_SHRINK]:
if self in [OpType.LORA_SHRINK]:
for slice_idx in range(num_slices):
ref_group_gemm(ref_out=output[slice_idx, :],
input=input,
lora_weights=lora_weights[slice_idx],
**kwargs)
elif self in [OpType.SGMV_EXPAND, OpType.V1_EXPAND]:
hidden_size = lora_weights[0].shape[1]
for slice_idx in range(num_slices):
slice_offset = slice_idx * hidden_size
ref_group_gemm(
ref_out=output[:, slice_offset:slice_offset + hidden_size],
input=input[slice_idx].clone().to(dtype=w_dtype),
lora_weights=lora_weights[slice_idx],
**kwargs)
elif self == OpType.BGMV_SHRINK:
assert num_slices == 1
ref_group_gemm(ref_out=output,
input=input,
lora_weights=lora_weights[0],
**kwargs)
elif self == OpType.BGMV_EXPAND:
assert num_slices == 1
ref_group_gemm(ref_out=output,
input=input.clone().to(dtype=w_dtype),
lora_weights=lora_weights[0],
**kwargs)
elif self == OpType.BGMV_EXPAND_SLICE:
elif self in [OpType.LORA_EXPAND]:
hidden_size = lora_weights[0].shape[1]
for slice_idx in range(num_slices):
slice_offset = slice_idx * hidden_size
@@ -411,13 +317,11 @@ class BenchmarkTensors:
     input: torch.Tensor
     lora_weights_lst: list[torch.Tensor]
     output: torch.Tensor
-    # metadata tensors
+    # LoRA kernel metadata
+    lora_kernel_meta: LoRAKernelMeta
+    # Metadata tensors used in testing correctness
     seq_lens: torch.Tensor
-    seq_start_loc: torch.Tensor
     prompt_lora_mapping: torch.Tensor
-    token_lora_mapping: torch.Tensor
-    # v1 kernel metadata
-    v1_kernel_meta: Optional[V1KernelMeta] = None

     def io_types(self) -> str:
         return (f"{dtype_to_str(self.input.dtype)}x"
@@ -444,35 +348,29 @@ class BenchmarkTensors:
         assert ctx.num_active_loras <= ctx.num_loras
         total_tokens = ctx.batch_size * ctx.seq_length

         # Make metadata tensors involved in correctness testing.
         # Prepare seq lens tensor
         seq_len_tensor = torch.randint(ctx.seq_length, ctx.seq_length + 1,
                                        (ctx.batch_size, ))
-        # Prepare seq_start_loc tensor
-        seq_start_loc_tensor = torch.cumsum(torch.tensor(
-            [0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
-                                            dim=0)
         assert total_tokens == seq_len_tensor.sum()
         # Prepare prompt lora indices tensor
         prompt_lora_indices_tensor = make_prompt_lora_mapping(
             ctx.batch_size, ctx.num_active_loras, ctx.sort_by_lora_id, "cpu")

-        # Prepare token lora indices tensor
+        # Make LoRAKernelMeta
         token_lora_indices_tensor = make_token_lora_mapping(
             total_tokens, ctx.batch_size, prompt_lora_indices_tensor,
             seq_len_tensor, "cpu")
-        v1_kernel_meta = None
-        if op_type in [OpType.V1_SHRINK, OpType.V1_EXPAND]:
-            v1_kernel_meta = V1KernelMeta.make(
-                max_loras=ctx.num_loras,
-                max_num_tokens=token_lora_indices_tensor.size(0),
-                device="cpu")
-            v1_kernel_meta.prepare_tensors(
-                token_lora_mapping=token_lora_indices_tensor)
+        lora_kernel_meta = LoRAKernelMeta.make(
+            max_loras=ctx.num_loras,
+            max_num_tokens=token_lora_indices_tensor.size(0),
+            device="cpu")
+        lora_kernel_meta.prepare_tensors(
+            token_lora_mapping=token_lora_indices_tensor)

         return BenchmarkTensors(input_tensor, lora_weights, output_tensor,
-                                seq_len_tensor, seq_start_loc_tensor,
-                                prompt_lora_indices_tensor,
-                                token_lora_indices_tensor, v1_kernel_meta)
+                                lora_kernel_meta, seq_len_tensor,
+                                prompt_lora_indices_tensor)

     def sanity_check(self) -> None:
         """
@@ -482,9 +380,9 @@ class BenchmarkTensors:
         # check metadata tensors
         assert torch.sum(self.seq_lens) == num_tokens
         num_seqs = self.seq_lens.shape[0]
-        assert self.seq_start_loc.shape[0] == num_seqs
+        #assert self.seq_start_loc.shape[0] == num_seqs
         assert self.prompt_lora_mapping.shape[0] == num_seqs
-        assert self.token_lora_mapping.shape[0] == num_tokens
+        assert self.lora_kernel_meta.token_lora_mapping.shape[0] == num_tokens

     def to_device(self, device: str):
         """
@@ -499,220 +397,27 @@ class BenchmarkTensors:
         self.input = to_device(self.input)
         self.output = to_device(self.output)
         self.seq_lens = to_device(self.seq_lens)
-        self.seq_start_loc = to_device(self.seq_start_loc)
         self.prompt_lora_mapping = to_device(self.prompt_lora_mapping)
-        self.token_lora_mapping = to_device(self.token_lora_mapping)
         for i in range(len(self.lora_weights_lst)):
             self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i])

-        # v1 meta
-        if self.v1_kernel_meta:
-            for field_name in V1KernelMeta.__dataclass_fields__:
-                field = getattr(self.v1_kernel_meta, field_name)
-                assert isinstance(field, torch.Tensor)
-                setattr(self.v1_kernel_meta, field_name, to_device(field))
+        # LoRA meta
+        for field_name in LoRAKernelMeta.__dataclass_fields__:
+            field = getattr(self.lora_kernel_meta, field_name)
+            assert isinstance(field, torch.Tensor)
+            setattr(self.lora_kernel_meta, field_name, to_device(field))

     def metadata(self) -> tuple[int, int, int]:
         """
         Return num_seqs, num_tokens and max_seq_len
         """
         num_seqs = self.seq_lens.shape[0]
-        num_tokens = self.token_lora_mapping.shape[0]
+        num_tokens = self.lora_kernel_meta.token_lora_mapping.shape[0]
         max_seq_len = torch.max(self.seq_lens).item()
         num_slices = len(self.lora_weights_lst)
         return num_seqs, num_tokens, max_seq_len, num_slices

-    def convert_to_sgmv_benchmark_tensors(self):
-        """
-        For sgmv punica kernels, when consecutive sequences have the
-        same LoRA ID, we just merge them together.
-        This happens in punica.py::compute_metadata
-        """
-        # Collapse seq_lens and seq_start_loc
-        _, seq_lens = torch.unique_consecutive(self.token_lora_mapping,
-                                               return_counts=True)
-        cum_result = torch.cumsum(seq_lens, dim=0)
-        seq_start_loc = torch.zeros_like(seq_lens)
-        seq_start_loc[1:].copy_(cum_result[:-1])
-        # Collapse prompt mapping
-        prompt_lora_mapping = torch.unique_consecutive(
-            self.prompt_lora_mapping)
-        assert torch.sum(seq_lens) == torch.sum(self.seq_lens), \
-            f"dont match - new {torch.sum(seq_lens)} vs {torch.sum(self.seq_lens)}"
-        self.prompt_lora_mapping = prompt_lora_mapping.to(
-            dtype=self.prompt_lora_mapping.dtype)
-        self.seq_lens = seq_lens.to(dtype=self.seq_lens.dtype)
-        self.seq_start_loc = seq_start_loc.to(dtype=self.seq_start_loc.dtype)
-    def as_sgmv_shrink_kwargs(self) -> dict[str, Any]:
-        self.convert_to_sgmv_benchmark_tensors()
-        self.sanity_check()
-        self.to_device(self.input.device)
-        num_seqs, num_tokens, max_seq_len, num_slices = self.metadata()
-        # Sanity check matrix shapes.
-        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
-            0].shape, self.output.shape
-        # Expected input shape [num_tokens, hidden_size]
-        assert len(i_shape) == 2
-        assert i_shape[0] == num_tokens
-        hidden_size = i_shape[1]
-        # Expected lora weight shape [num_loras, lora_rank, hidden_size]
-        assert len(lw_shape) == 3
-        assert lw_shape[2] == hidden_size
-        lora_rank = lw_shape[1]
-        # Expected output shape [num_slices, num_tokens, lora_rank]
-        assert len(o_shape) == 3
-        assert o_shape == (num_slices, num_tokens, lora_rank)
-        return {
-            'inputs': self.input,
-            'lora_a_weights': self.lora_weights_lst,
-            'output_tensor': self.output,
-            'b_seq_start_loc': self.seq_start_loc,
-            'seq_len_tensor': self.seq_lens,
-            'lora_indices_tensor': self.prompt_lora_mapping,
-            'batches': num_seqs,
-            'max_seq_length': max_seq_len,
-            'token_nums': num_tokens,
-            'scaling': 1.0,
-        }
-    def as_sgmv_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
-        self.convert_to_sgmv_benchmark_tensors()
-        self.sanity_check()
-        self.to_device(self.input.device)
-        num_seqs, num_tokens, max_seq_len, num_slices = self.metadata()
-        # Sanity check matrix shapes.
-        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
-            0].shape, self.output.shape
-        # Expected input shape : [num_slices, num_tokens, lora_rank]
-        assert len(i_shape) == 3
-        assert i_shape[0] == num_slices
-        assert i_shape[1] == num_tokens
-        lora_rank = i_shape[2]
-        # Expected lora weight shape : [num_lora, hidden_size, lora_rank]
-        assert len(lw_shape) == 3
-        assert lw_shape[2] == lora_rank
-        hidden_size = lw_shape[1]
-        # Expected output shape : [num_tokens, hidden_size * num_slices]
-        assert len(o_shape) == 2
-        assert o_shape == (num_tokens, hidden_size * num_slices)
-        return {
-            'inputs': self.input,
-            'lora_b_weights': self.lora_weights_lst,
-            'output_tensor': self.output,
-            'b_seq_start_loc': self.seq_start_loc,
-            'seq_len_tensor': self.seq_lens,
-            'lora_indices_tensor': self.prompt_lora_mapping,
-            'batches': num_seqs,
-            'max_seq_length': max_seq_len,
-            'token_nums': num_tokens,
-            'offset_start': 0,
-            'add_inputs': add_inputs,
-        }
-    def as_bgmv_shrink_kwargs(self) -> dict[str, Any]:
-        assert len(self.lora_weights_lst) == 1
-        self.to_device(self.input.device)
-        _, num_tokens, _, _ = self.metadata()
-        # Sanity check shapes
-        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
-            0].shape, self.output.shape
-        # Expected input shape [num_tokens, hidden_size]
-        assert len(i_shape) == 2
-        assert i_shape[0] == num_tokens
-        hidden_size = i_shape[1]
-        # Expected lora weight shape [num_loras, lora_rank, hidden_size]
-        assert len(lw_shape) == 3
-        assert lw_shape[2] == hidden_size
-        lora_rank = lw_shape[1]
-        # Expected output shape [num_tokens, lora_rank]
-        assert len(o_shape) == 2
-        assert o_shape == (num_tokens, lora_rank)
-        return {
-            'inputs': self.input,
-            'lora_a_weights': self.lora_weights_lst[0],
-            'output_tensor': self.output,
-            'lora_indices_tensor': self.token_lora_mapping,
-            'scaling': 1.0
-        }
-    def as_bgmv_expand_kwargs(self, add_inputs: bool):
-        assert len(self.lora_weights_lst) == 1
-        self.to_device(self.input.device)
-        _, num_tokens, _, _ = self.metadata()
-        # Sanity check shapes
-        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
-            0].shape, self.output.shape
-        # Expected input shape [num_tokens, lora_rank]
-        assert len(i_shape) == 2
-        assert i_shape[0] == num_tokens
-        lora_rank = i_shape[1]
-        # Expected lora weight shape [num_loras, hidden_size, lora_rank]
-        assert len(lw_shape) == 3
-        assert lw_shape[2] == lora_rank
-        hidden_size = lw_shape[1]
-        # Expected output shape [num_tokens, hidden_size]
-        assert len(o_shape) == 2
-        assert o_shape == (num_tokens, hidden_size)
-        return {
-            'inputs': self.input,
-            'lora_b_weights': self.lora_weights_lst[0],
-            'output_tensor': self.output,
-            'lora_indices_tensor': self.token_lora_mapping,
-            'add_inputs': add_inputs
-        }
-    def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> dict[str, Any]:
-        _, num_tokens, _, num_slices = self.metadata()
-        # Sanity check shapes
-        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
-            0].shape, self.output.shape
-        # Expected input shape [num_slices, num_tokens, lora_rank]
-        assert len(i_shape) == 3
-        assert i_shape[0] == num_slices
-        assert i_shape[1] == num_tokens
-        lora_rank = i_shape[2]
-        # Expected lora weight shape [num_loras, hidden_size, lora_rank]
-        assert len(lw_shape) == 3
-        assert lw_shape[2] == lora_rank
-        hidden_size = lw_shape[1]
-        # Expected output shape [num_tokens, hidden_size * num_slices]
-        assert len(o_shape) == 2
-        assert o_shape == (num_tokens, hidden_size * num_slices)
-        self.to_device(self.input.device)
-        kwargs_list = []
-        for i in range(num_slices):
-            kwargs_list.append({
-                'inputs': self.input[i],
-                'lora_b_weights': self.lora_weights_lst[i],
-                'output_tensor': self.output,
-                'lora_indices_tensor': self.token_lora_mapping,
-                'slice_offset': i * hidden_size,
-                'slice_size': hidden_size,
-                'add_inputs': add_inputs,
-            })
-        return {'kwargs_list': kwargs_list}
-    def as_v1_shrink_kwargs(self) -> dict[str, Any]:
-        assert self.v1_kernel_meta is not None
+    def as_lora_shrink_kwargs(self) -> dict[str, Any]:
         self.sanity_check()
         self.to_device(self.input.device)
@@ -737,17 +442,16 @@ class BenchmarkTensors:
             'inputs': self.input,
             'lora_a_weights': self.lora_weights_lst,
             'output_tensor': self.output,
-            'token_lora_mapping': self.v1_kernel_meta.token_lora_mapping,
+            'token_lora_mapping': self.lora_kernel_meta.token_lora_mapping,
             'token_indices_sorted_by_lora_ids':
-            self.v1_kernel_meta.token_indices_sorted_by_lora_ids,
-            'num_tokens_per_lora': self.v1_kernel_meta.num_tokens_per_lora,
-            'lora_token_start_loc': self.v1_kernel_meta.lora_token_start_loc,
-            'lora_ids': self.v1_kernel_meta.active_lora_ids,
+            self.lora_kernel_meta.token_indices_sorted_by_lora_ids,
+            'num_tokens_per_lora': self.lora_kernel_meta.num_tokens_per_lora,
+            'lora_token_start_loc': self.lora_kernel_meta.lora_token_start_loc,
+            'lora_ids': self.lora_kernel_meta.active_lora_ids,
             'scaling': 1.0,
         }

-    def as_v1_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
-        assert self.v1_kernel_meta is not None
+    def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
         self.sanity_check()
         self.to_device(self.input.device)
@@ -773,12 +477,12 @@ class BenchmarkTensors:
             'inputs': self.input,
             'lora_b_weights': self.lora_weights_lst,
             'output_tensor': self.output,
-            'token_lora_mapping': self.v1_kernel_meta.token_lora_mapping,
+            'token_lora_mapping': self.lora_kernel_meta.token_lora_mapping,
             'token_indices_sorted_by_lora_ids':
-            self.v1_kernel_meta.token_indices_sorted_by_lora_ids,
-            'num_tokens_per_lora': self.v1_kernel_meta.num_tokens_per_lora,
-            'lora_token_start_loc': self.v1_kernel_meta.lora_token_start_loc,
-            'lora_ids': self.v1_kernel_meta.active_lora_ids,
+            self.lora_kernel_meta.token_indices_sorted_by_lora_ids,
+            'num_tokens_per_lora': self.lora_kernel_meta.num_tokens_per_lora,
+            'lora_token_start_loc': self.lora_kernel_meta.lora_token_start_loc,
+            'lora_ids': self.lora_kernel_meta.active_lora_ids,
             'offset_start': 0,
             'add_inputs': add_inputs,
         }
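
Taken together, as_lora_shrink_kwargs() and as_lora_expand_kwargs() spell out the full argument surface of the two surviving kernels. A condensed sketch of how a caller outside the benchmark might assemble the shrink arguments, using only names that appear in this diff; the commented-out call is a sketch that assumes a CUDA device and a vLLM build containing this change, and is not guaranteed against future signature changes:

import torch

num_tokens, hidden_size, lora_rank = 128, 2048, 16
num_loras, num_slices = 4, 2
dtype, device = torch.float32, "cpu"  # the benchmark itself runs fp16/bf16 on CUDA

# Shrink operands, shapes per matmul_shapes():
#   (m, k) x [num_loras, lora_rank, hidden_size] -> (num_slices, m, lora_rank)
inputs = torch.rand((num_tokens, hidden_size), dtype=dtype, device=device)
lora_a_weights = [
    torch.rand((num_loras, lora_rank, hidden_size), dtype=dtype, device=device)
    for _ in range(num_slices)
]
output_tensor = torch.zeros((num_slices, num_tokens, lora_rank),
                            dtype=torch.float32, device=device)
token_lora_mapping = torch.randint(0, num_loras, (num_tokens, ), device=device)

# from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_shrink
# meta = LoRAKernelMeta.make(max_loras=num_loras, max_num_tokens=num_tokens,
#                            device=device)
# meta.prepare_tensors(token_lora_mapping=token_lora_mapping)
# lora_shrink(
#     inputs=inputs,
#     lora_a_weights=lora_a_weights,
#     output_tensor=output_tensor,
#     token_lora_mapping=meta.token_lora_mapping,
#     token_indices_sorted_by_lora_ids=meta.token_indices_sorted_by_lora_ids,
#     num_tokens_per_lora=meta.num_tokens_per_lora,
#     lora_token_start_loc=meta.lora_token_start_loc,
#     lora_ids=meta.active_lora_ids,
#     scaling=1.0)
# lora_expand takes lora_b_weights plus offset_start/add_inputs analogously.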
@@ -791,20 +495,10 @@ class BenchmarkTensors:
         else:
             assert add_inputs is not None

-        if op_type == OpType.SGMV_SHRINK:
-            return self.as_sgmv_shrink_kwargs()
-        if op_type == OpType.SGMV_EXPAND:
-            return self.as_sgmv_expand_kwargs(add_inputs)
-        if op_type == OpType.BGMV_SHRINK:
-            return self.as_bgmv_shrink_kwargs()
-        if op_type == OpType.BGMV_EXPAND:
-            return self.as_bgmv_expand_kwargs(add_inputs)
-        if op_type == OpType.BGMV_EXPAND_SLICE:
-            return self.as_bgmv_expand_slice_kwargs(add_inputs)
-        if op_type == OpType.V1_SHRINK:
-            return self.as_v1_shrink_kwargs()
-        if op_type == OpType.V1_EXPAND:
-            return self.as_v1_expand_kwargs(add_inputs)
+        if op_type == OpType.LORA_SHRINK:
+            return self.as_lora_shrink_kwargs()
+        if op_type == OpType.LORA_EXPAND:
+            return self.as_lora_expand_kwargs(add_inputs)
         raise ValueError(f"Unrecognized optype {self}")

     def test_correctness(self, op_type: OpType,
@@ -993,10 +687,6 @@ def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]):
     for bench_ctx in bench_ctxs:
         for seq_len in args.seq_lengths:
             bench_ops: list[OpType] = args.op_types
-            if seq_len > 1:
-                # bench only prefill ops
-                bench_ops = [op for op in args.op_types if op.is_prefill_op()]

             seq_len_timers = []
             for bench_op in bench_ops:
                 for num_slices in bench_op.num_slices():
@@ -1206,13 +896,13 @@ Benchmark LoRA kernels:
 {use_cuda_graph_recommendation()}

 list_bench example:
-    python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
+    python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32

 model_bench example:
-    python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
+    python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32

 range_bench example:
-    python3 benchmarks/kernels/benchmark_lora.py range_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8
+    python3 benchmarks/kernels/benchmark_lora.py range_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8

         """,  # noqa: E501
         formatter_class=argparse.RawTextHelpFormatter)
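
For invocations that still pass the retired op names on the command line, the mapping implied by the updated examples above is sketched below (illustrative helper, not enforced anywhere in this diff):

# Old benchmark op names -> consolidated names accepted after this commit.
OP_TYPE_MIGRATION = {
    "sgmv_shrink": "lora_shrink",
    "bgmv_shrink": "lora_shrink",
    "sgmv_expand": "lora_expand",
    "bgmv_expand": "lora_expand",
    "bgmv_expand_slice": "lora_expand",
}

def migrate_op_types(op_types: list[str]) -> list[str]:
    """Translate a pre-#14685 --op-types list, dropping duplicates."""
    return sorted({OP_TYPE_MIGRATION.get(op, op) for op in op_types})

print(migrate_op_types(["bgmv_shrink", "sgmv_expand", "bgmv_expand_slice"]))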