[Feature] Add visionarena offline support for benchmark_throughput (#14654)

Signed-off-by: Jennifer Zhao <7443418+JenZhao@users.noreply.github.com>
Signed-off-by: Jennifer Zhao <ai.jenniferzhao@gmail.com>
Co-authored-by: Jennifer Zhao <7443418+JenZhao@users.noreply.github.com>
Co-authored-by: Jennifer Zhao <JenZhao@users.noreply.github.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
Author: Jennifer Zhao
Date: 2025-03-13 21:07:54 -07:00
Committed by: GitHub
Commit: a6e0d096dd (parent d3d4956261)
3 changed files with 291 additions and 99 deletions


@@ -46,7 +46,7 @@ class SampleRequest:
     Represents a single inference request for benchmarking.
     """
-    prompt: str
+    prompt: Union[str, Any]
     prompt_len: int
     expected_output_len: int
     multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None
@@ -84,6 +84,20 @@ class BenchmarkDataset(ABC):
                             if random_seed is not None else self.DEFAULT_SEED)
         self.data = None
 
+    def apply_multimodal_chat_transformation(
+            self,
+            prompt: str,
+            mm_content: Optional[MultiModalDataDict] = None) -> list[dict]:
+        """
+        Transform a prompt and optional multimodal content into a chat format.
+        This method is used for chat models that expect a specific
+        conversation format.
+        """
+        content = [{"text": prompt, "type": "text"}]
+        if mm_content is not None:
+            content.append(mm_content)
+        return [{"role": "user", "content": content}]
+
     def load_data(self) -> None:
         """
         Load data from the dataset path into self.data.
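
The new `apply_multimodal_chat_transformation` helper wraps a prompt (and any multimodal payload) in an OpenAI-style single-turn message list. A standalone sketch of the shape it produces; the `mm_content` value here is a hypothetical image payload for illustration, not necessarily the exact dict `process_image` returns:

```python
prompt = "Describe this image."
# Hypothetical multimodal payload; in the benchmark it comes from process_image().
mm_content = {"type": "image_url",
              "image_url": {"url": "data:image/jpeg;base64,..."}}

# Mirrors the helper's logic:
content = [{"text": prompt, "type": "text"}]
if mm_content is not None:
    content.append(mm_content)
messages = [{"role": "user", "content": content}]
# messages == [{"role": "user",
#               "content": [{"text": "Describe this image.", "type": "text"},
#                           {"type": "image_url", "image_url": {...}}]}]
```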
@@ -338,6 +352,7 @@ class ShareGPTDataset(BenchmarkDataset):
                lora_path: Optional[str] = None,
                max_loras: Optional[int] = None,
                output_len: Optional[int] = None,
+               enable_multimodal_chat: bool = False,
                **kwargs) -> list:
         samples: list = []
         for entry in self.data:
@@ -358,6 +373,9 @@ class ShareGPTDataset(BenchmarkDataset):
                                      skip_min_output_len_check=output_len
                                      is not None):
                 continue
+            if enable_multimodal_chat:
+                prompt = self.apply_multimodal_chat_transformation(
+                    prompt, None)
             samples.append(
                 SampleRequest(
                     prompt=prompt,
@@ -550,10 +568,13 @@ class HuggingFaceDataset(BenchmarkDataset):
             split=self.dataset_split,
             streaming=True,
         )
-        if "conversations" not in self.data.features:
-            raise ValueError("HF Dataset must have a 'conversations' column.")
+        if self.data.features is None or "conversations" \
+                not in self.data.features:
+            raise ValueError(
+                "HuggingFaceDataset currently only supports datasets with "
+                "a 'conversations' column like lmms-lab/LLaVA-OneVision-Data. "
+                "Please consider contributing if you would like to add "
+                "support for additional dataset formats.")
         # Shuffle and filter examples with at least 2 conversations.
         self.data = self.data.shuffle(seed=self.random_seed).filter(
             lambda x: len(x["conversations"]) >= 2)
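
For reference, the validation above assumes records shaped like those in lmms-lab/LLaVA-OneVision-Data. A hedged sketch of a compatible example; the field values are illustrative and exact fields vary by subset:

```python
example = {
    # Optional image; when present it is passed through process_image().
    "image": "<PIL.Image.Image object>",
    "conversations": [
        {"from": "human", "value": "<image>\nWhat is shown here?"},
        {"from": "gpt", "value": "A cat sitting on a windowsill."},
    ],
}
# load_data() keeps only examples with at least two conversation turns:
assert len(example["conversations"]) >= 2
```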
@@ -561,9 +582,8 @@ class HuggingFaceDataset(BenchmarkDataset):
     def sample(self,
                tokenizer: PreTrainedTokenizerBase,
                num_requests: int,
-               lora_path: Optional[str] = None,
-               max_loras: Optional[int] = None,
                output_len: Optional[int] = None,
+               enable_multimodal_chat: bool = False,
                **kwargs) -> list:
         sampled_requests = []
         dynamic_output = output_len is None
@@ -571,13 +591,9 @@ class HuggingFaceDataset(BenchmarkDataset):
         for item in self.data:
             if len(sampled_requests) >= num_requests:
                 break
             conv = item["conversations"]
             prompt, completion = conv[0]["value"], conv[1]["value"]
-            lora_request, tokenizer = self.get_random_lora_request(
-                tokenizer, lora_path=lora_path, max_loras=max_loras)
             prompt_ids = tokenizer(prompt).input_ids
             completion_ids = tokenizer(completion).input_ids
             prompt_len = len(prompt_ids)
@@ -587,16 +603,20 @@ class HuggingFaceDataset(BenchmarkDataset):
             if dynamic_output and not is_valid_sequence(
                     prompt_len, completion_len):
                 continue
             mm_content = process_image(
                 item["image"]) if "image" in item else None
+            if enable_multimodal_chat:
+                # Note: when chat is enabled, the request prompt_len is no
+                # longer accurate; the request output is used to count the
+                # actual prompt and output lengths.
+                prompt = self.apply_multimodal_chat_transformation(
+                    prompt, mm_content)
             sampled_requests.append(
                 SampleRequest(
                     prompt=prompt,
                     prompt_len=prompt_len,
                     expected_output_len=output_len,
                     multi_modal_data=mm_content,
-                    lora_request=lora_request,
                 ))
         return sampled_requests
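
A hypothetical driver for the reworked sampler; the constructor arguments are inferred from the attributes used in this file (`dataset_path`, `dataset_split`, `dataset_subset`) and may not match the exact signature:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
dataset = HuggingFaceDataset(
    dataset_path="lmms-lab/LLaVA-OneVision-Data",
    dataset_split="train",
    dataset_subset=None,
)
# With enable_multimodal_chat=True each prompt becomes a chat message list,
# so prompt_len is approximate (see the note in the diff above).
requests = dataset.sample(
    tokenizer=tokenizer,
    num_requests=32,
    enable_multimodal_chat=True,
)
```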
@@ -606,7 +626,7 @@ class HuggingFaceDataset(BenchmarkDataset):
 # -----------------------------------------------------------------------------
-class VisionArenaDataset(BenchmarkDataset):
+class VisionArenaDataset(HuggingFaceDataset):
     """
     Vision Arena Dataset.
     """
@@ -617,14 +637,9 @@ class VisionArenaDataset(BenchmarkDataset):
     def __init__(
         self,
-        dataset_split: str,
-        dataset_subset: Optional[str] = None,
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
-        self.dataset_split = dataset_split
-        self.dataset_subset = dataset_subset
         if self.dataset_path != self.VISION_ARENA_DATASET_PATH:
             raise ValueError(
                 "Only the Vision Arena dataset is supported; dataset path "
                 f"{self.dataset_path} is not valid.")
@@ -645,9 +660,9 @@ class VisionArenaDataset(BenchmarkDataset):
     def sample(self,
                tokenizer: PreTrainedTokenizerBase,
                num_requests: int,
-               output_len: int = DEFAULT_OUTPUT_LEN,
+               output_len: Optional[int] = None,
+               enable_multimodal_chat: bool = False,
                **kwargs) -> list:
-        # TODO (jenniferzhao): Add support for offline benchmark sampling
+        output_len = (output_len
+                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
         sampled_requests = []
@@ -655,8 +670,14 @@ class VisionArenaDataset(BenchmarkDataset):
             if len(sampled_requests) >= num_requests:
                 break
             prompt = item["turns"][0][0]["content"]
-            prompt_len = len(tokenizer(prompt).input_ids)
             mm_content = process_image(item["images"][0])
+            prompt_len = len(tokenizer(prompt).input_ids)
+            if enable_multimodal_chat:
+                # Note: when chat is enabled, the request prompt_len is no
+                # longer accurate; the request output is used to count the
+                # actual prompt length.
+                prompt = self.apply_multimodal_chat_transformation(
+                    prompt, mm_content)
             sampled_requests.append(
                 SampleRequest(
                     prompt=prompt,
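
Together these changes let benchmark_throughput sample Vision Arena data offline through the shared HuggingFaceDataset machinery. A hedged end-to-end sketch, with argument names inferred as in the previous example; `VISION_ARENA_DATASET_PATH` is shown in the diff as the only dataset path the constructor accepts:

```python
dataset = VisionArenaDataset(
    dataset_path=VisionArenaDataset.VISION_ARENA_DATASET_PATH,
    dataset_split="train",
)
requests = dataset.sample(
    tokenizer=tokenizer,          # tokenizer as in the previous sketch
    num_requests=8,
    output_len=None,              # falls back to self.DEFAULT_OUTPUT_LEN
    enable_multimodal_chat=True,  # prompts become chat message lists
)
for req in requests:
    print(type(req.prompt), req.prompt_len, req.expected_output_len)
```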