diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 17cc2984f..f06f41a47 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -2072,32 +2072,38 @@ class CustomDataset(BenchmarkDataset):
                 break
             prompt = item["prompt"]
 
-            new_output_len = output_len
-            if output_len is None or output_len == -1:
-                # check that the request has an 'output_tokens' field
-                if "output_tokens" not in item:
-                    raise ValueError(
-                        "If no output length is provided the "
-                        "custom dataset must contain an 'output_tokens' field."
+            if tokenizer is None:
+                new_output_len = 1
+            else:
+                new_output_len = output_len
+                if output_len is None or output_len == -1:
+                    # check that the request has an 'output_tokens' field
+                    if "output_tokens" not in item:
+                        raise ValueError(
+                            "If no output length is provided the "
+                            "custom dataset must contain an 'output_tokens' field."
+                        )
+                    # Use number of output tokens from the request data
+                    try:
+                        new_output_len = int(item["output_tokens"])
+                    except (ValueError, TypeError) as e:
+                        raise ValueError(
+                            f"Invalid value for 'output_tokens' in custom dataset: "
+                            f"'{item['output_tokens']}'. Must be an integer."
+                        ) from e
+
+            if tokenizer is None:
+                prompt_len = 1
+            else:
+                # apply template
+                if not skip_chat_template:
+                    prompt = tokenizer.apply_chat_template(
+                        [{"role": "user", "content": prompt}],
+                        add_generation_prompt=True,
+                        tokenize=False,
                     )
-                # Use number of output tokens from the request data
-                try:
-                    new_output_len = int(item["output_tokens"])
-                except (ValueError, TypeError) as e:
-                    raise ValueError(
-                        f"Invalid value for 'output_tokens' in custom dataset: "
-                        f"'{item['output_tokens']}'. Must be an integer."
-                    ) from e
-            # apply template
-            if not skip_chat_template:
-                prompt = tokenizer.apply_chat_template(
-                    [{"role": "user", "content": prompt}],
-                    add_generation_prompt=True,
-                    tokenize=False,
-                )
-
-            prompt_len = len(tokenizer(prompt).input_ids)
+                prompt_len = len(tokenizer(prompt).input_ids)
 
             sampled_requests.append(
                 SampleRequest(
                     prompt=prompt,
diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py
index cccbcdb83..e231ccf6e 100644
--- a/vllm/benchmarks/lib/endpoint_request_func.py
+++ b/vllm/benchmarks/lib/endpoint_request_func.py
@@ -746,6 +746,37 @@ async def async_request_infinity_embeddings_clip(
     )
 
 
+async def async_request_vllm_pooling(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: tqdm | None = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    _validate_api_url(api_url, "vLLM Pooling API", "pooling")
+
+    payload = {
+        "model": request_func_input.model_name
+        if request_func_input.model_name
+        else request_func_input.model,
+        "truncate_prompt_tokens": -1,
+    }
+
+    payload = payload | request_func_input.prompt
+
+    _update_payload_common(payload, request_func_input)
+
+    headers = _get_headers("application/json")
+    _update_headers_common(headers, request_func_input)
+
+    return await _run_pooling_request(
+        session,
+        api_url,
+        payload=payload,
+        headers=headers,
+        pbar=pbar,
+    )
+
+
 # TODO: Add more request functions for different API protocols.
 ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = {
     "vllm": async_request_openai_completions,
@@ -760,6 +791,7 @@ ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = {
     "infinity-embeddings": async_request_infinity_embeddings,
     "infinity-embeddings-clip": async_request_infinity_embeddings_clip,
     # (Infinity embedding server does not support vlm2vec)
+    "vllm-pooling": async_request_vllm_pooling,
     "vllm-rerank": async_request_vllm_rerank,
 }
 
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index a1361fb80..534392883 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -423,16 +423,19 @@ def calculate_metrics(
             output_len = outputs[i].output_tokens
 
             if not output_len:
-                # We use the tokenizer to count the number of output tokens
-                # for some serving backends instead of looking at
-                # len(outputs[i].itl) since multiple output tokens may be
-                # bundled together
-                # Note : this may inflate the output token count slightly
-                output_len = len(
-                    tokenizer(
-                        outputs[i].generated_text, add_special_tokens=False
-                    ).input_ids
-                )
+                if tokenizer is None:
+                    output_len = 1
+                else:
+                    # We use the tokenizer to count the number of output tokens
+                    # for some serving backends instead of looking at
+                    # len(outputs[i].itl) since multiple output tokens may be
+                    # bundled together
+                    # Note : this may inflate the output token count slightly
+                    output_len = len(
+                        tokenizer(
+                            outputs[i].generated_text, add_special_tokens=False
+                        ).input_ids
+                    )
             actual_output_lens.append(output_len)
             total_input += input_requests[i].prompt_len
             tpot = 0
@@ -919,7 +922,7 @@ async def benchmark(
         print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate))
     print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
     print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
-    if isinstance(metrics, BenchmarkMetrics):
+    if isinstance(metrics, BenchmarkMetrics) and tokenizer:
         print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
     print(
         "{:<40} {:<10.2f}".format(
@@ -933,16 +936,18 @@ async def benchmark(
             )
         )
     if isinstance(metrics, BenchmarkMetrics):
-        print(
-            "{:<40} {:<10.2f}".format(
-                "Output token throughput (tok/s):", metrics.output_throughput
+        if tokenizer:
+            print(
+                "{:<40} {:<10.2f}".format(
+                    "Output token throughput (tok/s):", metrics.output_throughput
+                )
             )
-        )
-        print(
-            "{:<40} {:<10.2f}".format(
-                "Peak output token throughput (tok/s):", metrics.max_output_tokens_per_s
+            print(
+                "{:<40} {:<10.2f}".format(
+                    "Peak output token throughput (tok/s):",
+                    metrics.max_output_tokens_per_s,
+                )
             )
-        )
     print(
         "{:<40} {:<10.2f}".format(
             "Peak concurrent requests:", metrics.max_concurrent_requests
@@ -954,11 +959,12 @@ async def benchmark(
             "RTFx (Inverse Real-Time Factor):", metrics.rtfx
         )
     )
-    print(
-        "{:<40} {:<10.2f}".format(
-            "Total token throughput (tok/s):", metrics.total_token_throughput
+    if tokenizer:
+        print(
+            "{:<40} {:<10.2f}".format(
+                "Total token throughput (tok/s):", metrics.total_token_throughput
+            )
         )
-    )
 
     if isinstance(metrics, BenchmarkMetrics):
         result = {
@@ -1047,7 +1053,7 @@ async def benchmark(
             print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
             result[f"p{p_word}_{metric_attribute_name}_ms"] = value
 
-    if task_type == TaskType.GENERATION:
+    if task_type == TaskType.GENERATION and tokenizer:
         process_one_metric("ttft", "TTFT", "Time to First Token")
         process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
         process_one_metric("itl", "ITL", "Inter-token Latency")
@@ -1519,6 +1525,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
         type=json.loads,
         default=None,
     )
+    parser.add_argument(
+        "--skip-tokenizer-init",
+        action="store_true",
+        default=False,
+        help="Skip initialization of tokenizer and detokenizer",
+    )
 
     parser.add_argument(
         "--insecure",
@@ -1599,14 +1611,18 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
     model_name = args.served_model_name
     model_id = args.model
 
-    tokenizer_id = args.tokenizer if args.tokenizer is not None else model_id
-    tokenizer_mode = args.tokenizer_mode
-
-    tokenizer = get_tokenizer(
-        tokenizer_id,
-        tokenizer_mode=tokenizer_mode,
-        trust_remote_code=args.trust_remote_code,
-    )
+    if args.skip_tokenizer_init:
+        tokenizer_id = None
+        tokenizer_mode = None
+        tokenizer = None
+    else:
+        tokenizer_id = args.tokenizer if args.tokenizer is not None else model_id
+        tokenizer_mode = args.tokenizer_mode
+        tokenizer = get_tokenizer(
+            tokenizer_id,
+            tokenizer_mode=tokenizer_mode,
+            trust_remote_code=args.trust_remote_code,
+        )
 
     if args.dataset_name is None:
         raise ValueError(