[core] remove beam search from the core (#9105)

2024-10-06 22:47:04 -07:00
parent c8f26bb636
commit 18b296fdb2
25 changed files with 98 additions and 596 deletions
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -23,7 +23,6 @@ class RequestFuncInput:
    output_len: int
    model: str
    best_of: int = 1
-    use_beam_search: bool = False
    logprobs: Optional[int] = None
    multi_modal_content: Optional[dict] = None
    ignore_eos: bool = False
@@ -49,7 +48,6 @@ async def async_request_tgi(
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
        params = {
            "best_of": request_func_input.best_of,
            "max_new_tokens": request_func_input.output_len,
@@ -121,7 +119,6 @@ async def async_request_trt_llm(
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
        assert request_func_input.best_of == 1
        payload = {
            "accumulate_tokens": True,
@@ -187,7 +184,6 @@ async def async_request_deepspeed_mii(
 ) -> RequestFuncOutput:
    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert request_func_input.best_of == 1
-        assert not request_func_input.use_beam_search

        payload = {
            "prompt": request_func_input.prompt,
@@ -235,7 +231,6 @@ async def async_request_openai_completions(
    ), "OpenAI Completions API URL must end with 'completions' or 'profile'."

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
        payload = {
            "model": request_func_input.model,
            "prompt": request_func_input.prompt,
@@ -317,7 +312,6 @@ async def async_request_openai_chat_completions(
    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
        content = [{"type": "text", "text": request_func_input.prompt}]
        if request_func_input.multi_modal_content:
            content.append(request_func_input.multi_modal_content)
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -51,9 +51,8 @@ def main(args: argparse.Namespace):

    sampling_params = SamplingParams(
        n=args.n,
-        temperature=0.0 if args.use_beam_search else 1.0,
+        temperature=1.0,
        top_p=1.0,
-        use_beam_search=args.use_beam_search,
        ignore_eos=True,
        max_tokens=args.output_len,
    )
--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
@@ -68,7 +68,6 @@ def run_vllm(
    tensor_parallel_size: int,
    seed: int,
    n: int,
-    use_beam_search: bool,
    trust_remote_code: bool,
    dtype: str,
    max_model_len: Optional[int],
@@ -114,9 +113,8 @@ def run_vllm(
        sampling_params.append(
            SamplingParams(
                n=n,
-                temperature=0.0 if use_beam_search else 1.0,
+                temperature=1.0,
                top_p=1.0,
-                use_beam_search=use_beam_search,
                ignore_eos=True,
                max_tokens=output_len,
            ))
@@ -144,15 +142,16 @@ def main(args: argparse.Namespace):
                                   args.output_len)

    if args.backend == "vllm":
-        elapsed_time = run_vllm(
-            requests, args.model, args.tokenizer, args.quantization,
-            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
-            args.trust_remote_code, args.dtype, args.max_model_len,
-            args.enforce_eager, args.kv_cache_dtype,
-            args.quantization_param_path, args.device,
-            args.enable_prefix_caching, args.enable_chunked_prefill,
-            args.max_num_batched_tokens, args.gpu_memory_utilization,
-            args.download_dir)
+        elapsed_time = run_vllm(requests, args.model, args.tokenizer,
+                                args.quantization, args.tensor_parallel_size,
+                                args.seed, args.n, args.trust_remote_code,
+                                args.dtype, args.max_model_len,
+                                args.enforce_eager, args.kv_cache_dtype,
+                                args.quantization_param_path, args.device,
+                                args.enable_prefix_caching,
+                                args.enable_chunked_prefill,
+                                args.max_num_batched_tokens,
+                                args.gpu_memory_utilization, args.download_dir)
    else:
        raise ValueError(f"Unknown backend: {args.backend}")
    total_num_tokens = sum(prompt_len + output_len
@@ -203,7 +202,6 @@ if __name__ == "__main__":
                        type=int,
                        default=1,
                        help="Number of generated sequences per prompt.")
-    parser.add_argument("--use-beam-search", action="store_true")
    parser.add_argument("--num-prompts",
                        type=int,
                        default=200,
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -391,7 +391,6 @@ async def benchmark(
    input_requests: List[Tuple[str, int, int]],
    logprobs: Optional[int],
    best_of: int,
-    use_beam_search: bool,
    request_rate: float,
    disable_tqdm: bool,
    profile: bool,
@@ -419,7 +418,6 @@ async def benchmark(
        output_len=test_output_len,
        logprobs=logprobs,
        best_of=best_of,
-        use_beam_search=use_beam_search,
        multi_modal_content=test_mm_content,
        ignore_eos=ignore_eos,
    )
@@ -441,7 +439,6 @@ async def benchmark(
            output_len=test_output_len,
            logprobs=logprobs,
            best_of=best_of,
-            use_beam_search=use_beam_search,
            multi_modal_content=test_mm_content,
        )
        profile_output = await request_func(request_func_input=profile_input)
@@ -464,7 +461,6 @@ async def benchmark(
            output_len=output_len,
            logprobs=logprobs,
            best_of=best_of,
-            use_beam_search=use_beam_search,
            multi_modal_content=mm_content,
        )
        tasks.append(
@@ -483,7 +479,6 @@ async def benchmark(
            output_len=test_output_len,
            logprobs=logprobs,
            best_of=best_of,
-            use_beam_search=use_beam_search,
        )
        profile_output = await request_func(request_func_input=profile_input)
        if profile_output.success:
@@ -679,7 +674,6 @@ def main(args: argparse.Namespace):
            input_requests=input_requests,
            logprobs=args.logprobs,
            best_of=args.best_of,
-            use_beam_search=args.use_beam_search,
            request_rate=args.request_rate,
            disable_tqdm=args.disable_tqdm,
            profile=args.profile,
@@ -701,7 +695,6 @@ def main(args: argparse.Namespace):
        result_json["model_id"] = model_id
        result_json["tokenizer_id"] = tokenizer_id
        result_json["best_of"] = args.best_of
-        result_json["use_beam_search"] = args.use_beam_search
        result_json["num_prompts"] = args.num_prompts

        # Metadata
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -73,7 +73,6 @@ def run_vllm(
    tensor_parallel_size: int,
    seed: int,
    n: int,
-    use_beam_search: bool,
    trust_remote_code: bool,
    dtype: str,
    max_model_len: Optional[int],
@@ -91,7 +90,6 @@ def run_vllm(
    download_dir: Optional[str] = None,
    load_format: str = EngineArgs.load_format,
    disable_async_output_proc: bool = False,
-    use_new_beam_search_impl: bool = False,
 ) -> float:
    from vllm import LLM, SamplingParams
    llm = LLM(
@@ -127,19 +125,19 @@ def run_vllm(
        sampling_params.append(
            SamplingParams(
                n=n,
-                temperature=0.0 if use_beam_search else 1.0,
+                temperature=1.0,
                top_p=1.0,
-                use_beam_search=use_beam_search,
                ignore_eos=True,
                max_tokens=output_len,
            ))

-    if not use_new_beam_search_impl:
+    use_beam_search = False
+
+    if not use_beam_search:
        start = time.perf_counter()
        llm.generate(prompts, sampling_params, use_tqdm=True)
        end = time.perf_counter()
    else:
-        assert use_beam_search
        prompts = [prompt for prompt, _, _ in requests]
        # output_len should be the same for all requests.
        output_len = requests[0][2]
@@ -165,7 +163,6 @@ async def run_vllm_async(
    tensor_parallel_size: int,
    seed: int,
    n: int,
-    use_beam_search: bool,
    trust_remote_code: bool,
    dtype: str,
    max_model_len: Optional[int],
@@ -224,9 +221,8 @@ async def run_vllm_async(
            sampling_params.append(
                SamplingParams(
                    n=n,
-                    temperature=0.0 if use_beam_search else 1.0,
+                    temperature=1.0,
                    top_p=1.0,
-                    use_beam_search=use_beam_search,
                    ignore_eos=True,
                    max_tokens=output_len,
                ))
@@ -248,11 +244,9 @@ def run_hf(
    model: str,
    tokenizer: PreTrainedTokenizerBase,
    n: int,
-    use_beam_search: bool,
    max_batch_size: int,
    trust_remote_code: bool,
 ) -> float:
-    assert not use_beam_search
    llm = AutoModelForCausalLM.from_pretrained(
        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
    if llm.config.model_type == "llama":
@@ -284,7 +278,7 @@ def run_hf(
                              padding=True).input_ids
        llm_outputs = llm.generate(
            input_ids=input_ids.cuda(),
-            do_sample=not use_beam_search,
+            do_sample=True,
            num_return_sequences=n,
            temperature=1.0,
            top_p=1.0,
@@ -340,7 +334,7 @@ def main(args: argparse.Namespace):
    if args.backend == "vllm":
        run_args = [
            requests, args.model, args.tokenizer, args.quantization,
-            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
+            args.tensor_parallel_size, args.seed, args.n,
            args.trust_remote_code, args.dtype, args.max_model_len,
            args.enforce_eager, args.kv_cache_dtype,
            args.quantization_param_path, args.device,
@@ -355,12 +349,11 @@ def main(args: argparse.Namespace):
            run_args.append(args.disable_frontend_multiprocessing)
            elapsed_time = uvloop.run(run_vllm_async(*run_args))
        else:
-            elapsed_time = run_vllm(*run_args, args.use_new_beam_search_impl)
+            elapsed_time = run_vllm(*run_args)
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
-                              args.use_beam_search, args.hf_max_batch_size,
-                              args.trust_remote_code)
+                              args.hf_max_batch_size, args.trust_remote_code)
    elif args.backend == "mii":
        elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
                               args.output_len)
@@ -414,8 +407,6 @@ if __name__ == "__main__":
                        type=int,
                        default=1,
                        help="Number of generated sequences per prompt.")
-    parser.add_argument("--use-beam-search", action="store_true")
-    parser.add_argument("--use-new-beam-search-impl", action="store_true")
    parser.add_argument("--num-prompts",
                        type=int,
                        default=1000,
@@ -570,8 +561,6 @@ if __name__ == "__main__":
            raise ValueError("dtype must be auto for MII backend.")
        if args.n != 1:
            raise ValueError("n must be 1 for MII backend.")
-        if args.use_beam_search:
-            raise ValueError("Beam search is not supported for MII backend.")
        if args.quantization is not None:
            raise ValueError("Quantization is only for vLLM backend.")
        if args.hf_max_batch_size is not None: