diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 55284706e..3a05440e4 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -319,9 +319,6 @@ def _compare_tp( pp_env = { "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1", } - # Temporary. Currently when zeromq + SPMD is used, it does not properly - # terminate because of a Ray Compiled Graph issue. - common_args.append("--disable-frontend-multiprocessing") elif distributed_backend == "mp": pp_env = None else: diff --git a/tests/entrypoints/instrumentator/test_basic.py b/tests/entrypoints/instrumentator/test_basic.py index 9c2986ebe..5f48fb266 100644 --- a/tests/entrypoints/instrumentator/test_basic.py +++ b/tests/entrypoints/instrumentator/test_basic.py @@ -28,7 +28,7 @@ def server_args(request: pytest.FixtureRequest) -> list[str]: >>> @pytest.mark.parametrize( >>> "server_args", >>> [ - >>> ["--disable-frontend-multiprocessing"], + >>> ["--max-model-len", "10100"], >>> [ >>> "--model=NousResearch/Hermes-3-Llama-3.1-70B", >>> "--enable-auto-tool-choice", @@ -40,7 +40,7 @@ def server_args(request: pytest.FixtureRequest) -> list[str]: >>> ... This will run `test_foo` twice with servers with: - - `--disable-frontend-multiprocessing` + - `--max-model-len 10100` - `--model=NousResearch/Hermes-3-Llama-3.1-70B --enable-auto-tool-choice`. """ @@ -79,17 +79,6 @@ async def client(server): yield async_client -@pytest.mark.parametrize( - "server_args", - [ - pytest.param([], id="default-frontend-multiprocessing"), - pytest.param( - ["--disable-frontend-multiprocessing"], - id="disable-frontend-multiprocessing", - ), - ], - indirect=True, -) @pytest.mark.asyncio async def test_show_version(server: RemoteOpenAIServer): response = requests.get(server.url_for("version")) @@ -98,17 +87,6 @@ async def test_show_version(server: RemoteOpenAIServer): assert response.json() == {"version": VLLM_VERSION} -@pytest.mark.parametrize( - "server_args", - [ - pytest.param([], id="default-frontend-multiprocessing"), - pytest.param( - ["--disable-frontend-multiprocessing"], - id="disable-frontend-multiprocessing", - ), - ], - indirect=True, -) @pytest.mark.asyncio async def test_check_health(server: RemoteOpenAIServer): response = requests.get(server.url_for("health")) @@ -119,13 +97,7 @@ async def test_check_health(server: RemoteOpenAIServer): @pytest.mark.parametrize( "server_args", [ - pytest.param( - ["--max-model-len", "10100"], id="default-frontend-multiprocessing" - ), - pytest.param( - ["--disable-frontend-multiprocessing", "--max-model-len", "10100"], - id="disable-frontend-multiprocessing", - ), + pytest.param(["--max-model-len", "10100"]), ], indirect=True, ) diff --git a/tests/entrypoints/instrumentator/test_metrics.py b/tests/entrypoints/instrumentator/test_metrics.py index 19d1234c3..ba4e65977 100644 --- a/tests/entrypoints/instrumentator/test_metrics.py +++ b/tests/entrypoints/instrumentator/test_metrics.py @@ -50,7 +50,6 @@ def default_server_args(): params=[ "", "--enable-chunked-prefill", - "--disable-frontend-multiprocessing", f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}", ], ) diff --git a/tests/entrypoints/openai/completion/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/completion/test_completion_with_prompt_embeds.py index 5ca907b89..24f662591 100644 --- a/tests/entrypoints/openai/completion/test_completion_with_prompt_embeds.py +++ b/tests/entrypoints/openai/completion/test_completion_with_prompt_embeds.py @@ -83,11 +83,8 @@ def example_prompt_embeds(hf_runner): return [_encode_embeds(item) for item in example_embeddings] -@pytest.fixture(scope="module", params=["", "--disable-frontend-multiprocessing"]) -def server_with_prompt_embeds(default_server_args, request): - if request.param: - default_server_args.append(request.param) - +@pytest.fixture(scope="module") +def server_with_prompt_embeds(default_server_args): with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: yield remote_server diff --git a/tests/entrypoints/openai/completion/test_shutdown.py b/tests/entrypoints/openai/completion/test_shutdown.py index 43f57719a..80d00bd23 100644 --- a/tests/entrypoints/openai/completion/test_shutdown.py +++ b/tests/entrypoints/openai/completion/test_shutdown.py @@ -150,7 +150,6 @@ async def test_shutdown_on_engine_failure(): "0.05", "--max-num-seqs", "2", - "--disable-frontend-multiprocessing", ], # ROCm: Disable stdout/stderr pipe capture. Subprocess hangs when # stdout/stderr pipes are enabled during ROCm GPU initialization. diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py index 7faf25220..bbb8c104f 100644 --- a/tests/v1/entrypoints/openai/test_completion.py +++ b/tests/v1/entrypoints/openai/test_completion.py @@ -26,19 +26,12 @@ def default_server_args(): "128", "--enforce-eager", "--enable-prompt-tokens-details", + "--no-enable-prefix-caching", ] -@pytest.fixture( - scope="module", - params=[ - ["--no-enable-prefix-caching"], - ["--no-enable-prefix-caching", "--disable-frontend-multiprocessing"], - ], -) -def server(default_server_args, request): - if request.param: - default_server_args = default_server_args + request.param +@pytest.fixture(scope="module") +def server(default_server_args): with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: yield remote_server diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index 4c6379d67..f7cea8bdd 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -181,7 +181,6 @@ async def run_vllm_async( n: int, engine_args: AsyncEngineArgs, do_profile: bool, - disable_frontend_multiprocessing: bool = False, disable_detokenize: bool = False, ) -> float: from vllm import SamplingParams @@ -191,7 +190,6 @@ async def run_vllm_async( async with build_async_engine_client_from_engine_args( engine_args, - disable_frontend_multiprocessing=disable_frontend_multiprocessing, ) as llm: model_config = llm.model_config assert all( @@ -757,12 +755,6 @@ def add_cli_args(parser: argparse.ArgumentParser): default=False, help="Use vLLM async engine rather than LLM class.", ) - parser.add_argument( - "--disable-frontend-multiprocessing", - action="store_true", - default=False, - help="Disable decoupled async engine frontend.", - ) parser.add_argument( "--disable-detokenize", action="store_true", @@ -880,7 +872,6 @@ def main(args: argparse.Namespace): requests, args.n, AsyncEngineArgs.from_cli_args(args), - disable_frontend_multiprocessing=args.disable_frontend_multiprocessing, disable_detokenize=args.disable_detokenize, do_profile=args.profile, ) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 39e9076a7..4d5c5eae8 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -79,7 +79,6 @@ async def build_async_engine_client( args: Namespace, *, usage_context: UsageContext = UsageContext.OPENAI_API_SERVER, - disable_frontend_multiprocessing: bool | None = None, client_config: dict[str, Any] | None = None, ) -> AsyncIterator[EngineClient]: if os.getenv("VLLM_WORKER_MULTIPROC_METHOD") == "forkserver": @@ -98,13 +97,9 @@ async def build_async_engine_client( engine_args._api_process_count = client_config.get("client_count", 1) engine_args._api_process_rank = client_config.get("client_index", 0) - if disable_frontend_multiprocessing is None: - disable_frontend_multiprocessing = bool(args.disable_frontend_multiprocessing) - async with build_async_engine_client_from_engine_args( engine_args, usage_context=usage_context, - disable_frontend_multiprocessing=disable_frontend_multiprocessing, client_config=client_config, ) as engine: yield engine @@ -115,7 +110,6 @@ async def build_async_engine_client_from_engine_args( engine_args: AsyncEngineArgs, *, usage_context: UsageContext = UsageContext.OPENAI_API_SERVER, - disable_frontend_multiprocessing: bool = False, client_config: dict[str, Any] | None = None, ) -> AsyncIterator[EngineClient]: """ @@ -129,9 +123,6 @@ async def build_async_engine_client_from_engine_args( # Create the EngineConfig (determines if we can use V1). vllm_config = engine_args.create_engine_config(usage_context=usage_context) - if disable_frontend_multiprocessing: - logger.warning("V1 is enabled, but got --disable-frontend-multiprocessing.") - from vllm.v1.engine.async_llm import AsyncLLM async_llm: AsyncLLM | None = None diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index ab28b6299..2bd991b00 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -105,9 +105,6 @@ class BaseFrontendArgs: """When `--max-logprobs` is specified, represents single tokens as strings of the form 'token_id:{token_id}' so that tokens that are not JSON-encodable can be identified.""" - disable_frontend_multiprocessing: bool = False - """If specified, will run the OpenAI frontend server in the same process as - the model serving engine.""" enable_auto_tool_choice: bool = False """Enable auto tool choice for supported models. Use `--tool-call-parser` to specify which parser to use.""" diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index e244ffd71..03a15991d 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -823,7 +823,6 @@ async def main(args: Namespace): async with build_async_engine_client( args, usage_context=UsageContext.OPENAI_BATCH_RUNNER, - disable_frontend_multiprocessing=False, ) as engine_client: await run_batch(engine_client, args)