[V0 Deprecation] Remove LLMEngine (#25033)

Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
commit 52c2a8d4ad (parent 367a480bd3)
Author: Woosuk Kwon
Date: 2025-09-20 17:56:30 -07:00
Committed by: GitHub
29 changed files with 65 additions and 2763 deletions
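
The diff below drops the disable_async_output_proc plumbing from these
kv-cache-dtype tests: that flag only existed on the V0 LLMEngine this commit
removes, so both the parametrization and the keyword argument go away. As a
rough sketch of the post-commit shape (identifiers not visible in the hunks,
such as `model`, are assumptions, not quotes from the file), the baseline run
of test_models now reduces to:

    # Sketch only; names not shown in the hunks below are assumed.
    with vllm_runner(
            model,
            max_model_len=MAX_MODEL_LEN,
            tensor_parallel_size=tensor_parallel_size,
            enforce_eager=enforce_eager,
            kv_cache_dtype="auto",
    ) as vllm_model:
        baseline_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, NUM_LOG_PROBS)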

@@ -36,9 +36,6 @@ from ..utils import check_logprobs_close
 # NOTE: Increasing this in this suite will fail CI because we currently cannot
 # reset distributed env properly. Use a value > 1 just when you test.
 @pytest.mark.parametrize("tensor_parallel_size", [1])
-# Due to low-precision numerical divergence, this test is too sensitive for
-# the async postprocessor
-@pytest.mark.parametrize("disable_async_output_proc", [True])
 def test_models(
     vllm_runner,
     example_prompts,
@@ -49,7 +46,6 @@ def test_models(
     enforce_eager: bool,
     backend: str,
     tensor_parallel_size: int,
-    disable_async_output_proc: bool,
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     """
@@ -74,7 +70,6 @@ def test_models(
             tensor_parallel_size=tensor_parallel_size,
             enforce_eager=enforce_eager,
             kv_cache_dtype="auto",
-            disable_async_output_proc=disable_async_output_proc,
     ) as vllm_model:
         baseline_outputs = vllm_model.generate_greedy_logprobs(
             example_prompts, max_tokens, NUM_LOG_PROBS)
@@ -85,7 +80,6 @@ def test_models(
             tensor_parallel_size=tensor_parallel_size,
             enforce_eager=enforce_eager,
             kv_cache_dtype=kv_cache_dtype,
-            disable_async_output_proc=disable_async_output_proc,
     ) as vllm_model:
         test_outputs = vllm_model.generate_greedy_logprobs(
             example_prompts, max_tokens, NUM_LOG_PROBS)
@@ -110,9 +104,6 @@ def test_models(
 ])
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
 @pytest.mark.parametrize("max_tokens", [4])
-# Due to low-precision numerical divergence, this test is too sensitive for
-# the async postprocessor
-@pytest.mark.parametrize("disable_async_output_proc", [True])
 def test_cpu_models(
     vllm_runner,
     example_prompts,
@@ -120,7 +111,6 @@ def test_cpu_models(
     base_model: str,
     test_model: str,
     max_tokens: int,
-    disable_async_output_proc: bool,
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     """
@@ -138,7 +128,6 @@ def test_cpu_models(
             max_model_len=MAX_MODEL_LEN,
             dtype="bfloat16",
             kv_cache_dtype="auto",
-            disable_async_output_proc=disable_async_output_proc,
     ) as vllm_model:
         baseline_outputs = vllm_model.generate_greedy_logprobs(
             example_prompts, max_tokens, NUM_LOG_PROBS)
@@ -148,7 +137,6 @@ def test_cpu_models(
             max_model_len=MAX_MODEL_LEN,
             dtype="bfloat16",
             kv_cache_dtype=kv_cache_dtype,
-            disable_async_output_proc=disable_async_output_proc,
     ) as vllm_model:
         test_outputs = vllm_model.generate_greedy_logprobs(
             example_prompts, max_tokens, NUM_LOG_PROBS)
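
The hunks stop right after test_outputs is computed, but the import of
check_logprobs_close shown in the first hunk header implies the two runs are
then compared for near-identical top logprobs, roughly like this (a sketch;
the name_0/name_1 labels are assumed, not quoted from the file):

    # Hypothetical continuation: assert the low-precision kv-cache run
    # stays within logprob tolerance of the baseline run.
    check_logprobs_close(
        outputs_0_lst=baseline_outputs,
        outputs_1_lst=test_outputs,
        name_0="kv_cache_auto",
        name_1="kv_cache_quantized",
    )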