[Core] Asynchronous Output Processor (#7049)

Co-authored-by: Alexander Matveev <alexm@neuralmagic.com>
2024-08-26 20:53:20 -07:00
parent 015e6cc252
commit 2eedede875
21 changed files with 652 additions and 214 deletions
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -88,6 +88,9 @@ def test_models(
 # NOTE: Increasing this in this suite will fail CI because we currently cannot
 # reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
+# Due to low-precision numerical divergence, this test is too sensitive to
+# the async output processor, so it is run with that feature disabled.
+@pytest.mark.parametrize("disable_async_output_proc", [True])
 def test_models_with_fp8_kv_cache(
    vllm_runner,
    example_prompts,
@@ -97,6 +100,7 @@ def test_models_with_fp8_kv_cache(
    chunked_prefill_token_size: int,
    enforce_eager: bool,
    tensor_parallel_size: int,
+    disable_async_output_proc: bool,
 ) -> None:
    """
    Only checks log probs match between chunked-prefill and
@@ -126,6 +130,7 @@ def test_models_with_fp8_kv_cache(
            enforce_eager=enforce_eager,
            max_num_seqs=max_num_seqs,
            kv_cache_dtype=kv_cache_dtype,
+            disable_async_output_proc=disable_async_output_proc,
            **extra_kwargs,
    ) as vllm_model:
        no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
@@ -139,6 +144,7 @@ def test_models_with_fp8_kv_cache(
            enforce_eager=enforce_eager,
            max_num_seqs=max_num_seqs,
            kv_cache_dtype=kv_cache_dtype,
+            disable_async_output_proc=disable_async_output_proc,
            **extra_kwargs,
    ) as vllm_model:
        chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(