[Core] Deprecating block manager v1 and making block manager v2 the default (#8704)

Removing block manager v1. This is the initial piece of the prefix-caching-centric design. In order to achieve it, we need to simplify the code path so that we only use the v2 block manager (which has much higher performance on prefix caching).
This commit is contained in:
Kuntai Du
2024-10-17 11:38:15 -05:00
committed by GitHub
parent 5eda21e773
commit 81ede99ca4
45 changed files with 206 additions and 2109 deletions

View File

@@ -12,7 +12,7 @@ from contextlib import nullcontext
import pytest
from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import check_deprecated_block_manager_usage, multi_gpu_test
from ..utils import multi_gpu_test
MODELS = [
"facebook/opt-125m",
@@ -20,12 +20,6 @@ MODELS = [
]
@pytest.fixture(scope="module", autouse=True)
def check_deprecated_block_manager():
check_deprecated_block_manager_usage(
'tests/basic_correctness/test_chunked_prefill.py')
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@@ -197,7 +191,6 @@ def test_models_with_fp8_kv_cache(
@pytest.mark.parametrize("max_tokens", [16])
@pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("chunk_size", [30, 32])
@pytest.mark.parametrize("use_v2_block_manager", [False, True])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
@@ -206,7 +199,6 @@ def test_with_prefix_caching(
max_tokens: int,
enforce_eager: bool,
chunk_size: int,
use_v2_block_manager: bool,
tensor_parallel_size: int,
) -> None:
"""
@@ -234,7 +226,6 @@ def test_with_prefix_caching(
enable_chunked_prefill=True,
enable_prefix_caching=enable,
tensor_parallel_size=tensor_parallel_size,
use_v2_block_manager=use_v2_block_manager,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
) as vllm_model: