[Core] Deprecating block manager v1 and making block manager v2 the default (#8704)

Removing block manager v1. This is the initial piece of the prefix-caching-centric design. In order to achieve it, we need to simplify the code path so that we only use the v2 block manager (which has much higher performance on prefix caching).
This commit is contained in:
Kuntai Du
2024-10-17 11:38:15 -05:00
committed by GitHub
parent 5eda21e773
commit 81ede99ca4
45 changed files with 206 additions and 2109 deletions

View File

@@ -12,7 +12,7 @@ from contextlib import nullcontext
import pytest
from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import check_deprecated_block_manager_usage, multi_gpu_test
from ..utils import multi_gpu_test
MODELS = [
"facebook/opt-125m",
@@ -20,12 +20,6 @@ MODELS = [
]
@pytest.fixture(scope="module", autouse=True)
def check_deprecated_block_manager():
check_deprecated_block_manager_usage(
'tests/basic_correctness/test_chunked_prefill.py')
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@@ -197,7 +191,6 @@ def test_models_with_fp8_kv_cache(
@pytest.mark.parametrize("max_tokens", [16])
@pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("chunk_size", [30, 32])
@pytest.mark.parametrize("use_v2_block_manager", [False, True])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
@@ -206,7 +199,6 @@ def test_with_prefix_caching(
max_tokens: int,
enforce_eager: bool,
chunk_size: int,
use_v2_block_manager: bool,
tensor_parallel_size: int,
) -> None:
"""
@@ -234,7 +226,6 @@ def test_with_prefix_caching(
enable_chunked_prefill=True,
enable_prefix_caching=enable,
tensor_parallel_size=tensor_parallel_size,
use_v2_block_manager=use_v2_block_manager,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
) as vllm_model: