[Core] Deprecating block manager v1 and make block manager v2 default (#8704)

Remove block manager v1. This is the first piece of the prefix-caching-centric design: to get there, we simplify the code path so that only the v2 block manager (which has much higher prefix-caching performance) is used.
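Almost every test change below repeats the same pattern: the `use_v2_block_manager` pytest parametrization disappears (v2 is now the only block manager), the deprecation-check fixture goes away, and the matching `use_v2_block_manager` argument is dropped from SchedulerConfig. A condensed before/after sketch based on the test_simple hunk below (assuming, as in the rest of the test module, that SchedulerConfig and CacheConfig come from vllm.config):

    from vllm.config import CacheConfig, SchedulerConfig

    # Before: each scheduler test ran once per block manager version.
    #   @pytest.mark.parametrize('use_v2_block_manager', [True, False])
    #   def test_simple(use_v2_block_manager: bool):
    #       ... SchedulerConfig(..., use_v2_block_manager=use_v2_block_manager)

    # After: v2 is the only block manager, so the knob is gone.
    def test_simple():
        """Verify basic scheduling works."""
        block_size = 4
        num_seq_group = 4
        max_model_len = 16
        max_num_batched_tokens = 64
        scheduler_config = SchedulerConfig(max_num_batched_tokens,
                                           num_seq_group,
                                           max_model_len,
                                           enable_chunked_prefill=True)
        cache_config = CacheConfig(block_size, 1.0, 1, "auto")
        cache_config.num_cpu_blocks = 8
        cache_config.num_gpu_blocks = 8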
Author: Kuntai Du
Date: 2024-10-17 11:38:15 -05:00
Committed by: GitHub
Parent: 5eda21e773
Commit: 81ede99ca4
45 changed files with 206 additions and 2109 deletions

tests/core/test_chunked_prefill_scheduler.py

@@ -8,7 +8,6 @@ from vllm.core.interfaces import AllocStatus
 from vllm.core.scheduler import Scheduler
 from vllm.sequence import Logprob, SequenceGroup
-from ..utils import check_deprecated_block_manager_usage
 from .utils import create_dummy_prompt
@@ -28,25 +27,16 @@ def schedule_and_update_computed_tokens(scheduler):
     return metas, out
-@pytest.fixture(scope="module", autouse=True)
-def check_deprecated_block_manager():
-    check_deprecated_block_manager_usage(
-        'tests/core/test_chunked_prefill_scheduler.py')
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_simple(use_v2_block_manager: bool):
+def test_simple():
     """Verify basic scheduling works."""
     block_size = 4
     num_seq_group = 4
     max_model_len = 16
     max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig(
-        max_num_batched_tokens,
-        num_seq_group,
-        max_model_len,
-        enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    scheduler_config = SchedulerConfig(max_num_batched_tokens,
+                                       num_seq_group,
+                                       max_model_len,
+                                       enable_chunked_prefill=True)
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 8
     cache_config.num_gpu_blocks = 8
@@ -81,8 +71,7 @@ def test_simple(use_v2_block_manager: bool):
     assert len(seq_group_meta) == num_seq_group
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_chunk(use_v2_block_manager: bool):
+def test_chunk():
     """Verify prefills are chunked properly."""
     block_size = 4
     max_seqs = 60
@@ -93,7 +82,7 @@ def test_chunk(use_v2_block_manager: bool):
         max_seqs,
         max_model_len,
         enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 32
     cache_config.num_gpu_blocks = 32
@@ -131,8 +120,7 @@ def test_chunk(use_v2_block_manager: bool):
     assert out.num_batched_tokens == 57
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_complex(use_v2_block_manager: bool):
+def test_complex():
     block_size = 4
     max_seqs = 60
     max_model_len = 80
@@ -142,7 +130,7 @@ def test_complex(use_v2_block_manager: bool):
         max_seqs,
         max_model_len,
         enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 64
     cache_config.num_gpu_blocks = 64
@@ -201,8 +189,7 @@ def test_complex(use_v2_block_manager: bool):
     assert running[2].is_prefill()
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_maximal_decoding(use_v2_block_manager: bool):
+def test_maximal_decoding():
     """Verify decoding requests are prioritized."""
     block_size = 4
     max_seqs = 2
@@ -213,7 +200,7 @@ def test_maximal_decoding(use_v2_block_manager: bool):
         max_seqs,
         max_model_len,
         enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 8
     cache_config.num_gpu_blocks = 8
@@ -295,8 +282,7 @@ def test_maximal_decoding(use_v2_block_manager: bool):
     assert out.num_batched_tokens == 2
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_prompt_limit(use_v2_block_manager: bool):
+def test_prompt_limit():
     """Verify max_num_batched_tokens < max_model_len is possible."""
     block_size = 4
     max_seqs = 32
@@ -307,7 +293,7 @@ def test_prompt_limit(use_v2_block_manager: bool):
         max_seqs,
         max_model_len,
         enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 16
     cache_config.num_gpu_blocks = 16
@@ -330,8 +316,7 @@ def test_prompt_limit(use_v2_block_manager: bool):
     assert out.num_batched_tokens == 32
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_prompt_limit_exceed(use_v2_block_manager: bool):
+def test_prompt_limit_exceed():
     block_size = 4
     max_seqs = 64
     max_model_len = 32
@@ -356,8 +341,7 @@ def test_prompt_limit_exceed(use_v2_block_manager: bool):
     assert out.ignored_seq_groups[0] == seq_group
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_swap(use_v2_block_manager: bool):
+def test_swap():
     """Verify swapping works with chunked prefill requests"""
     block_size = 4
     max_seqs = 30
@@ -368,7 +352,7 @@ def test_swap(use_v2_block_manager: bool):
         max_seqs,
         max_model_len,
         enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 16
     cache_config.num_gpu_blocks = 16
@@ -414,8 +398,7 @@ def test_swap(use_v2_block_manager: bool):
     assert out.blocks_to_swap_out == []
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
+def test_running_prefill_prioritized_over_swap():
     block_size = 4
     max_seqs = 30
     max_model_len = 200
@@ -425,7 +408,7 @@ def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
         max_seqs,
         max_model_len,
         enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 32
     cache_config.num_gpu_blocks = 32
@@ -508,8 +491,7 @@ def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
     assert out.blocks_to_swap_out == []
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_chunked_prefill_preempt(use_v2_block_manager: bool):
+def test_chunked_prefill_preempt():
     """Verify preempt works with chunked prefill requests"""
     block_size = 4
     max_seqs = 30
@@ -520,7 +502,7 @@ def test_chunked_prefill_preempt(use_v2_block_manager: bool):
         max_seqs,
         max_model_len,
         enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 16
     cache_config.num_gpu_blocks = 16
@@ -575,8 +557,7 @@ def test_chunked_prefill_preempt(use_v2_block_manager: bool):
     assert out.num_batched_tokens == max_num_batched_tokens
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
+def test_chunked_prefill_max_seqs():
     block_size = 4
     max_seqs = 2
     max_model_len = 80
@@ -586,7 +567,7 @@ def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
         max_seqs,
         max_model_len,
         enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 128
     cache_config.num_gpu_blocks = 128
@@ -629,8 +610,7 @@ def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
     assert not running[1].is_prefill()
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_perfix_caching(use_v2_block_manager: bool):
+def test_perfix_caching():
     """Verify allocating full blocks when prefix caching is enabled."""
     block_size = 4
     max_seqs = 10
@@ -641,7 +621,7 @@ def test_perfix_caching(use_v2_block_manager: bool):
         max_seqs,
         max_model_len,
         enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
     cache_config = CacheConfig(block_size,
                                1.0,
                                1,