[Core] Deprecating block manager v1 and make block manager v2 default (#8704)
This commit removes block manager v1. It is the first step toward a prefix-caching-centric design: to achieve that design, we need to simplify the code path so that only the v2 block manager is used (it has much higher performance on prefix caching).
This commit is contained in:
@@ -8,7 +8,6 @@ from vllm.core.interfaces import AllocStatus
|
||||
from vllm.core.scheduler import Scheduler
|
||||
from vllm.sequence import Logprob, SequenceGroup
|
||||
|
||||
from ..utils import check_deprecated_block_manager_usage
|
||||
from .utils import create_dummy_prompt
|
||||
|
||||
|
||||
@@ -28,25 +27,16 @@ def schedule_and_update_computed_tokens(scheduler):
|
||||
return metas, out
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", autouse=True)
|
||||
def check_deprecated_block_manager():
|
||||
check_deprecated_block_manager_usage(
|
||||
'tests/core/test_chunked_prefill_scheduler.py')
|
||||
|
||||
|
||||
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
|
||||
def test_simple(use_v2_block_manager: bool):
|
||||
def test_simple():
|
||||
"""Verify basic scheduling works."""
|
||||
block_size = 4
|
||||
num_seq_group = 4
|
||||
max_model_len = 16
|
||||
max_num_batched_tokens = 64
|
||||
scheduler_config = SchedulerConfig(
|
||||
max_num_batched_tokens,
|
||||
num_seq_group,
|
||||
max_model_len,
|
||||
enable_chunked_prefill=True,
|
||||
use_v2_block_manager=use_v2_block_manager)
|
||||
scheduler_config = SchedulerConfig(max_num_batched_tokens,
|
||||
num_seq_group,
|
||||
max_model_len,
|
||||
enable_chunked_prefill=True)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 8
|
||||
cache_config.num_gpu_blocks = 8
|
||||
@@ -81,8 +71,7 @@ def test_simple(use_v2_block_manager: bool):
|
||||
assert len(seq_group_meta) == num_seq_group
|
||||
|
||||
|
||||
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
|
||||
def test_chunk(use_v2_block_manager: bool):
|
||||
def test_chunk():
|
||||
"""Verify prefills are chunked properly."""
|
||||
block_size = 4
|
||||
max_seqs = 60
|
||||
@@ -93,7 +82,7 @@ def test_chunk(use_v2_block_manager: bool):
|
||||
max_seqs,
|
||||
max_model_len,
|
||||
enable_chunked_prefill=True,
|
||||
use_v2_block_manager=use_v2_block_manager)
|
||||
)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 32
|
||||
cache_config.num_gpu_blocks = 32
|
||||
@@ -131,8 +120,7 @@ def test_chunk(use_v2_block_manager: bool):
|
||||
assert out.num_batched_tokens == 57
|
||||
|
||||
|
||||
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
|
||||
def test_complex(use_v2_block_manager: bool):
|
||||
def test_complex():
|
||||
block_size = 4
|
||||
max_seqs = 60
|
||||
max_model_len = 80
|
||||
@@ -142,7 +130,7 @@ def test_complex(use_v2_block_manager: bool):
|
||||
max_seqs,
|
||||
max_model_len,
|
||||
enable_chunked_prefill=True,
|
||||
use_v2_block_manager=use_v2_block_manager)
|
||||
)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 64
|
||||
cache_config.num_gpu_blocks = 64
|
||||
@@ -201,8 +189,7 @@ def test_complex(use_v2_block_manager: bool):
|
||||
assert running[2].is_prefill()
|
||||
|
||||
|
||||
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
|
||||
def test_maximal_decoding(use_v2_block_manager: bool):
|
||||
def test_maximal_decoding():
|
||||
"""Verify decoding requests are prioritized."""
|
||||
block_size = 4
|
||||
max_seqs = 2
|
||||
@@ -213,7 +200,7 @@ def test_maximal_decoding(use_v2_block_manager: bool):
|
||||
max_seqs,
|
||||
max_model_len,
|
||||
enable_chunked_prefill=True,
|
||||
use_v2_block_manager=use_v2_block_manager)
|
||||
)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 8
|
||||
cache_config.num_gpu_blocks = 8
|
||||
@@ -295,8 +282,7 @@ def test_maximal_decoding(use_v2_block_manager: bool):
|
||||
assert out.num_batched_tokens == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
|
||||
def test_prompt_limit(use_v2_block_manager: bool):
|
||||
def test_prompt_limit():
|
||||
"""Verify max_num_batched_tokens < max_model_len is possible."""
|
||||
block_size = 4
|
||||
max_seqs = 32
|
||||
@@ -307,7 +293,7 @@ def test_prompt_limit(use_v2_block_manager: bool):
|
||||
max_seqs,
|
||||
max_model_len,
|
||||
enable_chunked_prefill=True,
|
||||
use_v2_block_manager=use_v2_block_manager)
|
||||
)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 16
|
||||
cache_config.num_gpu_blocks = 16
|
||||
@@ -330,8 +316,7 @@ def test_prompt_limit(use_v2_block_manager: bool):
|
||||
assert out.num_batched_tokens == 32
|
||||
|
||||
|
||||
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
|
||||
def test_prompt_limit_exceed(use_v2_block_manager: bool):
|
||||
def test_prompt_limit_exceed():
|
||||
block_size = 4
|
||||
max_seqs = 64
|
||||
max_model_len = 32
|
||||
@@ -356,8 +341,7 @@ def test_prompt_limit_exceed(use_v2_block_manager: bool):
|
||||
assert out.ignored_seq_groups[0] == seq_group
|
||||
|
||||
|
||||
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
|
||||
def test_swap(use_v2_block_manager: bool):
|
||||
def test_swap():
|
||||
"""Verify swapping works with chunked prefill requests"""
|
||||
block_size = 4
|
||||
max_seqs = 30
|
||||
@@ -368,7 +352,7 @@ def test_swap(use_v2_block_manager: bool):
|
||||
max_seqs,
|
||||
max_model_len,
|
||||
enable_chunked_prefill=True,
|
||||
use_v2_block_manager=use_v2_block_manager)
|
||||
)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 16
|
||||
cache_config.num_gpu_blocks = 16
|
||||
@@ -414,8 +398,7 @@ def test_swap(use_v2_block_manager: bool):
|
||||
assert out.blocks_to_swap_out == []
|
||||
|
||||
|
||||
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
|
||||
def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
|
||||
def test_running_prefill_prioritized_over_swap():
|
||||
block_size = 4
|
||||
max_seqs = 30
|
||||
max_model_len = 200
|
||||
@@ -425,7 +408,7 @@ def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
|
||||
max_seqs,
|
||||
max_model_len,
|
||||
enable_chunked_prefill=True,
|
||||
use_v2_block_manager=use_v2_block_manager)
|
||||
)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 32
|
||||
cache_config.num_gpu_blocks = 32
|
||||
@@ -508,8 +491,7 @@ def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
|
||||
assert out.blocks_to_swap_out == []
|
||||
|
||||
|
||||
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
|
||||
def test_chunked_prefill_preempt(use_v2_block_manager: bool):
|
||||
def test_chunked_prefill_preempt():
|
||||
"""Verify preempt works with chunked prefill requests"""
|
||||
block_size = 4
|
||||
max_seqs = 30
|
||||
@@ -520,7 +502,7 @@ def test_chunked_prefill_preempt(use_v2_block_manager: bool):
|
||||
max_seqs,
|
||||
max_model_len,
|
||||
enable_chunked_prefill=True,
|
||||
use_v2_block_manager=use_v2_block_manager)
|
||||
)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 16
|
||||
cache_config.num_gpu_blocks = 16
|
||||
@@ -575,8 +557,7 @@ def test_chunked_prefill_preempt(use_v2_block_manager: bool):
|
||||
assert out.num_batched_tokens == max_num_batched_tokens
|
||||
|
||||
|
||||
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
|
||||
def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
|
||||
def test_chunked_prefill_max_seqs():
|
||||
block_size = 4
|
||||
max_seqs = 2
|
||||
max_model_len = 80
|
||||
@@ -586,7 +567,7 @@ def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
|
||||
max_seqs,
|
||||
max_model_len,
|
||||
enable_chunked_prefill=True,
|
||||
use_v2_block_manager=use_v2_block_manager)
|
||||
)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 128
|
||||
cache_config.num_gpu_blocks = 128
|
||||
@@ -629,8 +610,7 @@ def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
|
||||
assert not running[1].is_prefill()
|
||||
|
||||
|
||||
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
|
||||
def test_perfix_caching(use_v2_block_manager: bool):
|
||||
def test_perfix_caching():
|
||||
"""Verify allocating full blocks when prefix caching is enabled."""
|
||||
block_size = 4
|
||||
max_seqs = 10
|
||||
@@ -641,7 +621,7 @@ def test_perfix_caching(use_v2_block_manager: bool):
|
||||
max_seqs,
|
||||
max_model_len,
|
||||
enable_chunked_prefill=True,
|
||||
use_v2_block_manager=use_v2_block_manager)
|
||||
)
|
||||
cache_config = CacheConfig(block_size,
|
||||
1.0,
|
||||
1,
|
||||
|
||||
Reference in New Issue
Block a user