[Core] Encoder separation for Encode-Prefill-Decode Disaggregation (#25233)

Signed-off-by: n00909098 <nguyen.kha.long@huawei.com>
Signed-off-by: knlnguyen1802 <knlnguyen1802@gmail.com>
Signed-off-by: herotai214 <herotai214@gmail.com>
Signed-off-by: Khuong Le <khuong.le.manh@huawei.com>
Signed-off-by: Khuong Le <lemanhkhuong2611@gmail.com>
Co-authored-by: n00909098 <nguyen.kha.long@huawei.com>
Co-authored-by: knlnguyen1802 <knlnguyen1802@gmail.com>
Co-authored-by: herotai214 <herotai214@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Khuong Le <khuong.le.manh@huawei.com>
Co-authored-by: Khuong Le <lemanhkhuong2611@gmail.com>
This commit is contained in:
Chenguang Zheng
2025-11-12 10:58:33 +08:00
committed by GitHub
parent cbb799e314
commit 4ccffe561f
31 changed files with 5026 additions and 42 deletions

View File

@@ -10,6 +10,14 @@ import pytest
from transformers import AutoTokenizer
from vllm import SamplingParams
from vllm.config import (
CacheConfig,
ECTransferConfig,
KVTransferConfig,
ModelConfig,
SchedulerConfig,
VllmConfig,
)
from vllm.engine.arg_utils import EngineArgs
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_default_torch_num_threads
@@ -450,3 +458,141 @@ def test_engine_core_invalid_request_id_type():
engine_core.add_request(*engine_core.preprocess_add_request(valid_request))
assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 0
@create_new_process_for_each_test()
@pytest.mark.parametrize(
    ("ec_role", "gpu_memory_utilization", "enable_prefix_caching"),
    [
        ("ec_producer", 0.01, False),
        # NOTE: ec_producer never allows prefix caching
        ("ec_consumer", 0.7, True),
        ("ec_consumer", 0.7, False),
    ],
)
@pytest.mark.parametrize("use_kv_connector", [False, True])
def test_encoder_instance_zero_kv_cache(
    ec_role: str,
    gpu_memory_utilization: float,
    enable_prefix_caching: bool,
    use_kv_connector: bool,
):
    """EPD (Encoder-Prefill-Decode) encoder-cache test.

    Under EPD disaggregation an encoder-only instance (EC producer role)
    runs just the vision encoder and performs no text generation, so it
    must come up with effectively zero KV cache. A consumer instance, by
    contrast, still generates text and needs a real KV cache. This test
    builds an EngineCore for each role and checks the resulting KV cache
    configuration and EC connector state.
    """
    # Assemble the VllmConfig pieces by hand rather than via EngineArgs
    # so each knob under test is explicit.
    sched_cfg = SchedulerConfig(
        max_num_seqs=10,
        max_num_batched_tokens=512,
        max_model_len=512,
        disable_hybrid_kv_cache_manager=True,
    )
    mdl_cfg = ModelConfig(
        model="llava-hf/llava-1.5-7b-hf",  # Multimodal model
        enforce_eager=True,
        trust_remote_code=True,
        dtype="float16",
        seed=42,
    )
    cache_cfg = CacheConfig(
        block_size=16,
        gpu_memory_utilization=gpu_memory_utilization,
        swap_space=0,
        cache_dtype="auto",
        enable_prefix_caching=enable_prefix_caching,
    )
    # Optionally layer a KV connector on top of the EC connector to make
    # sure the two transfer paths coexist.
    kv_transfer_cfg = None
    if use_kv_connector:
        kv_transfer_cfg = KVTransferConfig(
            kv_connector="SharedStorageConnector",
            kv_role="kv_both",
            kv_connector_extra_config={"shared_storage_path": "local_storage"},
        )
    ec_transfer_cfg = ECTransferConfig(
        ec_connector="ECSharedStorageConnector",
        ec_role=ec_role,
        ec_connector_extra_config={"shared_storage_path": "/tmp/ec_test_encoder"},
    )
    vllm_config = VllmConfig(
        model_config=mdl_cfg,
        cache_config=cache_cfg,
        scheduler_config=sched_cfg,
        kv_transfer_config=kv_transfer_cfg,
        ec_transfer_config=ec_transfer_cfg,
    )

    executor_class = Executor.get_class(vllm_config)
    print(f"executor_class: {executor_class}")

    with set_default_torch_num_threads(1):
        engine_core = EngineCore(
            vllm_config=vllm_config, executor_class=executor_class, log_stats=True
        )

    # The encoder cache manager must exist regardless of role.
    assert engine_core.scheduler.encoder_cache_manager is not None, (
        "encoder_cache_manager should exist"
    )

    if ec_role == "ec_producer":
        kv_cache_config = engine_core.scheduler.kv_cache_manager.kv_cache_config
        print(f"kv_cache_config: {kv_cache_config}")
        # Check 1: num_blocks should be 0
        # NOTE: num_blocks=1 as BlockPool always needs a null_block.
        assert kv_cache_config.num_blocks == 1, (
            f"ec_producer should only have 1 KV blocks, "
            f"got {kv_cache_config.num_blocks}"
        )
        # Check 2: kv_cache_groups should be empty
        assert len(kv_cache_config.kv_cache_groups) == 0, (
            f"ec_producer should have 0 KV cache groups, "
            f"got {len(kv_cache_config.kv_cache_groups)}"
        )
        # Check 3: kv_cache_tensors should be empty
        assert len(kv_cache_config.kv_cache_tensors) == 0, (
            f"Encoder instance should have 0 KV cache tensors, "
            f"got {len(kv_cache_config.kv_cache_tensors)}"
        )
        # Check 4: Verify EC connector is initialized and is producer
        assert engine_core.scheduler.ec_connector is not None, (
            "Encoder instance should have EC connector"
        )
        assert engine_core.scheduler.ec_connector.is_producer, (
            "Encoder instance EC connector should be producer"
        )
        # Check 5: Verify chunked prefill is disabled
        assert not vllm_config.scheduler_config.chunked_prefill_enabled, (
            "Encoder instance should disable chunked prefill (no KV cache)"
        )
    elif ec_role == "ec_consumer":
        kv_cache_config = engine_core.scheduler.kv_cache_manager.kv_cache_config
        print(f"kv_cache_config: {kv_cache_config}")
        # Check 1: num_blocks should be > 1
        assert kv_cache_config.num_blocks > 1, (
            f"ec_consumer should have >1 KV blocks, got {kv_cache_config.num_blocks}"
        )
        # Check 2: kv_cache_groups should NOT be empty
        assert len(kv_cache_config.kv_cache_groups) > 0, (
            f"ec_consumer should have KV cache groups, "
            f"got {len(kv_cache_config.kv_cache_groups)}"
        )
        # Check 3: Verify EC connector is consumer
        assert engine_core.scheduler.ec_connector is not None, (
            "Consumer instance should have EC connector"
        )
        assert not engine_core.scheduler.ec_connector.is_producer, (
            "Consumer instance EC connector should be consumer"
        )