kv_transfer: Rename the shared storage connectors (#30201)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
This commit is contained in:
@@ -10,7 +10,7 @@ It demonstrates vLLM's ability to recover from KV load failures in both synchron
|
||||
- `decode_example.py` – performs the decode stage. Accepts:
|
||||
- `--simulate-failure`: simulates KV load failure using a custom connector.
|
||||
- `--async-load`: enables asynchronous KV loading mode.
|
||||
- `rogue_shared_storage_connector.py` – defines `RogueSharedStorageConnector`, a subclass of `SharedStorageConnector`, that simulates missing or corrupted external KV blocks by failing to load blocks for the first decode request.
|
||||
- `load_recovery_example_connector.py` – defines `LoadRecoveryExampleConnector`, a subclass of `ExampleConnector`, that simulates missing or corrupted external KV blocks by failing to load blocks for the first decode request.
|
||||
- `run.sh` – orchestrates the test: runs the prefill stage, then three decode stages:
|
||||
1. Normal decode (baseline).
|
||||
2. Decode with simulated sync KV load failure.
|
||||
@@ -20,7 +20,7 @@ It demonstrates vLLM's ability to recover from KV load failures in both synchron
|
||||
|
||||
## How It Works
|
||||
|
||||
- The test dynamically loads `RogueSharedStorageConnector` via `KVTransferConfig.kv_connector_module_path`, enabling controlled simulation of load failures without modifying the original connector.
|
||||
- The test dynamically loads `LoadRecoveryExampleConnector` via `KVTransferConfig.kv_connector_module_path`, enabling controlled simulation of load failures without modifying the original connector.
|
||||
- The decode stages that simulate failure are expected to trigger recovery logic in vLLM, resulting in the same output as the baseline decode.
|
||||
- If recovery fails, the script prints a unified diff of the output mismatch and exits with error.
|
||||
|
||||
|
||||
@@ -35,13 +35,13 @@ def main():
|
||||
|
||||
if args.simulate_failure:
|
||||
ktc = KVTransferConfig(
|
||||
kv_connector="RogueSharedStorageConnector",
|
||||
kv_connector="LoadRecoveryExampleConnector",
|
||||
kv_role="kv_both",
|
||||
kv_connector_extra_config={
|
||||
"shared_storage_path": "local_storage",
|
||||
"async_load": args.async_load,
|
||||
},
|
||||
kv_connector_module_path="rogue_shared_storage_connector",
|
||||
kv_connector_module_path="load_recovery_example_connector",
|
||||
)
|
||||
out_file = (
|
||||
"async_decode_recovered_output.txt"
|
||||
@@ -50,7 +50,7 @@ def main():
|
||||
)
|
||||
else:
|
||||
ktc = KVTransferConfig(
|
||||
kv_connector="SharedStorageConnector",
|
||||
kv_connector="ExampleConnector",
|
||||
kv_role="kv_both",
|
||||
kv_connector_extra_config={
|
||||
"shared_storage_path": "local_storage",
|
||||
|
||||
@@ -10,9 +10,9 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
|
||||
KVConnectorMetadata,
|
||||
KVConnectorRole,
|
||||
)
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import (
|
||||
SharedStorageConnector,
|
||||
SharedStorageConnectorMetadata,
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.example_connector import (
|
||||
ExampleConnector,
|
||||
ExampleConnectorMetadata,
|
||||
)
|
||||
from vllm.forward_context import ForwardContext
|
||||
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
|
||||
@@ -26,15 +26,15 @@ logging.basicConfig(level=logging.INFO)
|
||||
|
||||
|
||||
@dataclass
|
||||
class RogueSharedStorageConnectorMetadata(SharedStorageConnectorMetadata):
|
||||
class LoadRecoveryExampleConnectorMetadata(ExampleConnectorMetadata):
|
||||
req_to_block_ids: dict[str, set[int]] = field(default_factory=dict)
|
||||
|
||||
@classmethod
|
||||
def from_base(cls, base: SharedStorageConnectorMetadata):
|
||||
def from_base(cls, base: ExampleConnectorMetadata):
|
||||
return cls(requests=base.requests)
|
||||
|
||||
|
||||
class RogueSharedStorageConnector(SharedStorageConnector):
|
||||
class LoadRecoveryExampleConnector(ExampleConnector):
|
||||
def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
|
||||
super().__init__(vllm_config=vllm_config, role=role)
|
||||
self._async_load = vllm_config.kv_transfer_config.get_from_extra_config(
|
||||
@@ -45,7 +45,7 @@ class RogueSharedStorageConnector(SharedStorageConnector):
|
||||
self._req_to_block_ids: dict[str, list[int]] = dict()
|
||||
|
||||
def bind_connector_metadata(self, connector_metadata: KVConnectorMetadata) -> None:
|
||||
assert isinstance(connector_metadata, RogueSharedStorageConnectorMetadata)
|
||||
assert isinstance(connector_metadata, LoadRecoveryExampleConnectorMetadata)
|
||||
index, failed_request = next(
|
||||
(
|
||||
(i, x)
|
||||
@@ -84,7 +84,7 @@ class RogueSharedStorageConnector(SharedStorageConnector):
|
||||
) -> tuple[set[str] | None, set[str] | None]:
|
||||
if self._async_load:
|
||||
meta = self._get_connector_metadata()
|
||||
assert isinstance(meta, RogueSharedStorageConnectorMetadata)
|
||||
assert isinstance(meta, LoadRecoveryExampleConnectorMetadata)
|
||||
if meta.req_to_block_ids:
|
||||
return None, set(meta.req_to_block_ids)
|
||||
|
||||
@@ -126,9 +126,9 @@ class RogueSharedStorageConnector(SharedStorageConnector):
|
||||
) -> KVConnectorMetadata:
|
||||
if not self._async_load:
|
||||
base = super().build_connector_meta(scheduler_output)
|
||||
meta = RogueSharedStorageConnectorMetadata.from_base(base)
|
||||
meta = LoadRecoveryExampleConnectorMetadata.from_base(base)
|
||||
else:
|
||||
meta = RogueSharedStorageConnectorMetadata()
|
||||
meta = LoadRecoveryExampleConnectorMetadata()
|
||||
if self._requests_need_load:
|
||||
for req_id, request in self._requests_need_load.items():
|
||||
meta.add_request(
|
||||
@@ -26,7 +26,7 @@ def main():
|
||||
enforce_eager=True,
|
||||
gpu_memory_utilization=0.8,
|
||||
kv_transfer_config=KVTransferConfig(
|
||||
kv_connector="SharedStorageConnector",
|
||||
kv_connector="ExampleConnector",
|
||||
kv_role="kv_both",
|
||||
kv_connector_extra_config={"shared_storage_path": "local_storage"},
|
||||
),
|
||||
|
||||
Reference in New Issue
Block a user