[Docs] Improve V1 KVConnector interface documentation (#19172)
Signed-off-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
@@ -8,9 +8,15 @@ The class provides the following primitives:
|
|||||||
Scheduler-side: runs in the scheduler, binds metadata, which
|
Scheduler-side: runs in the scheduler, binds metadata, which
|
||||||
is used by the worker-side to load/save KV cache.
|
is used by the worker-side to load/save KV cache.
|
||||||
get_num_new_matched_tokens() - get number of new tokens
|
get_num_new_matched_tokens() - get number of new tokens
|
||||||
that exist in the remote KV cache
|
that exist in the remote KV cache. Might be called multiple
|
||||||
|
times for a given request and should be side-effect free.
|
||||||
update_state_after_alloc() - update KVConnector state after
|
update_state_after_alloc() - update KVConnector state after
|
||||||
temporary buffer alloc by the CacheManager.
|
temporary buffer alloc by the CacheManager.
|
||||||
|
request_finished() - called when a request is finished, with
|
||||||
|
the computed kv cache blocks for the request.
|
||||||
|
Returns whether KV cache should be freed now or will be
|
||||||
|
freed asynchronously and optionally returns KV transfer
|
||||||
|
params.
|
||||||
|
|
||||||
Worker-side: runs in each worker, loads/saves KV cache to/from
|
Worker-side: runs in each worker, loads/saves KV cache to/from
|
||||||
the Connector based on the metadata.
|
the Connector based on the metadata.
|
||||||
@@ -19,6 +25,9 @@ The class provides the following primitives:
|
|||||||
|
|
||||||
save_kv_layer() - starts saving KV for layer i (maybe async)
|
save_kv_layer() - starts saving KV for layer i (maybe async)
|
||||||
wait_for_save() - blocks until all saves are done
|
wait_for_save() - blocks until all saves are done
|
||||||
|
|
||||||
|
get_finished() - called with ids of finished requests, returns
|
||||||
|
ids of requests that have completed async sending/recving.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import enum
|
import enum
|
||||||
@@ -184,7 +193,8 @@ class KVConnectorBase_V1(ABC):
|
|||||||
finished generating tokens.
|
finished generating tokens.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
ids of requests that have finished asynchronous transfer,
|
ids of requests that have finished asynchronous transfer
|
||||||
|
(requests that previously returned True from request_finished()),
|
||||||
tuple of (sending/saving ids, recving/loading ids).
|
tuple of (sending/saving ids, recving/loading ids).
|
||||||
The finished saves/sends req ids must belong to a set provided in a
|
The finished saves/sends req ids must belong to a set provided in a
|
||||||
call to this method (this call or a prior one).
|
call to this method (this call or a prior one).
|
||||||
@@ -215,7 +225,8 @@ class KVConnectorBase_V1(ABC):
|
|||||||
- The number of tokens that can be loaded from the
|
- The number of tokens that can be loaded from the
|
||||||
external KV cache beyond what is already computed.
|
external KV cache beyond what is already computed.
|
||||||
- `True` if external KV cache tokens will be loaded
|
- `True` if external KV cache tokens will be loaded
|
||||||
asynchronously (between scheduler steps).
|
asynchronously (between scheduler steps). Must be
|
||||||
|
'False' if the first element is 0.
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -225,6 +236,18 @@ class KVConnectorBase_V1(ABC):
|
|||||||
num_external_tokens: int):
|
num_external_tokens: int):
|
||||||
"""
|
"""
|
||||||
Update KVConnector state after block allocation.
|
Update KVConnector state after block allocation.
|
||||||
|
|
||||||
|
If get_num_new_matched_tokens previously returned True for a
|
||||||
|
request, this function may be called twice for that same request -
|
||||||
|
first when blocks are allocated for the connector tokens to be
|
||||||
|
asynchronously loaded into, and second when any additional blocks
|
||||||
|
are allocated, after the load/transfer is complete.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
request (Request): the request object.
|
||||||
|
blocks (KVCacheBlocks): the blocks allocated for the request.
|
||||||
|
num_external_tokens (int): the number of tokens that will be
|
||||||
|
loaded from the external KV cache.
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|||||||
@@ -101,7 +101,7 @@ class Scheduler(SchedulerInterface):
|
|||||||
# This is flushed at the end of each scheduling step.
|
# This is flushed at the end of each scheduling step.
|
||||||
self.finished_req_ids: set[str] = set()
|
self.finished_req_ids: set[str] = set()
|
||||||
|
|
||||||
# P/D: requests in process of recving KV transfers
|
# KV Connector: requests in process of async KV loading or recving
|
||||||
self.finished_recving_kv_req_ids: set[str] = set()
|
self.finished_recving_kv_req_ids: set[str] = set()
|
||||||
|
|
||||||
# OPTIMIZATION: Cache the CachedRequestData objects to avoid creating
|
# OPTIMIZATION: Cache the CachedRequestData objects to avoid creating
|
||||||
@@ -822,7 +822,7 @@ class Scheduler(SchedulerInterface):
|
|||||||
if not stopped:
|
if not stopped:
|
||||||
new_running.append(request)
|
new_running.append(request)
|
||||||
|
|
||||||
# P/D: update state for finished KV Transfers.
|
# KV Connector: update state for finished KV Transfers.
|
||||||
self._update_from_kv_xfer_finished(model_runner_output)
|
self._update_from_kv_xfer_finished(model_runner_output)
|
||||||
|
|
||||||
# Return the cached request data to the queue so they can be reused.
|
# Return the cached request data to the queue so they can be reused.
|
||||||
@@ -969,7 +969,7 @@ class Scheduler(SchedulerInterface):
|
|||||||
self.kv_event_publisher.shutdown()
|
self.kv_event_publisher.shutdown()
|
||||||
|
|
||||||
########################################################################
|
########################################################################
|
||||||
# P/D Related Methods
|
# KV Connector Related Methods
|
||||||
########################################################################
|
########################################################################
|
||||||
|
|
||||||
def get_kv_connector(self) -> Optional[KVConnectorBase_V1]:
|
def get_kv_connector(self) -> Optional[KVConnectorBase_V1]:
|
||||||
@@ -992,7 +992,7 @@ class Scheduler(SchedulerInterface):
|
|||||||
|
|
||||||
def _update_waiting_for_remote_kv(self, request: Request) -> bool:
|
def _update_waiting_for_remote_kv(self, request: Request) -> bool:
|
||||||
"""
|
"""
|
||||||
P/D: check if the request_id is finished_recving.
|
KV Connector: check if the request_id is finished_recving.
|
||||||
|
|
||||||
The finished_recving_kv_req_ids list is populated
|
The finished_recving_kv_req_ids list is populated
|
||||||
on the previous steps()'s update_from_output based
|
on the previous steps()'s update_from_output based
|
||||||
@@ -1029,7 +1029,7 @@ class Scheduler(SchedulerInterface):
|
|||||||
def _update_from_kv_xfer_finished(self,
|
def _update_from_kv_xfer_finished(self,
|
||||||
model_runner_output: ModelRunnerOutput):
|
model_runner_output: ModelRunnerOutput):
|
||||||
"""
|
"""
|
||||||
P/D: update the scheduler state based on the output.
|
KV Connector: update the scheduler state based on the output.
|
||||||
|
|
||||||
The Worker side connectors add finished_recving and
|
The Worker side connectors add finished_recving and
|
||||||
finished_sending reqs to the output.
|
finished_sending reqs to the output.
|
||||||
@@ -1037,7 +1037,7 @@ class Scheduler(SchedulerInterface):
|
|||||||
# if finished_recving: add to state so we can
|
# if finished_recving: add to state so we can
|
||||||
scheduler the request during the next step.
|
scheduler the request during the next step.
|
||||||
"""
|
"""
|
||||||
# P/D: update recv and send status from last step.
|
# KV Connector:: update recv and send status from last step.
|
||||||
for req_id in (model_runner_output.finished_recving or ()):
|
for req_id in (model_runner_output.finished_recving or ()):
|
||||||
logger.debug("Finished recving KV transfer for request %s", req_id)
|
logger.debug("Finished recving KV transfer for request %s", req_id)
|
||||||
self.finished_recving_kv_req_ids.add(req_id)
|
self.finished_recving_kv_req_ids.add(req_id)
|
||||||
|
|||||||
Reference in New Issue
Block a user