[Frontend][V1] Online serving performance improvements (#12287)

This commit is contained in:
Nick Hill
2025-01-22 14:22:12 -08:00
committed by GitHub
parent 7206ce4ce1
commit aea94362c9
7 changed files with 100 additions and 44 deletions

View File

@@ -64,6 +64,12 @@ class Request:
# recomputing.
self._kv_block_hashes: List[BlockHashType] = []
# Read-only views
# Prevent directly appending to these lists, since
# both should be updated simultaneously.
self.output_token_ids = ConstantList(self._output_token_ids)
self.all_token_ids = ConstantList(self._all_token_ids)
@classmethod
def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request":
return cls(
@@ -79,18 +85,6 @@ class Request:
lora_request=request.lora_request,
)
@property
def output_token_ids(self) -> ConstantList[int]:
# Read-only view of the output token ids: each access wraps
# self._output_token_ids in a new ConstantList.
# Prevent directly appending to the output_token_ids since
# all_token_ids should also be updated simultaneously.
return ConstantList(self._output_token_ids)
@property
def all_token_ids(self) -> ConstantList[int]:
# Read-only view of all token ids: each access wraps
# self._all_token_ids in a new ConstantList.
# Prevent directly appending to the all_token_ids since
# output_token_ids should also be updated simultaneously.
return ConstantList(self._all_token_ids)
def append_output_token_ids(
self,
token_ids: Union[int, List[int]],