[Frontend][V1] Online serving performance improvements (#12287)

2025-01-22 14:22:12 -08:00
parent 7206ce4ce1
commit aea94362c9
7 changed files with 100 additions and 44 deletions
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -64,6 +64,12 @@ class Request:
        # recomputing.
        self._kv_block_hashes: List[BlockHashType] = []

+        # Read-only views
+        # Prevent directly appending to these lists since
+        # they should be updated simultaneously.
+        self.output_token_ids = ConstantList(self._output_token_ids)
+        self.all_token_ids = ConstantList(self._all_token_ids)
+
    @classmethod
    def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request":
        return cls(
@@ -79,18 +85,6 @@ class Request:
            lora_request=request.lora_request,
        )

-    @property
-    def output_token_ids(self) -> ConstantList[int]:
-        # Prevent directly appending to the output_token_ids since
-        # all_token_ids should also be updated simultaneously.
-        return ConstantList(self._output_token_ids)
-
-    @property
-    def all_token_ids(self) -> ConstantList[int]:
-        # Prevent directly appending to the all_token_ids since
-        # output_token_ids should also be updated simultaneously
-        return ConstantList(self._all_token_ids)
-
    def append_output_token_ids(
        self,
        token_ids: Union[int, List[int]],