[V1] Prefix caching for vision language models (#11187)

Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
Author: Cody Yu
Date: 2024-12-17 16:37:59 -08:00
Committed by: GitHub
Parent: c77eb8a33c
Commit: bf8717ebae

14 changed files with 341 additions and 97 deletions

@@ -60,9 +60,13 @@ class AsyncLLM(EngineClient):
         self.client_aborted_requests: List[str] = []

         # Processor (converts Inputs --> EngineCoreRequests).
-        self.processor = Processor(vllm_config.model_config,
-                                   vllm_config.lora_config, self.tokenizer,
-                                   input_registry)
+        self.processor = Processor(
+            model_config=vllm_config.model_config,
+            cache_config=vllm_config.cache_config,
+            lora_config=vllm_config.lora_config,
+            tokenizer=self.tokenizer,
+            input_registry=input_registry,
+        )

         # Detokenizer (converts EngineCoreOutputs --> RequestOutput).
         self.detokenizer = Detokenizer(
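
The change threads cache_config into the Processor, presumably so it can account for prefix caching when handling multimodal inputs. As a rough illustration of why that matters, the sketch below shows the general idea behind prefix caching for vision language models: a KV-cache block key has to cover the image content as well as the token IDs, since two prompts with identical placeholder tokens but different images must not share cached blocks. This is a hypothetical sketch, not vLLM's actual implementation; mm_content_hash and block_hash are made-up names.

# Hypothetical sketch, not vLLM code: why prefix-cache keys for VLMs must
# include multimodal content, not just token IDs.
import hashlib
from typing import Iterable, Optional, Sequence


def mm_content_hash(image_bytes: bytes) -> str:
    # Content hash of one multimodal input (e.g. raw image bytes).
    return hashlib.sha256(image_bytes).hexdigest()


def block_hash(parent_hash: Optional[str],
               token_ids: Iterable[int],
               mm_hashes: Sequence[str] = ()) -> str:
    # A KV-cache block's key chains the parent block's hash with this block's
    # token IDs and the hashes of any images whose placeholders fall in it.
    h = hashlib.sha256()
    if parent_hash is not None:
        h.update(parent_hash.encode())
    h.update(repr(tuple(token_ids)).encode())
    for mm in mm_hashes:
        h.update(mm.encode())
    return h.hexdigest()


# Same placeholder tokens, different images => different block keys, so the
# second request cannot wrongly reuse the first request's cached KV blocks.
img_a, img_b = b"...cat-image-bytes...", b"...dog-image-bytes..."
tokens = [101, 32000, 32000, 2009]  # 32000 = image placeholder (made up)
assert block_hash(None, tokens, (mm_content_hash(img_a),)) != \
       block_hash(None, tokens, (mm_content_hash(img_b),))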