Support FP8-E5M2 KV Cache (#2279)

Co-authored-by: zhaoyang <zhao.yang16@zte.com.cn>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
Author: zhaoyang-star
Date: 2024-01-29 08:43:54 +08:00
Committed by: GitHub
Parent: 7d648418b8
Commit: 9090bf02e7
26 changed files with 912 additions and 196 deletions

@@ -85,6 +85,7 @@ class LLMEngine:
             f"disable_custom_all_reduce={parallel_config.disable_custom_all_reduce}, "
             f"quantization={model_config.quantization}, "
             f"enforce_eager={model_config.enforce_eager}, "
+            f"kv_cache_dtype={cache_config.cache_dtype}, "
             f"seed={model_config.seed})")
         # TODO(woosuk): Print more configs in debug mode.
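With this change the engine's startup banner reports the KV cache dtype alongside the other configs. As a hedged usage sketch (not part of this diff), the new option can be exercised through the engine arguments; the "fp8_e5m2" value is what this PR introduces, "auto" keeps the model's own dtype, and the model name below is only an example.

from vllm import LLM, SamplingParams

# Usage sketch: request the FP8-E5M2 KV cache instead of the default.
# The startup log should then show kv_cache_dtype=fp8_e5m2 among the
# engine configs printed above.
llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8_e5m2")
out = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
print(out[0].outputs[0].text)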
@@ -144,6 +145,7 @@ class LLMEngine:
             rank=0,
             distributed_init_method=distributed_init_method,
             lora_config=self.lora_config,
+            kv_cache_dtype=self.cache_config.cache_dtype,
             is_driver_worker=True,
         )
         self._run_workers("init_model")
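The driver worker now receives the cache dtype as a plain string at construction time. Below is a minimal sketch of how a worker might resolve that string into a concrete torch dtype, assuming fp8_e5m2 values are held in a one-byte container (torch.uint8) on builds without a native float8 type; the mapping and function name are illustrative, not the exact vLLM code.

import torch

# Illustrative mapping: "auto" defers to the model dtype, while
# "fp8_e5m2" is stored as raw bytes (one byte per element).
_STR_TO_TORCH_DTYPE = {
    "fp8_e5m2": torch.uint8,
}

def resolve_kv_cache_dtype(cache_dtype: str,
                           model_dtype: torch.dtype) -> torch.dtype:
    if cache_dtype == "auto":
        return model_dtype
    return _STR_TO_TORCH_DTYPE[cache_dtype]

print(resolve_kv_cache_dtype("fp8_e5m2", torch.float16))  # torch.uint8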
@@ -234,6 +236,7 @@ class LLMEngine:
         model_config = copy.deepcopy(self.model_config)
         parallel_config = copy.deepcopy(self.parallel_config)
         scheduler_config = copy.deepcopy(self.scheduler_config)
+        cache_config = copy.deepcopy(self.cache_config)
         for rank, (worker, (node_id,
                             _)) in enumerate(zip(self.workers,
@@ -249,6 +252,7 @@ class LLMEngine:
                     rank,
                     distributed_init_method,
                     lora_config=self.lora_config,
+                    cache_config=cache_config,
                 ))
         driver_rank = 0
@@ -261,6 +265,7 @@ class LLMEngine:
             driver_rank,
             distributed_init_method,
             lora_config=self.lora_config,
+            cache_config=cache_config,
             is_driver_worker=True,
         )
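The three hunks above plumb a deep-copied cache_config into every Ray worker, mirroring how the other configs are shipped. Plausibly the deepcopy keeps the per-worker closure from capturing the engine itself, so only small standalone config objects cross the serialization boundary; the sketch below illustrates that pattern with stand-in classes, not the actual vLLM types.

import copy

class Worker:  # stand-in for vllm.worker.worker.Worker
    def __init__(self, model_config, rank, cache_config=None):
        self.model_config = model_config
        self.rank = rank
        self.cache_config = cache_config

def make_worker_factory(engine):
    # Bind deep copies to locals so the closure captures only these
    # objects, never `engine` itself (which would drag the whole engine
    # through the serializer when the factory is shipped to an actor).
    model_config = copy.deepcopy(engine.model_config)
    cache_config = copy.deepcopy(engine.cache_config)  # the copy this PR adds
    return lambda rank: Worker(model_config, rank, cache_config=cache_config)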
@@ -306,6 +311,7 @@ class LLMEngine:
             block_size=self.cache_config.block_size,
             gpu_memory_utilization=self.cache_config.gpu_memory_utilization,
             cpu_swap_space=self.cache_config.swap_space_bytes,
+            cache_dtype=self.cache_config.cache_dtype,
         )
         # Since we use a shared centralized controller, we take the minimum
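Profiling the number of available blocks now needs the cache dtype because the per-block footprint depends on element size: FP8-E5M2 stores one byte per value versus two for FP16, so the same memory budget holds roughly twice as many KV blocks. A back-of-the-envelope sketch, using the usual paged-KV layout and made-up model shapes (the function name and shapes are illustrative):

# Bytes for one KV cache block: key block + value block across all
# layers, with block_size tokens per block.
def cache_block_bytes(block_size: int,
                      num_layers: int,
                      num_kv_heads: int,
                      head_size: int,
                      cache_dtype: str) -> int:
    elem_size = 1 if cache_dtype == "fp8_e5m2" else 2  # fp16/bf16 default
    per_token = num_kv_heads * head_size * elem_size
    return 2 * num_layers * block_size * per_token

fp16_block = cache_block_bytes(16, 32, 32, 128, "auto")
fp8_block = cache_block_bytes(16, 32, 32, 128, "fp8_e5m2")
assert fp8_block * 2 == fp16_block  # half the bytes, twice the blocks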