Support FP8-E5M2 KV Cache (#2279)

Co-authored-by: zhaoyang <zhao.yang16@zte.com.cn>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
Author: zhaoyang-star
Date: 2024-01-29 08:43:54 +08:00
Committed by: GitHub
Parent: 7d648418b8
Commit: 9090bf02e7
26 changed files with 912 additions and 196 deletions

vllm/engine/arg_utils.py

@@ -17,6 +17,7 @@ class EngineArgs:
     download_dir: Optional[str] = None
     load_format: str = 'auto'
     dtype: str = 'auto'
+    kv_cache_dtype: str = 'auto'
     seed: int = 0
     max_model_len: Optional[int] = None
     worker_use_ray: bool = False
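The new field defaults to 'auto', so existing Python-API callers are unaffected and opt in explicitly. A minimal sketch of doing so through the public vllm package (the model name is just a placeholder):

    from vllm import EngineArgs, LLMEngine

    # Store the KV cache in FP8-E5M2 while the model weights stay in
    # their native dtype (e.g. FP16), roughly halving KV-cache memory
    # per token.
    engine_args = EngineArgs(model='facebook/opt-125m',
                             kv_cache_dtype='fp8_e5m2')
    engine = LLMEngine.from_engine_args(engine_args)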
@@ -122,6 +123,14 @@ class EngineArgs:
             'The "auto" option will use FP16 precision '
             'for FP32 and FP16 models, and BF16 precision '
             'for BF16 models.')
+        parser.add_argument(
+            '--kv-cache-dtype',
+            type=str,
+            choices=['auto', 'fp8_e5m2'],
+            default='auto',
+            help='Data type for KV cache storage. If "auto", will use the '
+            'model data type. Note that FP8 is not supported when the CUDA '
+            'version is lower than 11.8.')
         parser.add_argument('--max-model-len',
                             type=int,
                             default=None,
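The flag then reaches the command line of every entrypoint that builds its parser from EngineArgs. A hedged example against the OpenAI-compatible server shipped with vLLM (the model name is a placeholder; CUDA >= 11.8 is required per the help text above):

    python -m vllm.entrypoints.openai.api_server \
        --model facebook/opt-125m \
        --kv-cache-dtype fp8_e5m2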
@@ -269,7 +278,7 @@ class EngineArgs:
                                        self.max_context_len_to_capture)
         cache_config = CacheConfig(self.block_size,
                                    self.gpu_memory_utilization,
-                                   self.swap_space,
+                                   self.swap_space, self.kv_cache_dtype,
                                    model_config.get_sliding_window())
         parallel_config = ParallelConfig(self.pipeline_parallel_size,
                                          self.tensor_parallel_size,
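For intuition about what 'fp8_e5m2' trades away: it keeps FP16's five exponent bits, so dynamic range is largely preserved, but truncates the mantissa to two bits. The sketch below illustrates the rounding using PyTorch's torch.float8_e5m2 dtype (PyTorch >= 2.1); note the commit itself performs this conversion inside CUDA kernels, not through this dtype:

    import torch

    # Round-trip FP16 values through FP8-E5M2. Two mantissa bits leave
    # roughly one significant decimal digit of precision.
    x = torch.tensor([0.1234, 3.1416, 100.0], dtype=torch.float16)
    x_fp8 = x.to(torch.float8_e5m2)   # lossy cast: mantissa truncated to 2 bits
    x_back = x_fp8.to(torch.float16)  # widen again to inspect the loss
    print(x_back)  # approximately tensor([0.1250, 3.0000, 96.0000], dtype=torch.float16)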