[Core/Bugfix] Add FP8 K/V Scale and dtype conversion for prefix/prefill Triton Kernel (#7208)

Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2024-08-12 15:47:41 -07:00
parent 4ddc4743d7
commit a046f86397
10 changed files with 208 additions and 47 deletions
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -545,10 +545,6 @@ class CacheConfig:
            raise NotImplementedError(
                "Prefix caching is not supported with sliding window. "
                "Run with --disable-sliding-window to use prefix caching.")
-        if self.cache_dtype == "fp8":
-            raise NotImplementedError(
-                "Prefix caching is not supported for fp8 cache_dtype. "
-                "Run with --kv-cache-dtype auto to use prefix caching.")

    def verify_with_parallel_config(
        self,