[Core/Bugfix] Add FP8 K/V Scale and dtype conversion for prefix/prefill Triton Kernel (#7208)

Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
This commit is contained in:
jon-chuang
2024-08-12 15:47:41 -07:00
committed by GitHub
parent 4ddc4743d7
commit a046f86397
10 changed files with 208 additions and 47 deletions

View File

@@ -545,10 +545,6 @@ class CacheConfig:
raise NotImplementedError(
"Prefix caching is not supported with sliding window. "
"Run with --disable-sliding-window to use prefix caching.")
-if self.cache_dtype == "fp8":
-raise NotImplementedError(
-"Prefix caching is not supported for fp8 cache_dtype. "
-"Run with --kv-cache-dtype auto to use prefix caching.")
def verify_with_parallel_config(
self,