Enable scaled FP8 (e4m3fn) KV cache on ROCm (AMD GPU) (#3290)

Co-authored-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Co-authored-by: HaiShaw <hixiao@gmail.com>
Co-authored-by: AdrianAbeyta <Adrian.Abeyta@amd.com>
Co-authored-by: Matthew Wong <Matthew.Wong2@amd.com>
Co-authored-by: root <root@gt-pla-u18-08.pla.dcgpu>
Co-authored-by: mawong-amd <156021403+mawong-amd@users.noreply.github.com>
Co-authored-by: ttbachyinsda <ttbachyinsda@outlook.com>
Co-authored-by: guofangze <guofangze@kuaishou.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: jacobthebanana <50071502+jacobthebanana@users.noreply.github.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-04-03 16:15:55 -05:00
parent 3dcb3e8b98
commit 2ff767b513
41 changed files with 2592 additions and 142 deletions
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -24,6 +24,7 @@ def main(args: argparse.Namespace):
              dtype=args.dtype,
              enforce_eager=args.enforce_eager,
              kv_cache_dtype=args.kv_cache_dtype,
+              quantization_param_path=args.quantization_param_path,
              device=args.device,
              ray_workers_use_nsight=args.ray_workers_use_nsight,
              enable_chunked_prefill=args.enable_chunked_prefill,
@@ -127,10 +128,23 @@ if __name__ == '__main__':
    parser.add_argument(
        "--kv-cache-dtype",
        type=str,
-        choices=['auto', 'fp8_e5m2'],
+        choices=['auto', 'fp8'],
        default='auto',
        help=
-        'Data type for kv cache storage. If "auto", will use model data type.')
+        'Data type for kv cache storage. If "auto", will use model data type. '
+        'FP8_E5M2 (without scaling) is only supported on cuda version greater '
+        'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
+        'common inference criteria.')
+    parser.add_argument(
+        '--quantization-param-path',
+        type=str,
+        default=None,
+        help='Path to the JSON file containing the KV cache scaling factors. '
+        'This should generally be supplied, when KV cache dtype is FP8. '
+        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
+        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
+        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
+        'instead supported for common inference criteria.')
    parser.add_argument(
        '--profile',
        action='store_true',
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -72,6 +72,7 @@ def run_vllm(
    max_model_len: Optional[int],
    enforce_eager: bool,
    kv_cache_dtype: str,
+    quantization_param_path: Optional[str],
    device: str,
    enable_prefix_caching: bool,
    gpu_memory_utilization: float = 0.9,
@@ -89,6 +90,7 @@ def run_vllm(
              gpu_memory_utilization=gpu_memory_utilization,
              enforce_eager=enforce_eager,
              kv_cache_dtype=kv_cache_dtype,
+              quantization_param_path=quantization_param_path,
              device=device,
              enable_prefix_caching=enable_prefix_caching,
              download_dir=download_dir)
@@ -217,7 +219,8 @@ def main(args: argparse.Namespace):
                                args.seed, args.n, args.use_beam_search,
                                args.trust_remote_code, args.dtype,
                                args.max_model_len, args.enforce_eager,
-                                args.kv_cache_dtype, args.device,
+                                args.kv_cache_dtype,
+                                args.quantization_param_path, args.device,
                                args.enable_prefix_caching,
                                args.gpu_memory_utilization, args.download_dir)
    elif args.backend == "hf":
@@ -306,10 +309,23 @@ if __name__ == "__main__":
    parser.add_argument(
        "--kv-cache-dtype",
        type=str,
-        choices=["auto", "fp8_e5m2"],
+        choices=["auto", "fp8"],
        default="auto",
        help=
-        'Data type for kv cache storage. If "auto", will use model data type.')
+        'Data type for kv cache storage. If "auto", will use model data type. '
+        'FP8_E5M2 (without scaling) is only supported on cuda version greater '
+        'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
+        'common inference criteria.')
+    parser.add_argument(
+        '--quantization-param-path',
+        type=str,
+        default=None,
+        help='Path to the JSON file containing the KV cache scaling factors. '
+        'This should generally be supplied, when KV cache dtype is FP8. '
+        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
+        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
+        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
+        'instead supported for common inference criteria.')
    parser.add_argument(
        "--device",
        type=str,
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -97,6 +97,9 @@ def main(
            torch.cuda.cudart().cudaProfilerStart()
        start_time = time.perf_counter()

+        # Using default kv_scale
+        kv_scale = 1.0
+
        for _ in range(num_iters):
            if version == "v1":
                ops.paged_attention_v1(
@@ -112,6 +115,7 @@ def main(
                    max_context_len,
                    alibi_slopes,
                    kv_cache_dtype,
+                    kv_scale,
                )
            elif version == "v2":
                ops.paged_attention_v2(
@@ -130,6 +134,7 @@ def main(
                    max_context_len,
                    alibi_slopes,
                    kv_cache_dtype,
+                    kv_scale,
                )
            else:
                raise ValueError(f"Invalid version: {version}")
@@ -179,11 +184,13 @@ if __name__ == '__main__':
    parser.add_argument(
        "--kv-cache-dtype",
        type=str,
-        choices=["auto", "fp8_e5m2"],
+        choices=["auto", "fp8"],
        default="auto",
        help=
-        'Data type for kv cache storage. If "auto", will use model data type.')
-    parser.add_argument("--device", type=str, choices=["cuda"], default="cuda")
+        'Data type for kv cache storage. If "auto", will use model data type. '
+        'FP8_E5M2 (without scaling) is only supported on cuda version greater '
+        'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
+        'common inference criteria.')
    args = parser.parse_args()
    print(args)