[Kernel] Tuned int8 Cutlass Kernels for SM75 (T4) (#6996)

Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-07-31 17:40:32 -04:00
parent 93548eb37e
commit 35e9c12bfa
3 changed files with 135 additions and 12 deletions
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -112,13 +112,20 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)

    timers = []
-    # pytorch impl
+    # pytorch impl - bfloat16
    timers.append(
        bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
                 b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
                 torch.bfloat16, label, sub_label, pytorch_mm_impl,
                 "pytorch_bf16_bf16_bf16_matmul-no-scales"))

+    # pytorch impl - float16
+    timers.append(
+        bench_fn(a.to(dtype=torch.float16, device="cuda"),
+                 b.to(dtype=torch.float16, device="cuda"), scale_a, scale_b,
+                 torch.float16, label, sub_label, pytorch_mm_impl,
+                 "pytorch_fp16_fp16_fp16_matmul-no-scales"))
+
    # cutlass impl
    timers.append(
        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,