Correctly flush L2, as reconstructing the tensors on every iteration effectively put them in the L2, and gave the GPU enough idle time to avoid thermal throttling in a potentially unrealistic way.

The previous behaviour is potentially representative of some use cases (e.g. previous kernel filling L2 with the data in a very specific way) but not standard benchmarking practice.
2025-03-15 20:46:24 +00:00
parent e1c070fbef
commit 6cbff5778f
2 changed files with 35 additions and 11 deletions
--- a/deep_gemm/utils.py
+++ b/deep_gemm/utils.py
@@ -1,5 +1,6 @@
 import os
 import sys
+import time
 import torch
 import torch.distributed as dist

@@ -77,10 +78,28 @@ class suppress_stdout_stderr:


 def bench_kineto(fn, kernel_names, num_tests: int = 30, suppress_kineto_output: bool = False,
-                 trace_path: str = None, barrier_comm_profiling: bool = False, flush_l2: bool = False):
+                 trace_path: str = None, barrier_comm_profiling: bool = False, flush_l2: bool = True):
    # Conflict with Nsight Systems
    using_nsys = os.environ.get('DG_NSYS_PROFILING', False)

+    # By default, flush L2 with an excessive 8GB memset to give the GPU some (literal) chill time without full idle
+    # this avoid thermal throttling while keeping DVFS at max clocks (slight gain vs sleep / more consistent on GH200)
+    sleep_between_tests = 0.0
+    flush_l2_size = int(8e9 // 4)
+    if os.environ.get('DG_BENCH_DISABLE_L2_FLUSH', False):
+        flush_l2 = False
+    if os.environ.get('DG_BENCH_POWER_LIMITED', False):
+        # if we want to be thermally limited, we need to run many iterations non-stop for a fairly long time
+        # and spend as little time as possible doing memset and other setup work (80MiB should be enough to flush L2)
+        num_tests = 2000
+        flush_l2_size = int(80e6 // 4)
+    sleep_val = os.environ.get('DG_BENCH_SLEEP_BETWEEN_TESTS', False)
+    if sleep_val:
+        try:
+            sleep_between_tests = float(sleep_val)
+        except ValueError:
+            pass  # Keep default
+
    # For some auto-tuning kernels with prints
    fn()

@@ -98,8 +117,10 @@ def bench_kineto(fn, kernel_names, num_tests: int = 30, suppress_kineto_output:
                    lhs @ rhs
                    dist.all_reduce(torch.ones(1, dtype=torch.float, device='cuda'))
                for _ in range(num_tests):
+                    if sleep_between_tests > 0.0:
+                        time.sleep(sleep_between_tests)
                    if flush_l2:
-                        torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda').zero_()
+                        torch.empty(flush_l2_size, dtype=torch.int, device='cuda').zero_()
                    fn()

                if not using_nsys: