# deep_gemm/testing/bench.py

import os
import sys
import torch


def bench(fn, num_warmups: int = 5, num_tests: int = 10,
          high_precision: bool = False):
    """Time `fn` with CUDA events and return the mean duration of one call.

    Args:
        fn: zero-argument callable to benchmark.
        num_warmups: number of untimed calls issued before the measurement.
        num_tests: number of timed calls; the elapsed time is divided by it.
        high_precision: when true, a large FP32 matmul is enqueued right
            before the timed region (see the comment at that step).

    Returns:
        Average time per call, i.e. the elapsed time between the two CUDA
        events (reported by PyTorch in milliseconds) divided by `num_tests`
        and by 1e3.
    """
    # Wait for outstanding GPU work, then zero a 256 MB buffer to flush the L2 cache.
    # NOTE: this happens once, before the warm-up calls, not before each timed call.
    torch.cuda.synchronize()
    l2_flush_buffer = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda')
    l2_flush_buffer.zero_()

    # Untimed warm-up calls
    for _ in range(num_warmups):
        fn()

    # Enqueue a large kernel first so that the CPU launch overhead of the
    # timed calls is hidden; the matmul result itself is discarded
    if high_precision:
        lhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
        rhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
        torch.matmul(lhs, rhs)

    # Timed region, bracketed by a pair of timing-enabled events
    start_event, end_event = (torch.cuda.Event(enable_timing=True) for _ in range(2))
    start_event.record()
    for _ in range(num_tests):
        fn()
    end_event.record()
    torch.cuda.synchronize()

    elapsed_ms = start_event.elapsed_time(end_event)
    return elapsed_ms / num_tests / 1e3


class empty_suppress:
    """No-op context manager, usable wherever a suppressing manager is optional."""

    def __enter__(self):
        # Hand back the manager itself so `with empty_suppress() as ctx` works
        return self

    def __exit__(self, *_exc_info):
        # Nothing to release; the falsy return value lets exceptions propagate
        return None


class suppress_stdout_stderr:
    """Context manager that discards everything written to stdout and stderr.

    On entry both the Python-level objects (`sys.stdout`, `sys.stderr`) and
    the file descriptors behind them are pointed at `os.devnull`; on exit the
    original objects and descriptors are put back and all temporary handles
    are closed.
    """

    def __enter__(self):
        # One devnull sink per stream
        self.outnull_file, self.errnull_file = open(os.devnull, 'w'), open(os.devnull, 'w')

        # Descriptor numbers currently backing the two streams
        self.old_stdout_fileno_undup = sys.stdout.fileno()
        self.old_stderr_fileno_undup = sys.stderr.fileno()

        # Duplicates keep the original targets reachable while redirected
        self.old_stdout_fileno = os.dup(sys.stdout.fileno())
        self.old_stderr_fileno = os.dup(sys.stderr.fileno())

        # Remember the Python stream objects for restoration
        self.old_stdout, self.old_stderr = sys.stdout, sys.stderr

        # Descriptor-level redirection (affects writes that bypass `sys.stdout`/`sys.stderr`)
        os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup)
        os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup)

        # Python-level redirection
        sys.stdout, sys.stderr = self.outnull_file, self.errnull_file
        return self

    def __exit__(self, *_):
        # Restore the Python stream objects first
        sys.stdout, sys.stderr = self.old_stdout, self.old_stderr

        saved_and_target_fds = (
            (self.old_stdout_fileno, self.old_stdout_fileno_undup),
            (self.old_stderr_fileno, self.old_stderr_fileno_undup),
        )

        # Point the original descriptor numbers back at their saved targets
        for saved_fd, target_fd in saved_and_target_fds:
            os.dup2(saved_fd, target_fd)

        # The duplicates are no longer needed
        for saved_fd, _target_fd in saved_and_target_fds:
            os.close(saved_fd)

        # Finally release the devnull sinks
        for sink in (self.outnull_file, self.errnull_file):
            sink.close()


def bench_kineto(fn, kernel_names, num_tests: int = 30,
                 suppress_kineto_output: bool = False,
                 trace_path: str = None, flush_l2: bool = True,
                 with_multiple_kernels: bool = False):
    """Profile `fn` with `torch.profiler` and return average per-call kernel times.

    `fn` is called once up front, then `num_tests` times in each of two
    profiler steps (one warm-up step, one active step). The profiler's
    summary table is then parsed: for every requested kernel name, the rows
    containing that name are combined into a call-count-weighted average.

    Args:
        fn: zero-argument callable that launches the kernels of interest.
        kernel_names: a substring identifying one kernel (`str`), or a tuple
            of such substrings.
        num_tests: number of `fn` calls per profiler step.
        suppress_kineto_output: silence stdout/stderr while profiling
            (ignored when Nsight Systems mode is active).
        trace_path: if not `None`, a Chrome trace is exported to this path.
        flush_l2: zero a large scratch tensor before every `fn` call.
        with_multiple_kernels: allow a name to match several table rows;
            otherwise exactly one matching row per name is asserted.

    Returns:
        `1` when the `DG_NSYS_PROFILING` environment variable is set to a
        non-zero integer (the built-in profiler is skipped in that case).
        Otherwise the average time in seconds: a single float if
        `kernel_names` was a `str`, or a tuple of floats (one per name) if
        it was a tuple.

    Raises:
        AssertionError: if `kernel_names` has the wrong type, or a name does
            not match exactly one row while `with_multiple_kernels` is false.
        RuntimeError: if no timing sample could be parsed for a kernel name.
    """
    # Conflict with Nsight Systems
    using_nsys = int(os.environ.get('DG_NSYS_PROFILING', 0))

    # By default, flush L2 with an excessive 8GB memset to give the GPU some (literal) chill time without full idle
    flush_l2_size = int(8e9 // 4)

    # For some auto-tuning kernels with prints
    fn()

    # Profile
    suppress = suppress_stdout_stderr if suppress_kineto_output and not using_nsys else empty_suppress
    with suppress():
        schedule = torch.profiler.schedule(wait=0, warmup=1, active=1, repeat=1) if not using_nsys else None
        profiler = torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule) if not using_nsys else empty_suppress()
        with profiler:
            # Two passes: the first is consumed by the schedule's warm-up step, the second is recorded
            for _step in range(2):
                for _ in range(num_tests):
                    if flush_l2:
                        torch.empty(flush_l2_size, dtype=torch.int, device='cuda').zero_()
                    fn()

                if not using_nsys:
                    profiler.step()

    # Return 1 if using Nsight Systems
    if using_nsys:
        return 1

    # Parse the profiling table
    assert isinstance(kernel_names, str) or isinstance(kernel_names, tuple)
    is_tuple = isinstance(kernel_names, tuple)
    prof_lines = profiler.key_averages().table(sort_by='cuda_time_total', max_name_column_width=100).split('\n')
    kernel_names = (kernel_names, ) if isinstance(kernel_names, str) else kernel_names
    assert all([isinstance(name, str) for name in kernel_names])
    if not with_multiple_kernels:
        for name in kernel_names:
            assert sum([name in line for line in prof_lines]) == 1, f'Errors of the kernel {name} in the profiling table'

    # Save chrome traces
    if trace_path is not None:
        profiler.export_chrome_trace(trace_path)

    # Return average kernel times.
    # Divisors converting a value with the given suffix into seconds. The profiler
    # prints times of one second or more with a bare 's' suffix, so it must be
    # handled too; longer suffixes come first so 'ms'/'us' are not read as 's'.
    units = (('ms', 1e3), ('us', 1e6), ('s', 1.0))
    kernel_times = []
    for name in kernel_names:
        total_time = 0
        total_num = 0
        for line in prof_lines:
            if name not in line:
                continue
            # The last two whitespace-separated fields are read as the average
            # time and the call count (presumably the "CUDA time avg" and
            # "# of Calls" columns — verify against the installed PyTorch version)
            fields = line.split()
            time_str, num_str = fields[-2], fields[-1]
            for unit, scale in units:
                if time_str.endswith(unit):
                    num_calls = int(num_str)
                    total_time += float(time_str[:-len(unit)]) / scale * num_calls
                    total_num += num_calls
                    break
        if total_num == 0:
            # Previously surfaced as a bare ZeroDivisionError
            raise RuntimeError(f'No timing samples found for the kernel {name} in the profiling table')
        kernel_times.append(total_time / total_num)

    return tuple(kernel_times) if is_tuple else kernel_times[0]
Initial commit 2025-02-25 22:52:41 +08:00			`import os`
			`import sys`
			`import torch`


			`def bench(fn, num_warmups: int = 5, num_tests: int = 10,`
			`high_precision: bool = False):`
			`# Flush L2 cache with 256 MB data`
			`torch.cuda.synchronize()`
			`cache = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda')`
			`cache.zero_()`

			`# Warmup`
			`for _ in range(num_warmups):`
			`fn()`

			`# Add a large kernel to eliminate the CPU launch overhead`
			`if high_precision:`
			`x = torch.randn((8192, 8192), dtype=torch.float, device='cuda')`
			`y = torch.randn((8192, 8192), dtype=torch.float, device='cuda')`
			`x @ y`

			`# Testing`
			`start_event = torch.cuda.Event(enable_timing=True)`
			`end_event = torch.cuda.Event(enable_timing=True)`
			`start_event.record()`
			`for i in range(num_tests):`
			`fn()`
			`end_event.record()`
			`torch.cuda.synchronize()`

Add more GPU architectures support (#112) * Add more GPU architectures support * Update layout.py * Optimize performance, Add SM90 support, Add 1D2D SM100 support * Add fmtlib submodule at commit 553ec11 --------- Co-authored-by: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com> 2025-07-18 11:32:22 +08:00			`return start_event.elapsed_time(end_event) / num_tests / 1e3`
Initial commit 2025-02-25 22:52:41 +08:00

			`class empty_suppress:`
			`def __enter__(self):`
			`return self`

			`def __exit__(self, *_):`
			`pass`


			`class suppress_stdout_stderr:`
			`def __enter__(self):`
			`self.outnull_file = open(os.devnull, 'w')`
			`self.errnull_file = open(os.devnull, 'w')`

			`self.old_stdout_fileno_undup = sys.stdout.fileno()`
			`self.old_stderr_fileno_undup = sys.stderr.fileno()`

			`self.old_stdout_fileno = os.dup(sys.stdout.fileno())`
			`self.old_stderr_fileno = os.dup(sys.stderr.fileno())`

			`self.old_stdout = sys.stdout`
			`self.old_stderr = sys.stderr`

			`os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup)`
			`os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup)`

			`sys.stdout = self.outnull_file`
			`sys.stderr = self.errnull_file`
			`return self`

			`def __exit__(self, *_):`
			`sys.stdout = self.old_stdout`
			`sys.stderr = self.old_stderr`

			`os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)`
			`os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)`

			`os.close(self.old_stdout_fileno)`
			`os.close(self.old_stderr_fileno)`

			`self.outnull_file.close()`
			`self.errnull_file.close()`


Add more GPU architectures support (#112) * Add more GPU architectures support * Update layout.py * Optimize performance, Add SM90 support, Add 1D2D SM100 support * Add fmtlib submodule at commit 553ec11 --------- Co-authored-by: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com> 2025-07-18 11:32:22 +08:00			`def bench_kineto(fn, kernel_names, num_tests: int = 30,`
			`suppress_kineto_output: bool = False,`
			`trace_path: str = None, flush_l2: bool = True,`
Weight gradient kernels for dense and MoE models (#95) * Init weight gradient kernels. * Support unaligned n,k and gmem stride * Update docs * Several cleanups * Remove restrictions on N * Add stride(0) assertions --------- Co-authored-by: Chenggang Zhao <chenggangz@deepseek.com> 2025-05-14 14:47:58 +08:00			`with_multiple_kernels: bool = False):`
Initial commit 2025-02-25 22:52:41 +08:00			`# Conflict with Nsight Systems`
Refactor JIT compilation (+NVRTC support) (#94) * [wip] refactor: compile to .cubin Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com> * refactor: compile to .cubin and add NVRTC option Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com> * fix: compiler version Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com> * feat: compat for old drivers Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com> * feat: save kernel name to file Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com> * feat: fix win compat Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com> * fix: windows compat Signed-off-by: Gabriel Wu <13583761+lucifer1004@users.noreply.github.com> * feat: make API more general Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com> * feat: drop support for CUDA<12.3 Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com> * doc: update README Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com> * Some lints and refactor * Refactor runtime * Several fixes * Refactor environment variables * Code format * Add a TODO * Compatible with CUDA 12.3 * Fix indent * Fix typing * Drop support for Windows * Add a TODO --------- Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com> Signed-off-by: Gabriel Wu <13583761+lucifer1004@users.noreply.github.com> Co-authored-by: Chenggang Zhao <chenggangz@deepseek.com> 2025-05-07 11:38:14 +08:00			`using_nsys = int(os.environ.get('DG_NSYS_PROFILING', 0))`
Initial commit 2025-02-25 22:52:41 +08:00
Correctly flush L2, as reconstructing the tensors on every iteration effectively put them in the L2, and gave the GPU enough idle time to avoid thermal throttling in a potentially unrealistic way. The previous behaviour is potentially representative of some use cases (e.g. previous kernel filling L2 with the data in a very specific way) but not standard benchmarking practice. 2025-03-15 20:46:24 +00:00			`# By default, flush L2 with an excessive 8GB memset to give the GPU some (literal) chill time without full idle`
			`flush_l2_size = int(8e9 // 4)`

Initial commit 2025-02-25 22:52:41 +08:00			`# For some auto-tuning kernels with prints`
			`fn()`

			`# Profile`
			`suppress = suppress_stdout_stderr if suppress_kineto_output and not using_nsys else empty_suppress`
			`with suppress():`
			`schedule = torch.profiler.schedule(wait=0, warmup=1, active=1, repeat=1) if not using_nsys else None`
			`profiler = torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule) if not using_nsys else empty_suppress()`
			`with profiler:`
			`for i in range(2):`
			`for _ in range(num_tests):`
			`if flush_l2:`
Correctly flush L2, as reconstructing the tensors on every iteration effectively put them in the L2, and gave the GPU enough idle time to avoid thermal throttling in a potentially unrealistic way. The previous behaviour is potentially representative of some use cases (e.g. previous kernel filling L2 with the data in a very specific way) but not standard benchmarking practice. 2025-03-15 20:46:24 +00:00			`torch.empty(flush_l2_size, dtype=torch.int, device='cuda').zero_()`
Initial commit 2025-02-25 22:52:41 +08:00			`fn()`

			`if not using_nsys:`
			`profiler.step()`

			`# Return 1 if using Nsight Systems`
			`if using_nsys:`
			`return 1`

			`# Parse the profiling table`
			`assert isinstance(kernel_names, str) or isinstance(kernel_names, tuple)`
Add more GPU architectures support (#112) * Add more GPU architectures support * Update layout.py * Optimize performance, Add SM90 support, Add 1D2D SM100 support * Add fmtlib submodule at commit 553ec11 --------- Co-authored-by: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com> 2025-07-18 11:32:22 +08:00			`is_tuple = isinstance(kernel_names, tuple)`
Initial commit 2025-02-25 22:52:41 +08:00			`prof_lines = profiler.key_averages().table(sort_by='cuda_time_total', max_name_column_width=100).split('\n')`
			`kernel_names = (kernel_names, ) if isinstance(kernel_names, str) else kernel_names`
			`assert all([isinstance(name, str) for name in kernel_names])`
Weight gradient kernels for dense and MoE models (#95) * Init weight gradient kernels. * Support unaligned n,k and gmem stride * Update docs * Several cleanups * Remove restrictions on N * Add stride(0) assertions --------- Co-authored-by: Chenggang Zhao <chenggangz@deepseek.com> 2025-05-14 14:47:58 +08:00			`if not with_multiple_kernels:`
			`for name in kernel_names:`
			`assert sum([name in line for line in prof_lines]) == 1, f'Errors of the kernel {name} in the profiling table'`
Initial commit 2025-02-25 22:52:41 +08:00
			`# Save chrome traces`
			`if trace_path is not None:`
			`profiler.export_chrome_trace(trace_path)`

			`# Return average kernel times`
			`units = {'ms': 1e3, 'us': 1e6}`
			`kernel_times = []`
			`for name in kernel_names:`
Weight gradient kernels for dense and MoE models (#95) * Init weight gradient kernels. * Support unaligned n,k and gmem stride * Update docs * Several cleanups * Remove restrictions on N * Add stride(0) assertions --------- Co-authored-by: Chenggang Zhao <chenggangz@deepseek.com> 2025-05-14 14:47:58 +08:00			`total_time = 0`
			`total_num = 0`
Initial commit 2025-02-25 22:52:41 +08:00			`for line in prof_lines:`
			`if name in line:`
			`time_str = line.split()[-2]`
Weight gradient kernels for dense and MoE models (#95) * Init weight gradient kernels. * Support unaligned n,k and gmem stride * Update docs * Several cleanups * Remove restrictions on N * Add stride(0) assertions --------- Co-authored-by: Chenggang Zhao <chenggangz@deepseek.com> 2025-05-14 14:47:58 +08:00			`num_str = line.split()[-1]`
Initial commit 2025-02-25 22:52:41 +08:00			`for unit, scale in units.items():`
			`if unit in time_str:`
Weight gradient kernels for dense and MoE models (#95) * Init weight gradient kernels. * Support unaligned n,k and gmem stride * Update docs * Several cleanups * Remove restrictions on N * Add stride(0) assertions --------- Co-authored-by: Chenggang Zhao <chenggangz@deepseek.com> 2025-05-14 14:47:58 +08:00			`total_time += float(time_str.replace(unit, '')) / scale * int(num_str)`
			`total_num += int(num_str)`
Initial commit 2025-02-25 22:52:41 +08:00			`break`
Weight gradient kernels for dense and MoE models (#95) * Init weight gradient kernels. * Support unaligned n,k and gmem stride * Update docs * Several cleanups * Remove restrictions on N * Add stride(0) assertions --------- Co-authored-by: Chenggang Zhao <chenggangz@deepseek.com> 2025-05-14 14:47:58 +08:00			`kernel_times.append(total_time / total_num)`

Add more GPU architectures support (#112) * Add more GPU architectures support * Update layout.py * Optimize performance, Add SM90 support, Add 1D2D SM100 support * Add fmtlib submodule at commit 553ec11 --------- Co-authored-by: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com> 2025-07-18 11:32:22 +08:00			`return tuple(kernel_times) if is_tuple else kernel_times[0]`