Fix module path: move loader code from __init__.py to loader.py

quantize.py and others import from dsv4.kernels.cuda.loader — the module
must be a separate file, not just __init__.py.
This commit is contained in:
2026-06-01 21:18:29 +00:00
parent 230d28e562
commit 00746c2d2b
2 changed files with 77 additions and 75 deletions

View File

@@ -1,75 +1,2 @@
"""CUDA kernel loader with compile-once caching.
Compiles .cu kernels on first call, caches the loaded module for subsequent calls.
Eliminates the JIT recompilation overhead from torch.utils.cpp_extension.load
being called on every kernel invocation (was ~100ms per call, called ~500x per token).
Usage:
from dsv4.kernels.cuda.loader import get_cuda_module
mod = get_cuda_module("fused_amax_quantize", ["fused_amax_quantize.cu"])
result = mod.fused_amax_quantize_nvfp4(x, divisor)
"""
import os
import hashlib
import torch
from torch.utils.cpp_extension import load
# Directory containing this file; kernel source filenames are joined onto it.
_KERNEL_DIR = os.path.dirname(os.path.abspath(__file__))
# Build directory passed to torch's extension loader, located beside the sources.
_CACHE_DIR = os.path.join(_KERNEL_DIR, "_build_cache")
# In-process cache mapping module name -> already-loaded extension module.
_LOADED_MODULES = {}
def get_cuda_module(name, sources, extra_cuda_cflags=None):
    """Load a CUDA kernel module, compiling once and caching it in-process.

    The first call for a given ``name`` loads the extension through
    ``torch.utils.cpp_extension.load`` and stores the result in
    ``_LOADED_MODULES``. Every later call with the same ``name`` returns the
    stored module immediately; ``sources`` and ``extra_cuda_cflags`` are not
    inspected again on that path.

    Args:
        name: Module name. It is the sole key of the in-process cache and the
            prefix of the build name handed to the extension loader.
        sources: List of .cu filenames relative to the directory that
            contains this file.
        extra_cuda_cflags: Optional list of CUDA compiler flags. A non-empty
            list replaces the default flag set; it is not appended to it.

    Returns:
        The loaded Python module with the kernel functions.

    Raises:
        OSError: If one of the source files cannot be opened for hashing.
    """
    # Fast path: skip hashing and loading once the module is known.
    if name in _LOADED_MODULES:
        return _LOADED_MODULES[name]

    source_paths = [os.path.join(_KERNEL_DIR, s) for s in sources]

    # Build a cache key from source file contents + caller-supplied flags.
    hasher = hashlib.md5()
    for sp in source_paths:
        # Use a context manager so the file handle is closed right away
        # instead of lingering until garbage collection.
        with open(sp, "rb") as src_file:
            hasher.update(src_file.read())
    cflags = extra_cuda_cflags or []
    for cf in cflags:
        hasher.update(cf.encode())
    cache_key = f"{name}_{hasher.hexdigest()}"

    # Ensure cache directory exists
    os.makedirs(_CACHE_DIR, exist_ok=True)

    # Fall back to the default flags only when the caller supplied none.
    # These defaults are applied after hashing, so they are not part of
    # cache_key.
    cflags = cflags or [
        "-gencode=arch=compute_100a,code=sm_100a",
        "-O3",
        "--use_fast_math",
    ]

    mod = load(
        name=cache_key,
        sources=source_paths,
        extra_cuda_cflags=cflags,
        build_directory=_CACHE_DIR,
        verbose=False,
    )
    _LOADED_MODULES[name] = mod
    return mod
def preload_all():
    """Load every known CUDA kernel up front, before the hot path runs."""
    kernels = (
        # Fused amax + quantize — THE critical kernel for P0
        ("fused_amax_quantize", "fused_amax_quantize.cu"),
        # Standalone quantize (used by weight quantization, not hot path)
        ("quantize_nvfp4", "quantize_nvfp4.cu"),
        # Sampler
        ("sampler", "sampler.cu"),
    )
    # Same calls, same order; each call receives its own one-element list.
    for module_name, source_file in kernels:
        get_cuda_module(module_name, [source_file])
"""CUDA kernel loader — re-exports from loader.py for convenience."""
from dsv4.kernels.cuda.loader import get_cuda_module, preload_all

View File

@@ -0,0 +1,75 @@
"""CUDA kernel loader with compile-once caching.
Compiles .cu kernels on first call, caches the loaded module for subsequent calls.
Eliminates the JIT recompilation overhead from torch.utils.cpp_extension.load
being called on every kernel invocation (was ~100ms per call, called ~500x per token).
Usage:
from dsv4.kernels.cuda.loader import get_cuda_module
mod = get_cuda_module("fused_amax_quantize", ["fused_amax_quantize.cu"])
result = mod.fused_amax_quantize_nvfp4(x, divisor)
"""
import os
import hashlib
import torch
from torch.utils.cpp_extension import load
# Directory containing this file; kernel source filenames are joined onto it.
_KERNEL_DIR = os.path.dirname(os.path.abspath(__file__))
# Build directory passed to torch's extension loader, located beside the sources.
_CACHE_DIR = os.path.join(_KERNEL_DIR, "_build_cache")
# In-process cache mapping module name -> already-loaded extension module.
_LOADED_MODULES = {}
def get_cuda_module(name, sources, extra_cuda_cflags=None):
    """Load a CUDA kernel module, compiling once and caching it in-process.

    The first call for a given ``name`` loads the extension through
    ``torch.utils.cpp_extension.load`` and stores the result in
    ``_LOADED_MODULES``. Every later call with the same ``name`` returns the
    stored module immediately; ``sources`` and ``extra_cuda_cflags`` are not
    inspected again on that path.

    Args:
        name: Module name. It is the sole key of the in-process cache and the
            prefix of the build name handed to the extension loader.
        sources: List of .cu filenames relative to the directory that
            contains this file.
        extra_cuda_cflags: Optional list of CUDA compiler flags. A non-empty
            list replaces the default flag set; it is not appended to it.

    Returns:
        The loaded Python module with the kernel functions.

    Raises:
        OSError: If one of the source files cannot be opened for hashing.
    """
    # Fast path: skip hashing and loading once the module is known.
    if name in _LOADED_MODULES:
        return _LOADED_MODULES[name]

    source_paths = [os.path.join(_KERNEL_DIR, s) for s in sources]

    # Build a cache key from source file contents + caller-supplied flags.
    hasher = hashlib.md5()
    for sp in source_paths:
        # Use a context manager so the file handle is closed right away
        # instead of lingering until garbage collection.
        with open(sp, "rb") as src_file:
            hasher.update(src_file.read())
    cflags = extra_cuda_cflags or []
    for cf in cflags:
        hasher.update(cf.encode())
    cache_key = f"{name}_{hasher.hexdigest()}"

    # Ensure cache directory exists
    os.makedirs(_CACHE_DIR, exist_ok=True)

    # Fall back to the default flags only when the caller supplied none.
    # These defaults are applied after hashing, so they are not part of
    # cache_key.
    cflags = cflags or [
        "-gencode=arch=compute_100a,code=sm_100a",
        "-O3",
        "--use_fast_math",
    ]

    mod = load(
        name=cache_key,
        sources=source_paths,
        extra_cuda_cflags=cflags,
        build_directory=_CACHE_DIR,
        verbose=False,
    )
    _LOADED_MODULES[name] = mod
    return mod
def preload_all():
    """Load every known CUDA kernel up front, before the hot path runs."""
    kernels = (
        # Fused amax + quantize — THE critical kernel for P0
        ("fused_amax_quantize", "fused_amax_quantize.cu"),
        # Standalone quantize (used by weight quantization, not hot path)
        ("quantize_nvfp4", "quantize_nvfp4.cu"),
        # Sampler
        ("sampler", "sampler.cu"),
    )
    # Same calls, same order; each call receives its own one-element list.
    for module_name, source_file in kernels:
        get_cuda_module(module_name, [source_file])