Fix module path: move loader code from __init__.py to loader.py
quantize.py and others import from dsv4.kernels.cuda.loader — the module must be a separate file, not just __init__.py.
This commit is contained in:
@@ -1,75 +1,2 @@
|
||||
"""CUDA kernel loader with compile-once caching.
|
||||
|
||||
Compiles .cu kernels on first call, caches the loaded module for subsequent calls.
|
||||
Eliminates the JIT recompilation overhead from torch.utils.cpp_extension.load
|
||||
being called on every kernel invocation (was ~100ms per call, called ~500x per token).
|
||||
|
||||
Usage:
|
||||
from dsv4.kernels.cuda.loader import get_cuda_module
|
||||
mod = get_cuda_module("fused_amax_quantize", ["fused_amax_quantize.cu"])
|
||||
result = mod.fused_amax_quantize_nvfp4(x, divisor)
|
||||
"""
|
||||
import os
|
||||
import hashlib
|
||||
import torch
|
||||
from torch.utils.cpp_extension import load
|
||||
|
||||
_KERNEL_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
_CACHE_DIR = os.path.join(_KERNEL_DIR, "_build_cache")
|
||||
_LOADED_MODULES = {}
|
||||
|
||||
|
||||
def get_cuda_module(name, sources, extra_cuda_cflags=None):
|
||||
"""Load a CUDA kernel module, compiling once and caching forever.
|
||||
|
||||
Args:
|
||||
name: Module name (used for caching key).
|
||||
sources: List of .cu filenames relative to the kernels/cuda/ directory.
|
||||
extra_cuda_cflags: Optional list of extra CUDA compiler flags.
|
||||
|
||||
Returns:
|
||||
The loaded Python module with the kernel functions.
|
||||
"""
|
||||
if name in _LOADED_MODULES:
|
||||
return _LOADED_MODULES[name]
|
||||
|
||||
source_paths = [os.path.join(_KERNEL_DIR, s) for s in sources]
|
||||
|
||||
# Build a cache key from source file contents + compile flags
|
||||
hasher = hashlib.md5()
|
||||
for sp in source_paths:
|
||||
hasher.update(open(sp, 'rb').read())
|
||||
cflags = extra_cuda_cflags or []
|
||||
for cf in cflags:
|
||||
hasher.update(cf.encode())
|
||||
cache_key = f"{name}_{hasher.hexdigest()}"
|
||||
|
||||
# Ensure cache directory exists
|
||||
os.makedirs(_CACHE_DIR, exist_ok=True)
|
||||
|
||||
cflags = cflags or [
|
||||
"-gencode=arch=compute_100a,code=sm_100a",
|
||||
"-O3",
|
||||
"--use_fast_math",
|
||||
]
|
||||
|
||||
mod = load(
|
||||
name=cache_key,
|
||||
sources=source_paths,
|
||||
extra_cuda_cflags=cflags,
|
||||
build_directory=_CACHE_DIR,
|
||||
verbose=False,
|
||||
)
|
||||
|
||||
_LOADED_MODULES[name] = mod
|
||||
return mod
|
||||
|
||||
|
||||
def preload_all():
|
||||
"""Preload all CUDA kernels at startup (before the hot path)."""
|
||||
# Fused amax + quantize — THE critical kernel for P0
|
||||
get_cuda_module("fused_amax_quantize", ["fused_amax_quantize.cu"])
|
||||
# Standalone quantize (used by weight quantization, not hot path)
|
||||
get_cuda_module("quantize_nvfp4", ["quantize_nvfp4.cu"])
|
||||
# Sampler
|
||||
get_cuda_module("sampler", ["sampler.cu"])
|
||||
"""CUDA kernel loader — re-exports from loader.py for convenience."""
|
||||
from dsv4.kernels.cuda.loader import get_cuda_module, preload_all
|
||||
|
||||
75
dsv4/kernels/cuda/loader.py
Normal file
75
dsv4/kernels/cuda/loader.py
Normal file
@@ -0,0 +1,75 @@
|
||||
"""CUDA kernel loader with compile-once caching.
|
||||
|
||||
Compiles .cu kernels on first call, caches the loaded module for subsequent calls.
|
||||
Eliminates the JIT recompilation overhead from torch.utils.cpp_extension.load
|
||||
being called on every kernel invocation (was ~100ms per call, called ~500x per token).
|
||||
|
||||
Usage:
|
||||
from dsv4.kernels.cuda.loader import get_cuda_module
|
||||
mod = get_cuda_module("fused_amax_quantize", ["fused_amax_quantize.cu"])
|
||||
result = mod.fused_amax_quantize_nvfp4(x, divisor)
|
||||
"""
|
||||
import os
|
||||
import hashlib
|
||||
import torch
|
||||
from torch.utils.cpp_extension import load
|
||||
|
||||
_KERNEL_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
_CACHE_DIR = os.path.join(_KERNEL_DIR, "_build_cache")
|
||||
_LOADED_MODULES = {}
|
||||
|
||||
|
||||
def get_cuda_module(name, sources, extra_cuda_cflags=None):
|
||||
"""Load a CUDA kernel module, compiling once and caching forever.
|
||||
|
||||
Args:
|
||||
name: Module name (used for caching key).
|
||||
sources: List of .cu filenames relative to the kernels/cuda/ directory.
|
||||
extra_cuda_cflags: Optional list of extra CUDA compiler flags.
|
||||
|
||||
Returns:
|
||||
The loaded Python module with the kernel functions.
|
||||
"""
|
||||
if name in _LOADED_MODULES:
|
||||
return _LOADED_MODULES[name]
|
||||
|
||||
source_paths = [os.path.join(_KERNEL_DIR, s) for s in sources]
|
||||
|
||||
# Build a cache key from source file contents + compile flags
|
||||
hasher = hashlib.md5()
|
||||
for sp in source_paths:
|
||||
hasher.update(open(sp, 'rb').read())
|
||||
cflags = extra_cuda_cflags or []
|
||||
for cf in cflags:
|
||||
hasher.update(cf.encode())
|
||||
cache_key = f"{name}_{hasher.hexdigest()}"
|
||||
|
||||
# Ensure cache directory exists
|
||||
os.makedirs(_CACHE_DIR, exist_ok=True)
|
||||
|
||||
cflags = cflags or [
|
||||
"-gencode=arch=compute_100a,code=sm_100a",
|
||||
"-O3",
|
||||
"--use_fast_math",
|
||||
]
|
||||
|
||||
mod = load(
|
||||
name=cache_key,
|
||||
sources=source_paths,
|
||||
extra_cuda_cflags=cflags,
|
||||
build_directory=_CACHE_DIR,
|
||||
verbose=False,
|
||||
)
|
||||
|
||||
_LOADED_MODULES[name] = mod
|
||||
return mod
|
||||
|
||||
|
||||
def preload_all():
|
||||
"""Preload all CUDA kernels at startup (before the hot path)."""
|
||||
# Fused amax + quantize — THE critical kernel for P0
|
||||
get_cuda_module("fused_amax_quantize", ["fused_amax_quantize.cu"])
|
||||
# Standalone quantize (used by weight quantization, not hot path)
|
||||
get_cuda_module("quantize_nvfp4", ["quantize_nvfp4.cu"])
|
||||
# Sampler
|
||||
get_cuda_module("sampler", ["sampler.cu"])
|
||||
Reference in New Issue
Block a user