dsv4/ops/quantize.py

"""NVFP4 quantization: BF16 <-> NVFP4 conversion, scale factor computation."""
import math
import torch
import cutlass
import cutlass.cute as cute
import cutlass.torch as cutlass_torch
import cutlass.utils as utils
from dsv4.ops.layouts import ceil_div

from dsv4.kernels.gemm.grouped import (
    cat_byte_reinterpretable_tensors,
    stack_byte_reinterpretable_tensors,
)

E2M1_MAGNITUDES = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]

# Cache compiled kernels + pre-allocated workspace by cache_key
# Each entry: {'compiled': callable, 'workspace': Tensor, 'workspace_size': int}
#
# Key design decisions (Bug #1 fix):
# - cute.compile does NOT corrupt GPU memory (verified 2026-05-20 on B200).
#   The original _needs_token_refill hack was a misdiagnosis. The real bug
#   was elsewhere (likely OOB write or weight loading).
# - Workspace is pre-allocated per cache entry during warmup_compilation()
#   and reused on subsequent calls. No torch.full() in the hot path.
# - CuTe tensor wrappers (from_dlpack + mark_layout_dynamic) are cheap
#   metadata wrappers. We re-create them per call from real tensors.
#   Caching them would hold stale references to tensors that get freed.

# Cached LUT for E2M1 quantization (created once per device, cudagraph-safe)
_NVFP4_STEP_LUT_CACHE = {}
def _get_step_to_idx_lut(device):
    """Get or create the E2M1 step-to-index LUT for the given device.
    
    Cached per device to avoid CPU->CUDA copies during cudagraph capture.
    Must be pre-populated during warmup (before torch.compile/cudagraph capture)
    so the lock is never entered on the compiled path.
    """
    # Fast path: already cached — no lock needed (torch.compile-safe)
    if device in _NVFP4_STEP_LUT_CACHE:
        return _NVFP4_STEP_LUT_CACHE[device]
    # Slow path: first call, create the LUT
    lut = torch.as_tensor(
        [0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 7, 7],
        dtype=torch.int8, device=device,
    )
    _NVFP4_STEP_LUT_CACHE[device] = lut
    return lut
SF_VEC_SIZE = 16  # NVFP4 block size

def quantize_to_nvfp4(x_bf16, block_size=SF_VEC_SIZE):
    """Quantize a BF16 tensor to NVFP4 using a tensor-wide global scale.

    The tensor is normalised by ``global_scale = amax / (6 * 448)``, split
    into blocks of ``block_size`` along the last dim, and each block gets an
    FP8 (E4M3) scale of ``block_amax / 6``. Values are then mapped to 4-bit
    E2M1 codes and packed two per byte (even element in the low nibble).

    If the last dim is not a multiple of ``block_size`` it is zero-padded for
    the block computation; the packed output is trimmed back so it only
    covers the original elements (previously this path raised in
    ``reshape`` because the padded byte count did not match ``D // 2``).

    Args:
        x_bf16: (..., D) BF16 tensor.
        block_size: Number of elements sharing one block scale (even).

    Returns:
        x_fp4: (..., ceil(D / 2)) float4_e2m1fn_x2 — packed FP4 codes. For
            odd ``D`` the high nibble of the last byte is a zero pad.
        x_sf: (..., ceil(D / block_size)) float8_e4m3fn — block scales.
        global_scale: 0-dim float32 tensor.
    """
    x_f32 = x_bf16.float()
    amax = x_f32.abs().max().clamp(min=1e-8).float()
    global_scale = amax / (6.0 * 448.0)
    x_norm = x_f32 / global_scale

    last_dim = x_norm.shape[-1]
    n_blocks = ceil_div(last_dim, block_size)
    padded_dim = n_blocks * block_size

    if padded_dim != last_dim:
        # Zero-pad so the last dim splits evenly into blocks.
        x_norm = torch.nn.functional.pad(x_norm, (0, padded_dim - last_dim))

    x_reshaped = x_norm.reshape(*x_norm.shape[:-1], n_blocks, block_size)
    block_amax = x_reshaped.abs().amax(dim=-1)
    # Detect zero blocks and underflow blocks (amax > 0 but too small for FP8).
    # Smallest positive FP8 e4m3fn is 2^-9 ≈ 1.95e-3. If amax/6 < this,
    # the block scale underflows to 0, and dividing x by the clamped 1e-8
    # inflates values into nonzero FP4 buckets — producing wrong results.
    zero_block = block_amax < (6.0 * 2.0 ** -9)  # < ~0.0117
    # Zero out x for zero/underflow blocks before division so that
    # x_scaled = 0 → FP4 nibbles = 0. Scalar 0.0 avoids a zeros_like tensor.
    x_reshaped = torch.where(zero_block.unsqueeze(-1), 0.0, x_reshaped)
    block_amax = block_amax.clamp(min=1e-8)
    block_scale = (block_amax / 6.0).to(torch.float8_e4m3fn)
    # Force zero/underflow blocks: FP8 scale = 0 (exact zero).
    block_scale = torch.where(zero_block, 0.0, block_scale)

    # Divide by the *rounded* FP8 scale so codes match what dequant will see.
    block_sf_expanded = block_scale.float().unsqueeze(-1)
    x_scaled = x_reshaped / block_sf_expanded.clamp(min=1e-8)

    signs = torch.sign(x_scaled)
    abs_scaled = x_scaled.abs().clamp(max=6.0)

    # NOTE(review): rounding to half-steps and then going through the LUT is
    # not exact nearest-E2M1 above 2.0 (e.g. 2.7 -> 5 half-steps -> 2.0 even
    # though 3.0 is closer). Left unchanged; confirm parity with the CUDA
    # kernels before altering the rounding.
    half_steps = (abs_scaled * 2.0).round().clamp(0, 12).to(torch.int8)
    step_to_idx = _get_step_to_idx_lut(x_bf16.device)
    indices = step_to_idx[half_steps.long()]

    # Sign lives in bit 3 of the nibble.
    nibbles = torch.where(signs < 0, indices + 8, indices).to(torch.uint8)
    even = nibbles[..., ::2]
    odd = nibbles[..., 1::2]
    packed = (odd << 4) | even  # (..., n_blocks, block_size // 2)

    # Flatten the block axis, then drop bytes that only hold padding.
    lead_shape = list(x_bf16.shape[:-1])
    padded_bytes = padded_dim // 2
    packed = packed.reshape(lead_shape + [padded_bytes])
    packed_bytes = (last_dim + 1) // 2
    if packed_bytes != padded_bytes:
        packed = packed[..., :packed_bytes].contiguous()
    x_fp4 = packed.view(torch.float4_e2m1fn_x2)

    block_scale = block_scale.reshape(lead_shape + [n_blocks])

    return x_fp4, block_scale, global_scale


def quantize_activation_nvfp4(x_bf16, global_scale, block_size=SF_VEC_SIZE):
    """Quantize a BF16 activation tensor to NVFP4 with a supplied global scale.

    Unlike quantize_to_nvfp4(), the global scale is provided by the caller
    rather than derived from a reduction over the whole tensor. Per-block
    amax values are clamped to ``6 * 448`` so the FP8 (E4M3) block scale
    cannot overflow when the supplied ``global_scale`` is too small.

    If the last dim is not a multiple of ``block_size`` it is zero-padded for
    the block computation; the packed output is trimmed back so it only
    covers the original elements (previously this path raised in
    ``reshape`` because the padded byte count did not match ``D // 2``).
    The trim (slice + copy) only happens on that non-aligned path.

    Args:
        x_bf16: (..., D) BF16 tensor.
        global_scale: Pre-computed global scale (scalar) the input is
            divided by before block quantization.
        block_size: Number of elements sharing one block scale (even).

    Returns:
        x_fp4: (..., ceil(D / 2)) float4_e2m1fn_x2 — packed FP4 codes. For
            odd ``D`` the high nibble of the last byte is a zero pad.
        x_sf: (..., ceil(D / block_size)) float8_e4m3fn — block scales.
    """
    x_f32 = x_bf16.float()
    x_norm = x_f32 / global_scale

    last_dim = x_norm.shape[-1]
    n_blocks = ceil_div(last_dim, block_size)
    padded_dim = n_blocks * block_size

    if padded_dim != last_dim:
        # Zero-pad so the last dim splits evenly into blocks.
        x_norm = torch.nn.functional.pad(x_norm, (0, padded_dim - last_dim))

    x_reshaped = x_norm.reshape(*x_norm.shape[:-1], n_blocks, block_size)
    block_amax = x_reshaped.abs().amax(dim=-1)
    # Blocks whose scale (amax / 6) would fall below 2^-9 are treated as
    # all-zero: their values and their scale are both forced to 0.
    zero_block = block_amax < (6.0 * 2.0 ** -9)
    x_reshaped = torch.where(zero_block.unsqueeze(-1), 0.0, x_reshaped)
    block_amax = block_amax.clamp(min=1e-8, max=6.0 * 448.0)  # E4M3 max = 448
    block_scale = (block_amax / 6.0).to(torch.float8_e4m3fn)
    block_scale = torch.where(zero_block, 0.0, block_scale)

    # Divide by the *rounded* FP8 scale so codes match what dequant will see.
    block_sf_expanded = block_scale.float().unsqueeze(-1)
    x_scaled = x_reshaped / block_sf_expanded.clamp(min=1e-8)
    signs = torch.sign(x_scaled)
    abs_scaled = x_scaled.abs().clamp(max=6.0)

    # NOTE(review): rounding to half-steps and then going through the LUT is
    # not exact nearest-E2M1 above 2.0 (e.g. 2.7 -> 5 half-steps -> 2.0 even
    # though 3.0 is closer). Left unchanged; confirm parity with the CUDA
    # kernels before altering the rounding.
    half_steps = (abs_scaled * 2.0).round().clamp(0, 12).to(torch.int8)
    step_to_idx = _get_step_to_idx_lut(x_bf16.device)
    indices = step_to_idx[half_steps.long()]

    # Sign lives in bit 3 of the nibble.
    nibbles = torch.where(signs < 0, indices + 8, indices).to(torch.uint8)
    even = nibbles[..., ::2]
    odd = nibbles[..., 1::2]
    packed = (odd << 4) | even  # (..., n_blocks, block_size // 2)

    # Flatten the block axis, then drop bytes that only hold padding.
    lead_shape = list(x_bf16.shape[:-1])
    padded_bytes = padded_dim // 2
    packed = packed.reshape(lead_shape + [padded_bytes])
    packed_bytes = (last_dim + 1) // 2
    if packed_bytes != padded_bytes:
        packed = packed[..., :packed_bytes].contiguous()
    x_fp4 = packed.view(torch.float4_e2m1fn_x2)

    block_scale = block_scale.reshape(lead_shape + [n_blocks])

    return x_fp4, block_scale


def quantize_weight_to_nvfp4(w_bf16, block_size=SF_VEC_SIZE):
    """Quantize a BF16 weight matrix to NVFP4, blocking along K (dim 0).

    The weight is (K, N) where K is the input dim (the packed dimension).
    The matrix is normalised by ``global_scale = amax / (6 * 448)``; each
    run of ``block_size`` rows in a column shares one FP8 (E4M3) scale of
    ``block_amax / 6``. Two consecutive K entries are packed per byte (even
    row in the low nibble).

    If K is not a multiple of ``block_size`` it is zero-padded for the block
    computation; the packed output is trimmed back so it only covers the
    original rows (previously this path raised in ``reshape`` because the
    padded row count did not match ``K // 2``).

    Args:
        w_bf16: (K, N) BF16 weight matrix.
        block_size: Number of K entries sharing one block scale (even).

    Returns:
        w_fp4: (ceil(K / 2), N) float4_e2m1fn_x2 — K is the packed dim. For
            odd ``K`` the high nibble of the last row is a zero pad.
        w_sf: (ceil(K / block_size), N) float8_e4m3fn — block scales along K.
        global_scale: 0-dim float32 tensor.
    """
    K, N = w_bf16.shape
    w_f32 = w_bf16.float()
    amax = w_f32.abs().max().clamp(min=1e-8).float()
    global_scale = amax / (6.0 * 448.0)
    w_norm = w_f32 / global_scale

    k_blocks = ceil_div(K, block_size)
    padded_k = k_blocks * block_size
    if padded_k != K:
        # F.pad lists the last dim first: (N_left, N_right, K_top, K_bottom).
        w_norm = torch.nn.functional.pad(w_norm, (0, 0, 0, padded_k - K))

    w_reshaped = w_norm.reshape(k_blocks, block_size, N)
    w_block_amax = w_reshaped.abs().amax(dim=1)
    # Blocks whose scale (amax / 6) would fall below 2^-9 are treated as
    # all-zero: their values and their scale are both forced to 0.
    zero_block = w_block_amax < (6.0 * 2.0 ** -9)
    w_reshaped = torch.where(zero_block.unsqueeze(1),
                              torch.zeros_like(w_reshaped), w_reshaped)
    w_block_amax = w_block_amax.clamp(min=1e-8)
    w_sf = (w_block_amax / 6.0).to(torch.float8_e4m3fn)
    w_sf = torch.where(zero_block, torch.zeros_like(w_sf), w_sf)

    # Divide by the *rounded* FP8 scale so codes match what dequant will see.
    w_block_sf = w_sf.float().unsqueeze(1)
    w_scaled = w_reshaped / w_block_sf.clamp(min=1e-8)

    signs = torch.sign(w_scaled)
    abs_scaled = w_scaled.abs().clamp(max=6.0)

    # NOTE(review): rounding to half-steps and then going through the LUT is
    # not exact nearest-E2M1 above 2.0 (e.g. 2.7 -> 5 half-steps -> 2.0 even
    # though 3.0 is closer). Left unchanged; confirm parity with the CUDA
    # kernels before altering the rounding.
    half_steps = (abs_scaled * 2.0).round().clamp(0, 12).to(torch.int8)
    step_to_idx = _get_step_to_idx_lut(w_bf16.device)
    indices = step_to_idx[half_steps.long()]
    # Sign lives in bit 3 of the nibble.
    nibbles = torch.where(signs < 0, indices + 8, indices).to(torch.uint8)

    even = nibbles[:, ::2, :]
    odd = nibbles[:, 1::2, :]
    packed = (odd << 4) | even  # (k_blocks, block_size // 2, N)

    # Flatten the block axis, then drop rows that only hold padding.
    padded_rows = padded_k // 2
    packed = packed.reshape(padded_rows, N)
    packed_rows = (K + 1) // 2
    if packed_rows != padded_rows:
        packed = packed[:packed_rows].contiguous()
    w_fp4 = packed.view(torch.float4_e2m1fn_x2)
    return w_fp4, w_sf, global_scale


# ── Scale Factor Assembly ─────────────────────────────────────────────
def deinterleave_quantize_nvfp4_cuda(fused_bf16, intermediate, global_scale, granularity=8):
    """Run the ``deinterleave_quantize_nvfp4`` CUDA extension on a fused output.

    This is a thin dispatch layer: it loads (or fetches) the extension built
    from ``deinterleave_quantize.cu`` and forwards the arguments to it in the
    order the extension expects (note ``granularity`` precedes
    ``global_scale`` there).

    Args:
        fused_bf16: (M, 2*intermediate) BF16 — fused L1 output with
            interleaved gate/up columns.
        intermediate: Intermediate dimension (e.g., 3072).
        global_scale: Pre-computed global scale used for quantization.
        granularity: Interleave granularity in BF16 columns (default 8).

    Returns:
        Whatever the extension returns; expected to be
        ``(x_fp4, x_sf)`` with x_fp4 (M, intermediate//2) float4_e2m1fn_x2
        and x_sf (M, intermediate//16) float8_e4m3fn.
    """
    from dsv4.kernels.cuda.loader import get_cuda_module

    kernel_module = get_cuda_module(
        "deinterleave_quantize_nvfp4",
        ["deinterleave_quantize.cu"],
    )
    result = kernel_module.deinterleave_quantize_nvfp4(
        fused_bf16, intermediate, granularity, global_scale
    )
    return result


def deinterleave_amax_quantize_nvfp4_fused(fused_bf16, intermediate, divisor=6.0 * 448.0, granularity=8):
    """Compute gsa on the GPU, then de-interleave + quantize with it.

    Used on the MoE fused_swiglu L2 path. Two extension calls are made:
      1. ``compute_amax_gsa`` (amax_gsa.cu) on ``fused_bf16`` → gsa.
      2. ``deinterleave_quantize_from_buffer`` (fused_amax_quantize.cu),
         which reads gsa from the GPU buffer produced in step 1.

    Between the two calls gsa is broadcast to one entry per row when the
    first kernel hands back a 0-dim tensor, or a single-element tensor while
    there is more than one row.

    Args:
        fused_bf16: (M, 2*intermediate) BF16 — fused L1 output.
        intermediate: Intermediate dimension.
        divisor: gsa = amax / divisor. Default 2688.0.
        granularity: Interleave granularity (default 8).

    Returns:
        x_fp4: expected (M, intermediate//2) float4_e2m1fn_x2.
        x_sf: expected (M, intermediate//16) float8_e4m3fn.
        gsa: float32 GPU tensor holding the global scale handed to the
            quantize kernel (length M after broadcasting).
    """
    from dsv4.kernels.cuda.loader import get_cuda_module

    # Step 1: global scale, kept on the device.
    amax_kernels = get_cuda_module("amax_gsa", ["amax_gsa.cu"])
    gsa_gpu = amax_kernels.compute_amax_gsa(fused_bf16, divisor)

    num_rows = fused_bf16.shape[0]
    is_scalar = gsa_gpu.dim() == 0
    if is_scalar:
        gsa_gpu = gsa_gpu.reshape(1).expand(num_rows).contiguous()
    elif gsa_gpu.shape[0] == 1 and num_rows > 1:
        gsa_gpu = gsa_gpu.expand(num_rows).contiguous()

    # Step 2: de-interleave + quantize, reading gsa from the device buffer.
    quant_kernels = get_cuda_module("fused_amax_quantize", ["fused_amax_quantize.cu"])
    x_fp4, x_sf = quant_kernels.deinterleave_quantize_from_buffer(
        fused_bf16, intermediate, granularity, gsa_gpu
    )
    return x_fp4, x_sf, gsa_gpu


def compute_amax_gsa_gpu(x_bf16, divisor=6.0 * 448.0):
    """Return gsa = max(|x|) / divisor as computed by the ``amax_gsa`` kernel.

    The result is whatever ``compute_amax_gsa`` (amax_gsa.cu) returns — a
    GPU tensor, not a Python float — so no host read-back happens here.

    NOTE: quantize_nvfp4_gpu_fused() wraps this step together with
    quantization; use this function when only gsa is needed.

    Args:
        x_bf16: BF16 input tensor.
        divisor: Value the amax is divided by (default 6.0 * 448.0).
    """
    from dsv4.kernels.cuda.loader import get_cuda_module

    amax_kernels = get_cuda_module("amax_gsa", ["amax_gsa.cu"])
    gsa = amax_kernels.compute_amax_gsa(x_bf16, divisor)
    return gsa


def quantize_nvfp4_gpu_fused(x_bf16, divisor=6.0 * 448.0):
    """Compute gsa on the GPU and quantize to NVFP4 using it (two launches).

    Steps:
      1. ``compute_amax_gsa`` (amax_gsa.cu) → gsa, kept on the device.
      2. ``quantize_nvfp4_from_buffer`` (fused_amax_quantize.cu), which
         reads gsa from that device buffer.

    The two-kernel split replaces an earlier single-kernel design whose
    cross-CTA reduction relied on __syncthreads(), which only synchronises
    within one CTA and therefore raced.

    Args:
        x_bf16: (M, N) BF16 tensor. N must be a multiple of 16. Made
            contiguous here if it is not already.
        divisor: gsa = amax / divisor. Default 6.0 * 448.0 = 2688.0.

    Returns:
        x_fp4: expected (M, N//2) float4_e2m1fn_x2.
        x_sf: expected (M, N//16) float8_e4m3fn.
        gsa: float32 GPU tensor with the global scale handed to the
            quantize kernel (expanded to length M when M > 1).
    """
    # The CUDA kernels need contiguous memory (e.g. column slices are not).
    # Already-contiguous inputs pass through untouched, so no copy is made.
    x_bf16 = x_bf16 if x_bf16.is_contiguous() else x_bf16.contiguous()

    from dsv4.kernels.cuda.loader import get_cuda_module

    amax_kernels = get_cuda_module("amax_gsa", ["amax_gsa.cu"])
    gsa_gpu = amax_kernels.compute_amax_gsa(x_bf16, divisor)

    # Shape gsa for the quantize-from-buffer kernel:
    #   * 0-dim result → reshaped to (1,), which needs no new allocation;
    #   * more than one row → expanded and materialised as (M,).
    # With a single row (the graph-captured decode case) only the reshape
    # runs, keeping that path allocation-free.
    num_rows = x_bf16.shape[0]
    if gsa_gpu.dim() == 0:
        gsa_gpu = gsa_gpu.reshape(1)
    if num_rows > 1:
        gsa_gpu = gsa_gpu.expand(num_rows).contiguous()

    quant_kernels = get_cuda_module("fused_amax_quantize", ["fused_amax_quantize.cu"])
    x_fp4, x_sf = quant_kernels.quantize_nvfp4_from_buffer(x_bf16, gsa_gpu)
    return x_fp4, x_sf, gsa_gpu


def quantize_nvfp4_gpu(x_bf16, global_scale):
    """Quantize BF16 to NVFP4 via the ``quantize_nvfp4`` CUDA extension.

    Thin dispatch to ``quantize_nvfp4`` in quantize_nvfp4.cu with a global
    scale the caller already has (from warmup or a known value). It is the
    kernel-backed counterpart of quantize_activation_nvfp4().

    NOTE: quantize_nvfp4_gpu_fused() additionally derives gsa on the GPU;
    use this function when the global scale is already known.

    Args:
        x_bf16: (M, N) BF16 tensor. N must be a multiple of 16.
        global_scale: Pre-computed float32 scalar global scale.

    Returns:
        Whatever the extension returns; expected to be ``(x_fp4, x_sf)``
        with x_fp4 (M, N//2) float4_e2m1fn_x2 and x_sf (M, N//16)
        float8_e4m3fn.
    """
    from dsv4.kernels.cuda.loader import get_cuda_module

    quant_kernels = get_cuda_module("quantize_nvfp4", ["quantize_nvfp4.cu"])
    quantized = quant_kernels.quantize_nvfp4(x_bf16, global_scale)
    return quantized


class QuantizedActivation:
    """Container for an activation that has already been quantized to NVFP4.

    Bundles the packed FP4 data, its block scales and the global scale (plus
    an optional inverse-RMS vector) so that a downstream consumer can go
    directly to the GEMM without quantizing again.

    Produced by rmsnorm_quantize_nvfp4() and mhc_rmsnorm_quantize_nvfp4();
    intended to be consumed by Nvfp4Linear.run_from_quantized().

    Attributes are fixed via ``__slots__``; ``num_tokens`` is derived from
    the leading dimension of ``x_fp4`` at construction time.
    """
    __slots__ = ['x_fp4', 'x_sf', 'gsa', 'inv_rms', 'num_tokens']

    def __init__(self, x_fp4, x_sf, gsa, inv_rms=None):
        # Leading dim of the packed data = number of rows (tokens).
        self.num_tokens = x_fp4.shape[0]
        # Packed FP4 payload, expected (M, N//2), and its block scales,
        # expected (M, N//16) E4M3.
        self.x_fp4, self.x_sf = x_fp4, x_sf
        # Global scale, expected (M,) FP32, and optional 1/RMS, (M,) FP32.
        self.gsa, self.inv_rms = gsa, inv_rms


def dequantize_nvfp4(x_fp4, x_sf, gsa, shape=None):
    """Dequantize NVFP4 data through the ``dequant_nvfp4`` CUDA extension.

    Inputs are normalised to what the kernel takes before dispatch: a 2-D
    ``gsa`` has its second dim squeezed away, and ``x_fp4`` / ``x_sf`` are
    reinterpreted as uint8 when they carry any other dtype.

    Args:
        x_fp4: (M, N//2) packed FP4 data.
        x_sf: (M, N//16) E4M3 block scales.
        gsa: (M,) or (M, 1) or (1,) FP32 global scale per row.
        shape: Unused; accepted only for API compatibility.

    Returns:
        The kernel's output; expected to be an (M, N) BF16 tensor.
    """
    from dsv4.kernels.cuda.loader import get_cuda_module

    dequant_kernels = get_cuda_module("dequant_nvfp4", ["dequant_nvfp4.cu"])

    # (M, 1) → (M,); 1-D inputs are forwarded as they are.
    row_scale = gsa.squeeze(1) if gsa.dim() == 2 else gsa

    # The kernel takes raw bytes for both the FP4 payload and the scales.
    fp4_bytes = x_fp4 if x_fp4.dtype == torch.uint8 else x_fp4.view(torch.uint8)
    sf_bytes = x_sf if x_sf.dtype == torch.uint8 else x_sf.view(torch.uint8)

    return dequant_kernels.dequant_nvfp4(fp4_bytes, sf_bytes, row_scale)


def mhc_rmsnorm_quantize_nvfp4(X_l, A_l, norm_weight, eps=1e-6, divisor=6.0 * 448.0):
    """Fused mHC pre_block + RMSNorm + NVFP4 quantize via one CUDA extension.

    Dispatches to ``mhc_rmsnorm_quantize_nvfp4`` in
    fused_mhc_rmsnorm_quantize.cu and wraps its four outputs. The fused path
    is meant to stand in for the separate bmm, rmsnorm and quantize steps
    (7+ launches per site) with 2 launches per site.

    Args:
        X_l: (M, n_hc, N) BF16 tensor. n_hc must be <= 4, N multiple of 16.
        A_l: (M, n_hc) BF16 tensor. Softmax weights from mHC._dynamic_params.
        norm_weight: (N,) FP32 RMSNorm weight.
        eps: RMSNorm epsilon (default 1e-6).
        divisor: gsa = amax / divisor. Default 6.0 * 448.0 = 2688.0.

    Returns:
        QuantizedActivation holding x_fp4, x_sf, gsa and inv_rms.
    """
    from dsv4.kernels.cuda.loader import get_cuda_module

    fused_kernels = get_cuda_module(
        "fused_mhc_rmsnorm_quantize",
        ["fused_mhc_rmsnorm_quantize.cu"],
    )
    outputs = fused_kernels.mhc_rmsnorm_quantize_nvfp4(X_l, A_l, norm_weight, eps, divisor)
    x_fp4, x_sf, gsa, inv_rms = outputs
    return QuantizedActivation(x_fp4, x_sf, gsa, inv_rms)


def rmsnorm_quantize_nvfp4(x_bf16, norm_weight, eps=1e-6, divisor=6.0 * 448.0):
    """Fused RMSNorm + amax + NVFP4 quantize via one CUDA extension.

    Dispatches to ``rmsnorm_quantize_nvfp4`` in fused_rmsnorm_quantize.cu
    and wraps its four outputs. The fused path is meant to replace the
    unfused sequence (rmsnorm, then quantize_nvfp4_gpu_fused — 6+ launches
    per call) with 2 launches per call:
      Kernel 1: RMS + amax of the normalized output → gsa per row (GPU buffer)
      Kernel 2: normalize + quantize using gsa from that buffer (no CPU sync)

    Args:
        x_bf16: (M, N) BF16 tensor. N must be a multiple of 16.
        norm_weight: (N,) FP32 RMSNorm weight.
        eps: RMSNorm epsilon (default 1e-6).
        divisor: gsa = amax / divisor. Default 6.0 * 448.0 = 2688.0.

    Returns:
        QuantizedActivation whose fields are expected to be:
            x_fp4: (M, N//2) packed FP4
            x_sf: (M, N//16) E4M3 block scales
            gsa: (M,) FP32 per-row global scale for GEMM
            inv_rms: (M,) FP32 per-row 1/RMS
    """
    from dsv4.kernels.cuda.loader import get_cuda_module

    fused_kernels = get_cuda_module(
        "fused_rmsnorm_quantize",
        ["fused_rmsnorm_quantize.cu"],
    )
    outputs = fused_kernels.rmsnorm_quantize_nvfp4(x_bf16, norm_weight, eps, divisor)
    x_fp4, x_sf, gsa, inv_rms = outputs
    return QuantizedActivation(x_fp4, x_sf, gsa, inv_rms)
-												Restructure: cutedsl/ -> dsv4/ with proper layering

- Split bridge.py -> ops/quantize.py, ops/layouts.py, ops/gemm_runner.py
- Renamed classes: CuTeDSLNvfp4Linear -> Nvfp4Linear, etc.
- Moved kernel code to dsv4/kernels/ (gemm, attention, compressor, decode, cuda)
- Moved PyTorch bridges to dsv4/ops/
- Moved nn.Module layers to dsv4layers/
- Moved reference implementations to dsv4/reference/
- Moved vendored CUTLASS code to vendored/
- Archived ~190 debug tests to tests/archive/
- Kept ~15 canonical tests in tests/unit/
- Updated all import paths
- Added stubs for future components (model/, cache/, loader/)
- Updated pyproject.toml: dsv4-inference package name

											
										
										
											2026-05-21 17:30:44 +00:00
+								"""NVFP4 quantization: BF16 <-> NVFP4 conversion, scale factor computation."""
 								import math
 								import torch
 								import cutlass
 								import cutlass.cute as cute
 								import cutlass.torch as cutlass_torch
 								import cutlass.utils as utils
-												fix: import ceil_div in quantize.py (was NameError at runtime)

											
										
										
											2026-05-23 08:40:24 +00:00
+								from dsv4.ops.layouts import ceil_div
-												Restructure: cutedsl/ -> dsv4/ with proper layering

- Split bridge.py -> ops/quantize.py, ops/layouts.py, ops/gemm_runner.py
- Renamed classes: CuTeDSLNvfp4Linear -> Nvfp4Linear, etc.
- Moved kernel code to dsv4/kernels/ (gemm, attention, compressor, decode, cuda)
- Moved PyTorch bridges to dsv4/ops/
- Moved nn.Module layers to dsv4layers/
- Moved reference implementations to dsv4/reference/
- Moved vendored CUTLASS code to vendored/
- Archived ~190 debug tests to tests/archive/
- Kept ~15 canonical tests in tests/unit/
- Updated all import paths
- Added stubs for future components (model/, cache/, loader/)
- Updated pyproject.toml: dsv4-inference package name

											
										
										
											2026-05-21 17:30:44 +00:00
 								from dsv4.kernels.gemm.grouped import (
 								    cat_byte_reinterpretable_tensors,
 								    stack_byte_reinterpretable_tensors,
 								)
 								E2M1_MAGNITUDES = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]
 								# Cache compiled kernels + pre-allocated workspace by cache_key
 								# Each entry: {'compiled': callable, 'workspace': Tensor, 'workspace_size': int}
 								#
 								# Key design decisions (Bug #1 fix):
 								# - cute.compile does NOT corrupt GPU memory (verified 2026-05-20 on B200).
 								#   The original _needs_token_refill hack was a misdiagnosis. The real bug
 								#   was elsewhere (likely OOB write or weight loading).
 								# - Workspace is pre-allocated per cache entry during warmup_compilation()
 								#   and reused on subsequent calls. No torch.full() in the hot path.
 								# - CuTe tensor wrappers (from_dlpack + mark_layout_dynamic) are cheap
 								#   metadata wrappers. We re-create them per call from real tensors.
 								#   Caching them would hold stale references to tensors that get freed.
 								# Cached LUT for E2M1 quantization (created once per device, cudagraph-safe)
 								_NVFP4_STEP_LUT_CACHE = {}
 								def _get_step_to_idx_lut(device):
 								    """Get or create the E2M1 step-to-index LUT for the given device.
 								    Cached per device to avoid CPU->CUDA copies during cudagraph capture.
 								    Must be pre-populated during warmup (before torch.compile/cudagraph capture)
 								    so the lock is never entered on the compiled path.
 								    """
 								    # Fast path: already cached — no lock needed (torch.compile-safe)
 								    if device in _NVFP4_STEP_LUT_CACHE:
 								        return _NVFP4_STEP_LUT_CACHE[device]
 								    # Slow path: first call, create the LUT
 								    lut = torch.as_tensor(
 								        [0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 7, 7],
 								        dtype=torch.int8, device=device,
 								    )
 								    _NVFP4_STEP_LUT_CACHE[device] = lut
 								    return lut
 								SF_VEC_SIZE = 16  # NVFP4 block size
 								def quantize_to_nvfp4(x_bf16, block_size=SF_VEC_SIZE):
 								    """Quantize BF16 tensor to NVFP4.
 								    Args:
 								        x_bf16: (..., D) BF16 tensor
 								    Returns:
 								        x_fp4: (..., D//2) float4_e2m1fn_x2 — native PyTorch FP4
 								        x_sf: (..., D//16) float8_e4m3fn — block scales
 								        global_scale: float32 scalar
 								    """
 								    x_f32 = x_bf16.float()
 								    amax = x_f32.abs().max().clamp(min=1e-8).float()
 								    global_scale = amax / (6.0 * 448.0)
 								    x_norm = x_f32 / global_scale
 								    last_dim = x_norm.shape[-1]
 								    n_blocks = ceil_div(last_dim, block_size)
 								    if last_dim % block_size != 0:
 								        pad_size = n_blocks * block_size - last_dim
 								        x_norm = torch.nn.functional.pad(x_norm, (0, pad_size))
 								    x_reshaped = x_norm.reshape(*x_norm.shape[:-1], n_blocks, block_size)
 								    block_amax = x_reshaped.abs().amax(dim=-1)
 								    # Detect zero blocks and underflow blocks (amax > 0 but too small for FP8).
 								    # Smallest positive FP8 e4m3fn is 2^-9 ≈ 1.95e-3. If amax/6 < this,
 								    # the block scale underflows to 0, and dividing x by the clamped 1e-8
 								    # inflates values into nonzero FP4 buckets — producing wrong results.
 								    zero_block = block_amax < (6.0 * 2.0 ** -9)  # < ~0.0117
 								    # Zero out x for zero/underflow blocks before division.
 								    # This ensures x_scaled = 0 → FP4 nibbles = 0.
-												CUDA graph: Fix per-call allocations in grouped_linear and quantize

1. grouped_linear.py: Pre-allocate _scale_a_buf for swizzle
   - Same fix as linear.py — avoids torch.zeros per call
   - Uses correctly-sized view for pad_and_swizzle_single

2. quantize.py: Replace torch.zeros_like with scalar 0.0
   - torch.zeros_like allocates a full tensor every call
   - torch.where(cond, 0.0, x) broadcasts scalar — no allocation

											
										
										
											2026-06-03 17:39:20 +00:00
+								    # Use scalar 0.0 instead of torch.zeros_like — no allocation, graph-safe.
 								    x_reshaped = torch.where(zero_block.unsqueeze(-1), 0.0, x_reshaped)
-												Restructure: cutedsl/ -> dsv4/ with proper layering

- Split bridge.py -> ops/quantize.py, ops/layouts.py, ops/gemm_runner.py
- Renamed classes: CuTeDSLNvfp4Linear -> Nvfp4Linear, etc.
- Moved kernel code to dsv4/kernels/ (gemm, attention, compressor, decode, cuda)
- Moved PyTorch bridges to dsv4/ops/
- Moved nn.Module layers to dsv4layers/
- Moved reference implementations to dsv4/reference/
- Moved vendored CUTLASS code to vendored/
- Archived ~190 debug tests to tests/archive/
- Kept ~15 canonical tests in tests/unit/
- Updated all import paths
- Added stubs for future components (model/, cache/, loader/)
- Updated pyproject.toml: dsv4-inference package name

											
										
										
											2026-05-21 17:30:44 +00:00
+								    block_amax = block_amax.clamp(min=1e-8)
 								    block_scale = (block_amax / 6.0).to(torch.float8_e4m3fn)
 								    # Force zero/underflow blocks: FP8 scale = 0 (exact zero).
-												CUDA graph: Fix per-call allocations in grouped_linear and quantize

1. grouped_linear.py: Pre-allocate _scale_a_buf for swizzle
   - Same fix as linear.py — avoids torch.zeros per call
   - Uses correctly-sized view for pad_and_swizzle_single

2. quantize.py: Replace torch.zeros_like with scalar 0.0
   - torch.zeros_like allocates a full tensor every call
   - torch.where(cond, 0.0, x) broadcasts scalar — no allocation

											
										
										
											2026-06-03 17:39:20 +00:00
+								    block_scale = torch.where(zero_block, 0.0, block_scale)
-												Restructure: cutedsl/ -> dsv4/ with proper layering

- Split bridge.py -> ops/quantize.py, ops/layouts.py, ops/gemm_runner.py
- Renamed classes: CuTeDSLNvfp4Linear -> Nvfp4Linear, etc.
- Moved kernel code to dsv4/kernels/ (gemm, attention, compressor, decode, cuda)
- Moved PyTorch bridges to dsv4/ops/
- Moved nn.Module layers to dsv4layers/
- Moved reference implementations to dsv4/reference/
- Moved vendored CUTLASS code to vendored/
- Archived ~190 debug tests to tests/archive/
- Kept ~15 canonical tests in tests/unit/
- Updated all import paths
- Added stubs for future components (model/, cache/, loader/)
- Updated pyproject.toml: dsv4-inference package name

											
										
										
											2026-05-21 17:30:44 +00:00
 								    # Nearest E2M1
 								    block_sf_expanded = block_scale.float().unsqueeze(-1)
 								    x_scaled = x_reshaped / block_sf_expanded.clamp(min=1e-8)
 								    signs = torch.sign(x_scaled)
 								    abs_scaled = x_scaled.abs().clamp(max=6.0)
 								    half_steps = (abs_scaled * 2.0).round().clamp(0, 12).to(torch.int8)
 								    step_to_idx = _get_step_to_idx_lut(x_bf16.device)
 								    indices = step_to_idx[half_steps.long()]
 								    nibbles = torch.where(signs < 0, indices + 8, indices).to(torch.uint8)
 								    even = nibbles[..., ::2]
 								    odd = nibbles[..., 1::2]
 								    packed = (odd << 4) | even
 								    packed_shape = list(x_bf16.shape)
 								    packed_shape[-1] = last_dim // 2
 								    x_fp4 = packed.view(torch.float4_e2m1fn_x2).reshape(packed_shape)
 								    sf_shape = list(x_bf16.shape[:-1]) + [n_blocks]
 								    block_scale = block_scale.reshape(sf_shape)
 								    return x_fp4, block_scale, global_scale
 								def quantize_activation_nvfp4(x_bf16, global_scale, block_size=SF_VEC_SIZE):
 								    """Quantize BF16 activation tensor to NVFP4 (cudagraph-safe).
 								    Unlike quantize_to_nvfp4(), this takes a pre-computed global_scale
 								    instead of computing it via .max() (which forces CPU-GPU sync).
 								    All operations are pure GPU with no CPU-GPU syncs.
 								    Args:
 								        x_bf16: (..., D) BF16 tensor
 								        global_scale: float32 scalar (pre-computed, NOT from .max())
 								        block_size: NVFP4 block size
 								    Returns:
 								        x_fp4: (..., D//2) float4_e2m1fn_x2
 								        x_sf: (..., D//16) float8_e4m3fn
 								    """
 								    x_f32 = x_bf16.float()
 								    x_norm = x_f32 / global_scale
 								    last_dim = x_norm.shape[-1]
 								    n_blocks = ceil_div(last_dim, block_size)
 								    if last_dim % block_size != 0:
 								        pad_size = n_blocks * block_size - last_dim
 								        x_norm = torch.nn.functional.pad(x_norm, (0, pad_size))
 								    x_reshaped = x_norm.reshape(*x_norm.shape[:-1], n_blocks, block_size)
 								    block_amax = x_reshaped.abs().amax(dim=-1)
 								    # Detect zero blocks and underflow blocks (same threshold as quantize_to_nvfp4).
 								    zero_block = block_amax < (6.0 * 2.0 ** -9)
-												CUDA graph: Eliminate per-step allocations in graph-captured code paths

- gemm_runner.py: Add out= parameter to run_nvfp4_grouped_gemm and
  run_fused_swiglu_grouped_gemm to accept pre-allocated output buffers
- quantize.py: Replace torch.zeros_like/torch.zeros with scalar 0.0 in
  torch.where() calls (graph-capturable, no memory allocation)
- Both fixes prevent 'Disallowed operation during CUDA stream capture'
  errors during graph capture

											
										
										
											2026-06-03 21:30:24 +00:00
+								    x_reshaped = torch.where(zero_block.unsqueeze(-1), 0.0, x_reshaped)
-												fix: clamp block_amax to E4M3 max (448) in quantize_activation_nvfp4 — prevents NaN from overflow

											
										
										
											2026-06-01 04:59:06 +00:00
+								    block_amax = block_amax.clamp(min=1e-8, max=6.0 * 448.0)  # E4M3 max = 448
-												Restructure: cutedsl/ -> dsv4/ with proper layering

- Split bridge.py -> ops/quantize.py, ops/layouts.py, ops/gemm_runner.py
- Renamed classes: CuTeDSLNvfp4Linear -> Nvfp4Linear, etc.
- Moved kernel code to dsv4/kernels/ (gemm, attention, compressor, decode, cuda)
- Moved PyTorch bridges to dsv4/ops/
- Moved nn.Module layers to dsv4layers/
- Moved reference implementations to dsv4/reference/
- Moved vendored CUTLASS code to vendored/
- Archived ~190 debug tests to tests/archive/
- Kept ~15 canonical tests in tests/unit/
- Updated all import paths
- Added stubs for future components (model/, cache/, loader/)
- Updated pyproject.toml: dsv4-inference package name

											
										
										
											2026-05-21 17:30:44 +00:00
+								    block_scale = (block_amax / 6.0).to(torch.float8_e4m3fn)
-												CUDA graph: Eliminate per-step allocations in graph-captured code paths

- gemm_runner.py: Add out= parameter to run_nvfp4_grouped_gemm and
  run_fused_swiglu_grouped_gemm to accept pre-allocated output buffers
- quantize.py: Replace torch.zeros_like/torch.zeros with scalar 0.0 in
  torch.where() calls (graph-capturable, no memory allocation)
- Both fixes prevent 'Disallowed operation during CUDA stream capture'
  errors during graph capture

											
										
										
											2026-06-03 21:30:24 +00:00
+								    block_scale = torch.where(zero_block, 0.0, block_scale)
-												Restructure: cutedsl/ -> dsv4/ with proper layering

- Split bridge.py -> ops/quantize.py, ops/layouts.py, ops/gemm_runner.py
- Renamed classes: CuTeDSLNvfp4Linear -> Nvfp4Linear, etc.
- Moved kernel code to dsv4/kernels/ (gemm, attention, compressor, decode, cuda)
- Moved PyTorch bridges to dsv4/ops/
- Moved nn.Module layers to dsv4/layers/
- Moved reference implementations to dsv4/reference/
- Moved vendored CUTLASS code to vendored/
- Archived ~190 debug tests to tests/archive/
- Kept ~15 canonical tests in tests/unit/
- Updated all import paths
- Added stubs for future components (model/, cache/, loader/)
- Updated pyproject.toml: dsv4-inference package name

											
										
										
											2026-05-21 17:30:44 +00:00
 								    block_sf_expanded = block_scale.float().unsqueeze(-1)
 								    x_scaled = x_reshaped / block_sf_expanded.clamp(min=1e-8)
 								    signs = torch.sign(x_scaled)
 								    abs_scaled = x_scaled.abs().clamp(max=6.0)
 								    half_steps = (abs_scaled * 2.0).round().clamp(0, 12).to(torch.int8)
 								    step_to_idx = _get_step_to_idx_lut(x_bf16.device)
 								    indices = step_to_idx[half_steps.long()]
 								    nibbles = torch.where(signs < 0, indices + 8, indices).to(torch.uint8)
 								    even = nibbles[..., ::2]
 								    odd = nibbles[..., 1::2]
 								    packed = (odd << 4) | even
 								    packed_shape = list(x_bf16.shape)
 								    packed_shape[-1] = last_dim // 2
 								    x_fp4 = packed.view(torch.float4_e2m1fn_x2).reshape(packed_shape)
 								    sf_shape = list(x_bf16.shape[:-1]) + [n_blocks]
 								    block_scale = block_scale.reshape(sf_shape)
 								    return x_fp4, block_scale
 								def quantize_weight_to_nvfp4(w_bf16, block_size=SF_VEC_SIZE):
 								    """Quantize BF16 weight matrix to NVFP4 (packed E2M1 values + E4M3 block scales).

 								    The weight is (K, N) where K is the input dim (packed dimension).
 								    Block scales are computed along K (dim 0). When K is not a multiple of
 								    ``block_size`` the weight is zero-padded along K for the block reduction
 								    and the padding rows are trimmed from the packed output again.

 								    Args:
 								        w_bf16: (K, N) BF16 weight matrix
 								        block_size: number of consecutive K elements sharing one block scale.
 								            Must be even, because values are packed pairwise along K.
 								    Returns:
 								        w_fp4: (ceil(K/2), N) float4_e2m1fn_x2 — K is the packed dim
 								        w_sf: (ceil(K/block_size), N) float8_e4m3fn — block scales along K
 								        global_scale: float32 0-dim tensor, amax(|w|) / (6 * 448)
 								    """
 								    K, N = w_bf16.shape
 								    w_f32 = w_bf16.float()
 								    # Clamp so an all-zero weight does not produce a zero global scale.
 								    amax = w_f32.abs().max().clamp(min=1e-8)
 								    global_scale = amax / (6.0 * 448.0)
 								    w_norm = w_f32 / global_scale
 								    k_blocks = ceil_div(K, block_size)
 								    k_padded = k_blocks * block_size
 								    if k_padded != K:
 								        # Zero-pad the end of dim 0 so K splits evenly into blocks.
 								        w_norm = torch.nn.functional.pad(w_norm, (0, 0, 0, k_padded - K))
 								    w_reshaped = w_norm.reshape(k_blocks, block_size, N)
 								    w_block_amax = w_reshaped.abs().amax(dim=1)
 								    # Detect zero blocks and underflow blocks (same threshold).
 								    zero_block = w_block_amax < (6.0 * 2.0 ** -9)
 								    # Scalar 0.0 instead of zeros_like: no extra allocation, same result.
 								    w_reshaped = torch.where(zero_block.unsqueeze(1), 0.0, w_reshaped)
 								    # Upper clamp keeps amax / 6 within the E4M3 max (448), mirroring the
 								    # activation quantizer; guards the float8 cast against rounding overshoot.
 								    w_block_amax = w_block_amax.clamp(min=1e-8, max=6.0 * 448.0)
 								    # Zero/underflow blocks get an exact 0 scale; masking in float32 before
 								    # the cast yields the same float8 values as masking afterwards.
 								    w_sf = torch.where(zero_block, 0.0, w_block_amax / 6.0).to(torch.float8_e4m3fn)
 								    w_block_sf = w_sf.float().unsqueeze(1)
 								    # Clamp the divisor so zeroed blocks divide 0 by a tiny number, not by 0.
 								    w_scaled = w_reshaped / w_block_sf.clamp(min=1e-8)
 								    signs = torch.sign(w_scaled)
 								    abs_scaled = w_scaled.abs().clamp(max=6.0)
 								    # Half-step index 0..12 (multiples of 0.5 up to 6.0), mapped through the LUT
 								    # to the 3-bit E2M1 magnitude index.
 								    half_steps = (abs_scaled * 2.0).round().clamp(0, 12).to(torch.int8)
 								    step_to_idx = _get_step_to_idx_lut(w_bf16.device)
 								    indices = step_to_idx[half_steps.long()]
 								    # Bit 3 of the nibble is the sign bit.
 								    nibbles = torch.where(signs < 0, indices + 8, indices).to(torch.uint8)
 								    # Pack two consecutive K elements per byte: even index -> low nibble.
 								    even = nibbles[:, ::2, :]
 								    odd = nibbles[:, 1::2, :]
 								    packed = (odd << 4) | even
 								    # packed is (k_blocks, block_size // 2, N), i.e. it covers the *padded* K.
 								    # Flatten over the padded extent, then drop rows that hold only padding;
 								    # reshaping straight to (K // 2, N) fails whenever padding was added.
 								    packed = packed.reshape(k_padded // 2, N)[: ceil_div(K, 2)]
 								    w_fp4 = packed.view(torch.float4_e2m1fn_x2)
 								    return w_fp4, w_sf, global_scale
 								# ── Scale Factor Assembly ─────────────────────────────────────────────
 								def deinterleave_quantize_nvfp4_cuda(fused_bf16, intermediate, global_scale, granularity=8):
 								    """De-interleave + quantize fused SwiGLU output using a custom CUDA kernel.

 								    Single kernel launch, no Python loop. 4x faster than the Python path.
 								    The kernel module is obtained through ``get_cuda_module`` rather than
 								    being built inside this function.

 								    Args:
 								        fused_bf16: (M, 2*intermediate) BF16 — fused L1 output with interleaved gate/up
 								        intermediate: intermediate dimension (e.g., 3072)
 								        global_scale: pre-computed global scale for quantization
 								        granularity: interleave granularity in BF16 columns (default 8)
 								    Returns:
 								        x_fp4: (M, intermediate//2) float4_e2m1fn_x2 — quantized SwiGLU
 								        x_sf: (M, intermediate//16) float8_e4m3fn — block scales
 								    """
 								    from dsv4.kernels.cuda.loader import get_cuda_module
 								    mod = get_cuda_module("deinterleave_quantize_nvfp4", ["deinterleave_quantize.cu"])
 								    # Kernel argument order differs from this wrapper's signature:
 								    # granularity comes before global_scale.
 								    return mod.deinterleave_quantize_nvfp4(fused_bf16, intermediate, granularity, global_scale)
-												NVFP4-1.1 integration: GPU-only quantize kernel + MoE pipeline wiring

- Add quantize_nvfp4.cu: BF16→FP4 GPU kernel (no CPU sync, warp shuffle amax)
- Add quantize_nvfp4_gpu() bridge in ops/quantize.py
- Fix deinterleave_quantize kernel path (dsv4/ops/kernels → dsv4/kernels/cuda)
- Wire GPU quantize into Nvfp4MoE._run_impl():
  - L1 input: quantize_nvfp4_gpu (replaces quantize_activation_nvfp4)
  - Fused SwiGLU L2: deinterleave_quantize_nvfp4_cuda (single kernel)
  - Non-fused L2: quantize_nvfp4_gpu
- Add test_nvfp4_gpu_quantize.py for both kernels

											
										
										
											2026-05-25 16:19:04 +00:00
-												P0 COMPLETE: Eliminate ALL .item() CPU-GPU syncs from NVFP4 activation path

Fused kernels (zero CPU sync, single kernel launch per projection):
- fused_amax_quantize.cu: amax→gsa→quantize in one pass. Replaces two-step
  compute_amax_gsa_gpu + quantize_nvfp4_gpu (had .item() sync).
- fused_deinterleave_amax_quantize.cu: Same for MoE fused_swiglu L2 path.
  Deinterleave + amax + quantize in one pass. Replaces compute_amax_gsa_gpu
  + deinterleave_quantize_nvfp4_cuda (had .item() sync).

All kernel loaders use dsv4/kernels/cuda/loader.py (compile-once cache).
Was JIT-compiling on every call via torch.utils.cpp_extension.load (~100ms/call,
~500 calls/token). Now compiles once and reuses the cached module.

Updated layers:
- linear.py Nvfp4Linear._run_impl: fused kernel, gsa via GPU buffer
- moe.py Nvfp4MoE._run_impl: fused for L1 and L2 (both fused_swiglu and
  non-fused paths)
- shared_expert.py: fused for L1 and L2
- quantize.py: All functions use module loader cache
- sampler.py: Uses module loader cache
- indexer/score_topk.py: Uses module loader cache

P2: Vectorized KVCache.append_swa — index_copy_ instead of Python loop.
2 kernel launches instead of 2T. No .item() in comp_pos either.

P3: Pre-allocated comp_kv buffers — O(1) append instead of O(N) torch.cat.
max_comp=32768 per layer (32MB). No more quadratic memory growth.

~486 .item() syncs per decoded token → ~0 (only argmax + token decode remain).

											
										
										
											2026-06-01 21:05:03 +00:00
+								def deinterleave_amax_quantize_nvfp4_fused(fused_bf16, intermediate, divisor=6.0 * 448.0, granularity=8):
-												CRITICAL FIX: fused_amax_quantize cross-CTA race condition

The single-kernel approach used __syncthreads() for cross-CTA amax
reduction, but __syncthreads() only syncs within a CTA (same blockIdx).
CTA 0 reading s_amax[1] before CTA 1 writes = race condition = garbage gsa.

Result: residual |X| exploded to 10^37 by L0. F_attn and F_ffn were 0.0.

Fix: Two-kernel approach (correct, zero CPU syncs):
  Kernel 1: amax_gsa.cu — computes gsa on GPU, returns GPU tensor
  Kernel 2: quantize_nvfp4_from_buffer — reads gsa from GPU buffer

The fused_amax_quantize.cu now exports quantize_nvfp4_from_buffer and
deinterleave_quantize_from_buffer (gsa from GPU buffer, not kernel param).

Same P0 win: zero .item() syncs. Two kernel launches instead of one,
but correctness > shaving one launch.

											
										
										
											2026-06-01 21:26:51 +00:00
+								    """Fused deinterleave + amax + quantize: zero CPU syncs, two kernel launches.
-												P0 COMPLETE: Eliminate ALL .item() CPU-GPU syncs from NVFP4 activation path

Fused kernels (zero CPU sync, single kernel launch per projection):
- fused_amax_quantize.cu: amax→gsa→quantize in one pass. Replaces two-step
  compute_amax_gsa_gpu + quantize_nvfp4_gpu (had .item() sync).
- fused_deinterleave_amax_quantize.cu: Same for MoE fused_swiglu L2 path.
  Deinterleave + amax + quantize in one pass. Replaces compute_amax_gsa_gpu
  + deinterleave_quantize_nvfp4_cuda (had .item() sync).

All kernel loaders use dsv4/kernels/cuda/loader.py (compile-once cache).
Was JIT-compiling on every call via torch.utils.cpp_extension.load (~100ms/call,
~500 calls/token). Now compiles once and reuses the cached module.

Updated layers:
- linear.py Nvfp4Linear._run_impl: fused kernel, gsa via GPU buffer
- moe.py Nvfp4MoE._run_impl: fused for L1 and L2 (both fused_swiglu and
  non-fused paths)
- shared_expert.py: fused for L1 and L2
- quantize.py: All functions use module loader cache
- sampler.py: Uses module loader cache
- indexer/score_topk.py: Uses module loader cache

P2: Vectorized KVCache.append_swa — index_copy_ instead of Python loop.
2 kernel launches instead of 2T. No .item() in comp_pos either.

P3: Pre-allocated comp_kv buffers — O(1) append instead of O(N) torch.cat.
max_comp=32768 per layer (32MB). No more quadratic memory growth.

~486 .item() syncs per decoded token → ~0 (only argmax + token decode remain).

											
										
										
											2026-06-01 21:05:03 +00:00
-												CRITICAL FIX: fused_amax_quantize cross-CTA race condition

The single-kernel approach used __syncthreads() for cross-CTA amax
reduction, but __syncthreads() only syncs within a CTA (same blockIdx).
CTA 0 reading s_amax[1] before CTA 1 writes = race condition = garbage gsa.

Result: residual |X| exploded to 10^37 by L0. F_attn and F_ffn were 0.0.

Fix: Two-kernel approach (correct, zero CPU syncs):
  Kernel 1: amax_gsa.cu — computes gsa on GPU, returns GPU tensor
  Kernel 2: quantize_nvfp4_from_buffer — reads gsa from GPU buffer

The fused_amax_quantize.cu now exports quantize_nvfp4_from_buffer and
deinterleave_quantize_from_buffer (gsa from GPU buffer, not kernel param).

Same P0 win: zero .item() syncs. Two kernel launches instead of one,
but correctness > shaving one launch.

											
										
										
											2026-06-01 21:26:51 +00:00
+								    For the MoE fused_swiglu L2 path. Two-kernel approach (correct):
 								      Kernel 1: compute_amax_gsa on the de-interleaved values (GPU-only)
 								      Kernel 2: deinterleave_quantize_from_buffer using gsa from GPU buffer
-												P0 COMPLETE: Eliminate ALL .item() CPU-GPU syncs from NVFP4 activation path

Fused kernels (zero CPU sync, single kernel launch per projection):
- fused_amax_quantize.cu: amax→gsa→quantize in one pass. Replaces two-step
  compute_amax_gsa_gpu + quantize_nvfp4_gpu (had .item() sync).
- fused_deinterleave_amax_quantize.cu: Same for MoE fused_swiglu L2 path.
  Deinterleave + amax + quantize in one pass. Replaces compute_amax_gsa_gpu
  + deinterleave_quantize_nvfp4_cuda (had .item() sync).

All kernel loaders use dsv4/kernels/cuda/loader.py (compile-once cache).
Was JIT-compiling on every call via torch.utils.cpp_extension.load (~100ms/call,
~500 calls/token). Now compiles once and reuses the cached module.

Updated layers:
- linear.py Nvfp4Linear._run_impl: fused kernel, gsa via GPU buffer
- moe.py Nvfp4MoE._run_impl: fused for L1 and L2 (both fused_swiglu and
  non-fused paths)
- shared_expert.py: fused for L1 and L2
- quantize.py: All functions use module loader cache
- sampler.py: Uses module loader cache
- indexer/score_topk.py: Uses module loader cache

P2: Vectorized KVCache.append_swa — index_copy_ instead of Python loop.
2 kernel launches instead of 2T. No .item() in comp_pos either.

P3: Pre-allocated comp_kv buffers — O(1) append instead of O(N) torch.cat.
max_comp=32768 per layer (32MB). No more quadratic memory growth.

~486 .item() syncs per decoded token → ~0 (only argmax + token decode remain).

											
										
										
											2026-06-01 21:05:03 +00:00
 								    Args:
 								        fused_bf16: (M, 2*intermediate) BF16 — fused L1 output
 								        intermediate: intermediate dimension
 								        divisor: gsa = amax / divisor. Default 2688.0.
 								        granularity: interleave granularity (default 8)
 								    Returns:
 								        x_fp4: (M, intermediate//2) float4_e2m1fn_x2
 								        x_sf: (M, intermediate//16) float8_e4m3fn
 								        gsa: (M,) float32 GPU tensor — per-row global scale for L2 GEMM
 								    """
 								    from dsv4.kernels.cuda.loader import get_cuda_module
-												CRITICAL FIX: fused_amax_quantize cross-CTA race condition

The single-kernel approach used __syncthreads() for cross-CTA amax
reduction, but __syncthreads() only syncs within a CTA (same blockIdx).
CTA 0 reading s_amax[1] before CTA 1 writes = race condition = garbage gsa.

Result: residual |X| exploded to 10^37 by L0. F_attn and F_ffn were 0.0.

Fix: Two-kernel approach (correct, zero CPU syncs):
  Kernel 1: amax_gsa.cu — computes gsa on GPU, returns GPU tensor
  Kernel 2: quantize_nvfp4_from_buffer — reads gsa from GPU buffer

The fused_amax_quantize.cu now exports quantize_nvfp4_from_buffer and
deinterleave_quantize_from_buffer (gsa from GPU buffer, not kernel param).

Same P0 win: zero .item() syncs. Two kernel launches instead of one,
but correctness > shaving one launch.

											
										
										
											2026-06-01 21:26:51 +00:00
+								    # Compute gsa from the fused output
 								    amax_mod = get_cuda_module("amax_gsa", ["amax_gsa.cu"])
 								    gsa_gpu = amax_mod.compute_amax_gsa(fused_bf16, divisor)
-												Fix gsa_buffer shape mismatch for MoE (M>1 rows)

compute_amax_gsa returns a scalar, but quantize_from_buffer expects (M,).
Broadcast the scalar gsa to (M,) — all rows use the same gsa (global max).

											
										
										
											2026-06-01 21:33:59 +00:00
+								    M = fused_bf16.shape[0]
-												CRITICAL FIX: fused_amax_quantize cross-CTA race condition

The single-kernel approach used __syncthreads() for cross-CTA amax
reduction, but __syncthreads() only syncs within a CTA (same blockIdx).
CTA 0 reading s_amax[1] before CTA 1 writes = race condition = garbage gsa.

Result: residual |X| exploded to 10^37 by L0. F_attn and F_ffn were 0.0.

Fix: Two-kernel approach (correct, zero CPU syncs):
  Kernel 1: amax_gsa.cu — computes gsa on GPU, returns GPU tensor
  Kernel 2: quantize_nvfp4_from_buffer — reads gsa from GPU buffer

The fused_amax_quantize.cu now exports quantize_nvfp4_from_buffer and
deinterleave_quantize_from_buffer (gsa from GPU buffer, not kernel param).

Same P0 win: zero .item() syncs. Two kernel launches instead of one,
but correctness > shaving one launch.

											
										
										
											2026-06-01 21:26:51 +00:00
+								    if gsa_gpu.dim() == 0:
-												Fix gsa_buffer shape mismatch for MoE (M>1 rows)

compute_amax_gsa returns a scalar, but quantize_from_buffer expects (M,).
Broadcast the scalar gsa to (M,) — all rows use the same gsa (global max).

											
										
										
											2026-06-01 21:33:59 +00:00
+								        gsa_gpu = gsa_gpu.reshape(1).expand(M).contiguous()
 								    elif gsa_gpu.shape[0] == 1 and M > 1:
 								        gsa_gpu = gsa_gpu.expand(M).contiguous()
-												CRITICAL FIX: fused_amax_quantize cross-CTA race condition

The single-kernel approach used __syncthreads() for cross-CTA amax
reduction, but __syncthreads() only syncs within a CTA (same blockIdx).
CTA 0 reading s_amax[1] before CTA 1 writes = race condition = garbage gsa.

Result: residual |X| exploded to 10^37 by L0. F_attn and F_ffn were 0.0.

Fix: Two-kernel approach (correct, zero CPU syncs):
  Kernel 1: amax_gsa.cu — computes gsa on GPU, returns GPU tensor
  Kernel 2: quantize_nvfp4_from_buffer — reads gsa from GPU buffer

The fused_amax_quantize.cu now exports quantize_nvfp4_from_buffer and
deinterleave_quantize_from_buffer (gsa from GPU buffer, not kernel param).

Same P0 win: zero .item() syncs. Two kernel launches instead of one,
but correctness > shaving one launch.

											
										
										
											2026-06-01 21:26:51 +00:00
+								    # Deinterleave + quantize using gsa from GPU buffer
 								    quant_mod = get_cuda_module("fused_amax_quantize", ["fused_amax_quantize.cu"])
 								    x_fp4, x_sf = quant_mod.deinterleave_quantize_from_buffer(fused_bf16, intermediate, granularity, gsa_gpu)
 								    return x_fp4, x_sf, gsa_gpu
-												P0 COMPLETE: Eliminate ALL .item() CPU-GPU syncs from NVFP4 activation path

Fused kernels (zero CPU sync, single kernel launch per projection):
- fused_amax_quantize.cu: amax→gsa→quantize in one pass. Replaces two-step
  compute_amax_gsa_gpu + quantize_nvfp4_gpu (had .item() sync).
- fused_deinterleave_amax_quantize.cu: Same for MoE fused_swiglu L2 path.
  Deinterleave + amax + quantize in one pass. Replaces compute_amax_gsa_gpu
  + deinterleave_quantize_nvfp4_cuda (had .item() sync).

All kernel loaders use dsv4/kernels/cuda/loader.py (compile-once cache).
Was JIT-compiling on every call via torch.utils.cpp_extension.load (~100ms/call,
~500 calls/token). Now compiles once and reuses the cached module.

Updated layers:
- linear.py Nvfp4Linear._run_impl: fused kernel, gsa via GPU buffer
- moe.py Nvfp4MoE._run_impl: fused for L1 and L2 (both fused_swiglu and
  non-fused paths)
- shared_expert.py: fused for L1 and L2
- quantize.py: All functions use module loader cache
- sampler.py: Uses module loader cache
- indexer/score_topk.py: Uses module loader cache

P2: Vectorized KVCache.append_swa — index_copy_ instead of Python loop.
2 kernel launches instead of 2T. No .item() in comp_pos either.

P3: Pre-allocated comp_kv buffers — O(1) append instead of O(N) torch.cat.
max_comp=32768 per layer (32MB). No more quadratic memory growth.

~486 .item() syncs per decoded token → ~0 (only argmax + token decode remain).

											
										
										
											2026-06-01 21:05:03 +00:00
-												Performance audit fixes: eliminate CPU-GPU syncs

PERFORMANCE_AUDIT.md validation results:
  1. Nvfp4Linear .item() sync (610/step) → FIXED: compute_amax_gsa_gpu kernel
  2. MoE .item() sync (183/step) → FIXED: same kernel
  3. SharedExpert .item() sync (122/step) → FIXED: same kernel
  4. FMHA V clone → FIXED: V=K, transpose creates copy implicitly
  5. torch.cuda.synchronize in moe_forward → FIXED: conditional on VERBOSE
  6. RoPE 8x duplication → INVALIDATED: necessary for per-GPU HBM access
  7. mHC BF16 bmm → INVALIDATED: 28K FLOPs, not a bottleneck
  8. Router .float() cast → INVALIDATED: needed for FP32 topk, ~1μs

New files:
  - dsv4/kernels/cuda/amax_gsa.cu: GPU-only amax→gsa kernel
  - dsv4/ops/quantize.py: compute_amax_gsa_gpu() wrapper

Net effect: ~915 fewer CPU-GPU syncs per decode step
Remaining syncs: ~10 per layer (quantize kernel parameter) + diagnostics

											
										
										
											2026-06-01 20:40:19 +00:00
+								def compute_amax_gsa_gpu(x_bf16, divisor=6.0 * 448.0):
 								    """Compute gsa = max(|x|) / divisor on GPU. No CPU sync.
 								    Returns a scalar GPU tensor (not a Python float!).
-												P0 COMPLETE: Eliminate ALL .item() CPU-GPU syncs from NVFP4 activation path

Fused kernels (zero CPU sync, single kernel launch per projection):
- fused_amax_quantize.cu: amax→gsa→quantize in one pass. Replaces two-step
  compute_amax_gsa_gpu + quantize_nvfp4_gpu (had .item() sync).
- fused_deinterleave_amax_quantize.cu: Same for MoE fused_swiglu L2 path.
  Deinterleave + amax + quantize in one pass. Replaces compute_amax_gsa_gpu
  + deinterleave_quantize_nvfp4_cuda (had .item() sync).

All kernel loaders use dsv4/kernels/cuda/loader.py (compile-once cache).
Was JIT-compiling on every call via torch.utils.cpp_extension.load (~100ms/call,
~500 calls/token). Now compiles once and reuses the cached module.

Updated layers:
- linear.py Nvfp4Linear._run_impl: fused kernel, gsa via GPU buffer
- moe.py Nvfp4MoE._run_impl: fused for L1 and L2 (both fused_swiglu and
  non-fused paths)
- shared_expert.py: fused for L1 and L2
- quantize.py: All functions use module loader cache
- sampler.py: Uses module loader cache
- indexer/score_topk.py: Uses module loader cache

P2: Vectorized KVCache.append_swa — index_copy_ instead of Python loop.
2 kernel launches instead of 2T. No .item() in comp_pos either.

P3: Pre-allocated comp_kv buffers — O(1) append instead of O(N) torch.cat.
max_comp=32768 per layer (32MB). No more quadratic memory growth.

~486 .item() syncs per decoded token → ~0 (only argmax + token decode remain).

											
										
										
											2026-06-01 21:05:03 +00:00
+								    NOTE: Prefer quantize_nvfp4_gpu_fused() which does amax+quantize in
 								    one kernel launch. This function is kept for cases where you need gsa
 								    without quantization.
-												Performance audit fixes: eliminate CPU-GPU syncs

PERFORMANCE_AUDIT.md validation results:
  1. Nvfp4Linear .item() sync (610/step) → FIXED: compute_amax_gsa_gpu kernel
  2. MoE .item() sync (183/step) → FIXED: same kernel
  3. SharedExpert .item() sync (122/step) → FIXED: same kernel
  4. FMHA V clone → FIXED: V=K, transpose creates copy implicitly
  5. torch.cuda.synchronize in moe_forward → FIXED: conditional on VERBOSE
  6. RoPE 8x duplication → INVALIDATED: necessary for per-GPU HBM access
  7. mHC BF16 bmm → INVALIDATED: 28K FLOPs, not a bottleneck
  8. Router .float() cast → INVALIDATED: needed for FP32 topk, ~1μs

New files:
  - dsv4/kernels/cuda/amax_gsa.cu: GPU-only amax→gsa kernel
  - dsv4/ops/quantize.py: compute_amax_gsa_gpu() wrapper

Net effect: ~915 fewer CPU-GPU syncs per decode step
Remaining syncs: ~10 per layer (quantize kernel parameter) + diagnostics

											
										
										
											2026-06-01 20:40:19 +00:00
+								    """
-												P0 COMPLETE: Eliminate ALL .item() CPU-GPU syncs from NVFP4 activation path

Fused kernels (zero CPU sync, single kernel launch per projection):
- fused_amax_quantize.cu: amax→gsa→quantize in one pass. Replaces two-step
  compute_amax_gsa_gpu + quantize_nvfp4_gpu (had .item() sync).
- fused_deinterleave_amax_quantize.cu: Same for MoE fused_swiglu L2 path.
  Deinterleave + amax + quantize in one pass. Replaces compute_amax_gsa_gpu
  + deinterleave_quantize_nvfp4_cuda (had .item() sync).

All kernel loaders use dsv4/kernels/cuda/loader.py (compile-once cache).
Was JIT-compiling on every call via torch.utils.cpp_extension.load (~100ms/call,
~500 calls/token). Now compiles once and reuses the cached module.

Updated layers:
- linear.py Nvfp4Linear._run_impl: fused kernel, gsa via GPU buffer
- moe.py Nvfp4MoE._run_impl: fused for L1 and L2 (both fused_swiglu and
  non-fused paths)
- shared_expert.py: fused for L1 and L2
- quantize.py: All functions use module loader cache
- sampler.py: Uses module loader cache
- indexer/score_topk.py: Uses module loader cache

P2: Vectorized KVCache.append_swa — index_copy_ instead of Python loop.
2 kernel launches instead of 2T. No .item() in comp_pos either.

P3: Pre-allocated comp_kv buffers — O(1) append instead of O(N) torch.cat.
max_comp=32768 per layer (32MB). No more quadratic memory growth.

~486 .item() syncs per decoded token → ~0 (only argmax + token decode remain).

											
										
										
											2026-06-01 21:05:03 +00:00
+								    from dsv4.kernels.cuda.loader import get_cuda_module
 								    mod = get_cuda_module("amax_gsa", ["amax_gsa.cu"])
-												Performance audit fixes: eliminate CPU-GPU syncs

PERFORMANCE_AUDIT.md validation results:
  1. Nvfp4Linear .item() sync (610/step) → FIXED: compute_amax_gsa_gpu kernel
  2. MoE .item() sync (183/step) → FIXED: same kernel
  3. SharedExpert .item() sync (122/step) → FIXED: same kernel
  4. FMHA V clone → FIXED: V=K, transpose creates copy implicitly
  5. torch.cuda.synchronize in moe_forward → FIXED: conditional on VERBOSE
  6. RoPE 8x duplication → INVALIDATED: necessary for per-GPU HBM access
  7. mHC BF16 bmm → INVALIDATED: 28K FLOPs, not a bottleneck
  8. Router .float() cast → INVALIDATED: needed for FP32 topk, ~1μs

New files:
  - dsv4/kernels/cuda/amax_gsa.cu: GPU-only amax→gsa kernel
  - dsv4/ops/quantize.py: compute_amax_gsa_gpu() wrapper

Net effect: ~915 fewer CPU-GPU syncs per decode step
Remaining syncs: ~10 per layer (quantize kernel parameter) + diagnostics

											
										
										
											2026-06-01 20:40:19 +00:00
+								    return mod.compute_amax_gsa(x_bf16, divisor)
 								def quantize_nvfp4_gpu_fused(x_bf16, divisor=6.0 * 448.0):
-												CRITICAL FIX: fused_amax_quantize cross-CTA race condition

The single-kernel approach used __syncthreads() for cross-CTA amax
reduction, but __syncthreads() only syncs within a CTA (same blockIdx).
CTA 0 reading s_amax[1] before CTA 1 writes = race condition = garbage gsa.

Result: residual |X| exploded to 10^37 by L0. F_attn and F_ffn were 0.0.

Fix: Two-kernel approach (correct, zero CPU syncs):
  Kernel 1: amax_gsa.cu — computes gsa on GPU, returns GPU tensor
  Kernel 2: quantize_nvfp4_from_buffer — reads gsa from GPU buffer

The fused_amax_quantize.cu now exports quantize_nvfp4_from_buffer and
deinterleave_quantize_from_buffer (gsa from GPU buffer, not kernel param).

Same P0 win: zero .item() syncs. Two kernel launches instead of one,
but correctness > shaving one launch.

											
										
										
											2026-06-01 21:26:51 +00:00
+								    """Fused amax + gsa + quantize: zero CPU syncs, two kernel launches.
-												Performance audit fixes: eliminate CPU-GPU syncs

PERFORMANCE_AUDIT.md validation results:
  1. Nvfp4Linear .item() sync (610/step) → FIXED: compute_amax_gsa_gpu kernel
  2. MoE .item() sync (183/step) → FIXED: same kernel
  3. SharedExpert .item() sync (122/step) → FIXED: same kernel
  4. FMHA V clone → FIXED: V=K, transpose creates copy implicitly
  5. torch.cuda.synchronize in moe_forward → FIXED: conditional on VERBOSE
  6. RoPE 8x duplication → INVALIDATED: necessary for per-GPU HBM access
  7. mHC BF16 bmm → INVALIDATED: 28K FLOPs, not a bottleneck
  8. Router .float() cast → INVALIDATED: needed for FP32 topk, ~1μs

New files:
  - dsv4/kernels/cuda/amax_gsa.cu: GPU-only amax→gsa kernel
  - dsv4/ops/quantize.py: compute_amax_gsa_gpu() wrapper

Net effect: ~915 fewer CPU-GPU syncs per decode step
Remaining syncs: ~10 per layer (quantize kernel parameter) + diagnostics

											
										
										
											2026-06-01 20:40:19 +00:00
-												CRITICAL FIX: fused_amax_quantize cross-CTA race condition

The single-kernel approach used __syncthreads() for cross-CTA amax
reduction, but __syncthreads() only syncs within a CTA (same blockIdx).
CTA 0 reading s_amax[1] before CTA 1 writes = race condition = garbage gsa.

Result: residual |X| exploded to 10^37 by L0. F_attn and F_ffn were 0.0.

Fix: Two-kernel approach (correct, zero CPU syncs):
  Kernel 1: amax_gsa.cu — computes gsa on GPU, returns GPU tensor
  Kernel 2: quantize_nvfp4_from_buffer — reads gsa from GPU buffer

The fused_amax_quantize.cu now exports quantize_nvfp4_from_buffer and
deinterleave_quantize_from_buffer (gsa from GPU buffer, not kernel param).

Same P0 win: zero .item() syncs. Two kernel launches instead of one,
but correctness > shaving one launch.

											
										
										
											2026-06-01 21:26:51 +00:00
+								    Two-kernel approach (correct cross-CTA reduction):
 								      Kernel 1: compute_amax_gsa — row-wise amax → gsa on GPU (no .item())
 								      Kernel 2: quantize_nvfp4_from_buffer — quantize using gsa from GPU buffer
-												Performance audit fixes: eliminate CPU-GPU syncs

PERFORMANCE_AUDIT.md validation results:
  1. Nvfp4Linear .item() sync (610/step) → FIXED: compute_amax_gsa_gpu kernel
  2. MoE .item() sync (183/step) → FIXED: same kernel
  3. SharedExpert .item() sync (122/step) → FIXED: same kernel
  4. FMHA V clone → FIXED: V=K, transpose creates copy implicitly
  5. torch.cuda.synchronize in moe_forward → FIXED: conditional on VERBOSE
  6. RoPE 8x duplication → INVALIDATED: necessary for per-GPU HBM access
  7. mHC BF16 bmm → INVALIDATED: 28K FLOPs, not a bottleneck
  8. Router .float() cast → INVALIDATED: needed for FP32 topk, ~1μs

New files:
  - dsv4/kernels/cuda/amax_gsa.cu: GPU-only amax→gsa kernel
  - dsv4/ops/quantize.py: compute_amax_gsa_gpu() wrapper

Net effect: ~915 fewer CPU-GPU syncs per decode step
Remaining syncs: ~10 per layer (quantize kernel parameter) + diagnostics

											
										
										
											2026-06-01 20:40:19 +00:00
-												CRITICAL FIX: fused_amax_quantize cross-CTA race condition

The single-kernel approach used __syncthreads() for cross-CTA amax
reduction, but __syncthreads() only syncs within a CTA (same blockIdx).
CTA 0 reading s_amax[1] before CTA 1 writes = race condition = garbage gsa.

Result: residual |X| exploded to 10^37 by L0. F_attn and F_ffn were 0.0.

Fix: Two-kernel approach (correct, zero CPU syncs):
  Kernel 1: amax_gsa.cu — computes gsa on GPU, returns GPU tensor
  Kernel 2: quantize_nvfp4_from_buffer — reads gsa from GPU buffer

The fused_amax_quantize.cu now exports quantize_nvfp4_from_buffer and
deinterleave_quantize_from_buffer (gsa from GPU buffer, not kernel param).

Same P0 win: zero .item() syncs. Two kernel launches instead of one,
but correctness > shaving one launch.

											
										
										
											2026-06-01 21:26:51 +00:00
+								    The previous single-kernel approach had a race condition: the cross-CTA
 								    shared memory reduction used __syncthreads() which only syncs within a
 								    CTA, not across CTAs in the same grid. CTA 0 could read s_amax[b] before
 								    CTA b had written it, producing garbage gsa values.
-												Performance audit fixes: eliminate CPU-GPU syncs

PERFORMANCE_AUDIT.md validation results:
  1. Nvfp4Linear .item() sync (610/step) → FIXED: compute_amax_gsa_gpu kernel
  2. MoE .item() sync (183/step) → FIXED: same kernel
  3. SharedExpert .item() sync (122/step) → FIXED: same kernel
  4. FMHA V clone → FIXED: V=K, transpose creates copy implicitly
  5. torch.cuda.synchronize in moe_forward → FIXED: conditional on VERBOSE
  6. RoPE 8x duplication → INVALIDATED: necessary for per-GPU HBM access
  7. mHC BF16 bmm → INVALIDATED: 28K FLOPs, not a bottleneck
  8. Router .float() cast → INVALIDATED: needed for FP32 topk, ~1μs

New files:
  - dsv4/kernels/cuda/amax_gsa.cu: GPU-only amax→gsa kernel
  - dsv4/ops/quantize.py: compute_amax_gsa_gpu() wrapper

Net effect: ~915 fewer CPU-GPU syncs per decode step
Remaining syncs: ~10 per layer (quantize kernel parameter) + diagnostics

											
										
										
											2026-06-01 20:40:19 +00:00
 								    Args:
 								        x_bf16: (M, N) BF16 tensor. N must be a multiple of 16.
 								        divisor: gsa = amax / divisor. Default 6.0 * 448.0 = 2688.0.
 								    Returns:
 								        x_fp4: (M, N//2) float4_e2m1fn_x2
 								        x_sf: (M, N//16) float8_e4m3fn
-												P0 COMPLETE: Eliminate ALL .item() CPU-GPU syncs from NVFP4 activation path

Fused kernels (zero CPU sync, single kernel launch per projection):
- fused_amax_quantize.cu: amax→gsa→quantize in one pass. Replaces two-step
  compute_amax_gsa_gpu + quantize_nvfp4_gpu (had .item() sync).
- fused_deinterleave_amax_quantize.cu: Same for MoE fused_swiglu L2 path.
  Deinterleave + amax + quantize in one pass. Replaces compute_amax_gsa_gpu
  + deinterleave_quantize_nvfp4_cuda (had .item() sync).

All kernel loaders use dsv4/kernels/cuda/loader.py (compile-once cache).
Was JIT-compiling on every call via torch.utils.cpp_extension.load (~100ms/call,
~500 calls/token). Now compiles once and reuses the cached module.

Updated layers:
- linear.py Nvfp4Linear._run_impl: fused kernel, gsa via GPU buffer
- moe.py Nvfp4MoE._run_impl: fused for L1 and L2 (both fused_swiglu and
  non-fused paths)
- shared_expert.py: fused for L1 and L2
- quantize.py: All functions use module loader cache
- sampler.py: Uses module loader cache
- indexer/score_topk.py: Uses module loader cache

P2: Vectorized KVCache.append_swa — index_copy_ instead of Python loop.
2 kernel launches instead of 2T. No .item() in comp_pos either.

P3: Pre-allocated comp_kv buffers — O(1) append instead of O(N) torch.cat.
max_comp=32768 per layer (32MB). No more quadratic memory growth.

~486 .item() syncs per decoded token → ~0 (only argmax + token decode remain).

											
										
										
											2026-06-01 21:05:03 +00:00
+								        gsa: (M,) float32 GPU tensor — per-row global scale for GEMM
-												Performance audit fixes: eliminate CPU-GPU syncs

PERFORMANCE_AUDIT.md validation results:
  1. Nvfp4Linear .item() sync (610/step) → FIXED: compute_amax_gsa_gpu kernel
  2. MoE .item() sync (183/step) → FIXED: same kernel
  3. SharedExpert .item() sync (122/step) → FIXED: same kernel
  4. FMHA V clone → FIXED: V=K, transpose creates copy implicitly
  5. torch.cuda.synchronize in moe_forward → FIXED: conditional on VERBOSE
  6. RoPE 8x duplication → INVALIDATED: necessary for per-GPU HBM access
  7. mHC BF16 bmm → INVALIDATED: 28K FLOPs, not a bottleneck
  8. Router .float() cast → INVALIDATED: needed for FP32 topk, ~1μs

New files:
  - dsv4/kernels/cuda/amax_gsa.cu: GPU-only amax→gsa kernel
  - dsv4/ops/quantize.py: compute_amax_gsa_gpu() wrapper

Net effect: ~915 fewer CPU-GPU syncs per decode step
Remaining syncs: ~10 per layer (quantize kernel parameter) + diagnostics

											
										
										
											2026-06-01 20:40:19 +00:00
+								    """
-												CUDA graph: Fix sync violations (Category 1-2)

1. mhc.py: Remove .item() from post_block (122 syncs/step eliminated)
   - The X_next.abs().max().item() was syncing EVERY layer's post_block
   - Diagnostics moved to caller (outside graph region)

2. linear.py: Pre-allocate _scale_a_buf in _ensure_buffer_size
   - _assemble_scales_single_group now uses pre-allocated buffer
   - Eliminates per-call torch.zeros() allocation (graph capture killer)

3. shared_expert.py: Same fix — use pre-allocated padded_x_sf_buf
   - _assemble_scales_single_group no longer allocates

4. quantize.py: Remove .contiguous() from gsa expand
   - expand() creates stride-0 view, CUDA kernel reads correctly
   - No allocation on the hot path

5. Add CUDA_GRAPH_SYNC_INVENTORY.md with full violation catalog

											
										
										
											2026-06-03 16:37:20 +00:00
+								    # CUDA kernels require contiguous input — column slices from deinterleave are non-contiguous.
 								    # For CUDA graph capture, this MUST be contiguous at graph construction time.
 								    # The .contiguous() call is a no-op when already contiguous (no allocation).
-												Fix non-contiguous tensor in quantize_nvfp4_gpu_fused (T>1 prefill)

The intermediate tensor from fused SwiGLU deinterleave is a column slice
(non-contiguous). When T>1, quantize_nvfp4_gpu_fused receives this and
the CUDA kernel crashes with 'input must be contiguous'.

Fix: add is_contiguous() check + .contiguous() in quantize_nvfp4_gpu_fused
and in SharedExpert._run_l2. This is the root cause, not a workaround —
CUDA kernels legitimately require contiguous memory.

											
										
										
											2026-06-03 07:56:19 +00:00
+								    if not x_bf16.is_contiguous():
 								        x_bf16 = x_bf16.contiguous()
-												P0 COMPLETE: Eliminate ALL .item() CPU-GPU syncs from NVFP4 activation path

Fused kernels (zero CPU sync, single kernel launch per projection):
- fused_amax_quantize.cu: amax→gsa→quantize in one pass. Replaces two-step
  compute_amax_gsa_gpu + quantize_nvfp4_gpu (had .item() sync).
- fused_deinterleave_amax_quantize.cu: Same for MoE fused_swiglu L2 path.
  Deinterleave + amax + quantize in one pass. Replaces compute_amax_gsa_gpu
  + deinterleave_quantize_nvfp4_cuda (had .item() sync).

All kernel loaders use dsv4/kernels/cuda/loader.py (compile-once cache).
Was JIT-compiling on every call via torch.utils.cpp_extension.load (~100ms/call,
~500 calls/token). Now compiles once and reuses the cached module.

Updated layers:
- linear.py Nvfp4Linear._run_impl: fused kernel, gsa via GPU buffer
- moe.py Nvfp4MoE._run_impl: fused for L1 and L2 (both fused_swiglu and
  non-fused paths)
- shared_expert.py: fused for L1 and L2
- quantize.py: All functions use module loader cache
- sampler.py: Uses module loader cache
- indexer/score_topk.py: Uses module loader cache

P2: Vectorized KVCache.append_swa — index_copy_ instead of Python loop.
2 kernel launches instead of 2T. No .item() in comp_pos either.

P3: Pre-allocated comp_kv buffers — O(1) append instead of O(N) torch.cat.
max_comp=32768 per layer (32MB). No more quadratic memory growth.

~486 .item() syncs per decoded token → ~0 (only argmax + token decode remain).

											
										
										
											2026-06-01 21:05:03 +00:00
+								    from dsv4.kernels.cuda.loader import get_cuda_module
-												CRITICAL FIX: fused_amax_quantize cross-CTA race condition

The single-kernel approach used __syncthreads() for cross-CTA amax
reduction, but __syncthreads() only syncs within a CTA (same blockIdx).
CTA 0 reading s_amax[1] before CTA 1 writes = race condition = garbage gsa.

Result: residual |X| exploded to 10^37 by L0. F_attn and F_ffn were 0.0.

Fix: Two-kernel approach (correct, zero CPU syncs):
  Kernel 1: amax_gsa.cu — computes gsa on GPU, returns GPU tensor
  Kernel 2: quantize_nvfp4_from_buffer — reads gsa from GPU buffer

The fused_amax_quantize.cu now exports quantize_nvfp4_from_buffer and
deinterleave_quantize_from_buffer (gsa from GPU buffer, not kernel param).

Same P0 win: zero .item() syncs. Two kernel launches instead of one,
but correctness > shaving one launch.

											
										
										
											2026-06-01 21:26:51 +00:00
+								    amax_mod = get_cuda_module("amax_gsa", ["amax_gsa.cu"])
-												Fix gsa_buffer shape mismatch for MoE (M>1 rows)

compute_amax_gsa returns a scalar, but quantize_from_buffer expects (M,).
Broadcast the scalar gsa to (M,) — all rows use the same gsa (global max).

											
										
										
											2026-06-01 21:33:59 +00:00
+								    gsa_gpu = amax_mod.compute_amax_gsa(x_bf16, divisor)  # scalar GPU tensor
-												CUDA graph: Fix gsa broadcast — contiguous for prefill, reshape for decode

The stride-0 expand view for gsa_gpu caused illegal memory access
in quantize_nvfp4_from_buffer kernel. The CUDA kernel may not handle
stride-0 tensors correctly.

Fix:
- M=1 decode (graph-captured): just reshape scalar to (1,) — no alloc
- M>1 prefill (not graph-captured): expand + contiguous — allocation OK

											
										
										
											2026-06-03 18:08:18 +00:00
+								    # Broadcast to (M,) for the quantize-from-buffer kernel.
 								    # CUDA-graph-safe approach:
 								    # - For M=1 decode (graph-captured): just reshape to (1,) — no allocation.
 								    # - For M>1 prefill (not graph-captured): expand + contiguous is fine.
-												Fix gsa_buffer shape mismatch for MoE (M>1 rows)

compute_amax_gsa returns a scalar, but quantize_from_buffer expects (M,).
Broadcast the scalar gsa to (M,) — all rows use the same gsa (global max).

											
										
										
											2026-06-01 21:33:59 +00:00
+								    M = x_bf16.shape[0]
-												CRITICAL FIX: fused_amax_quantize cross-CTA race condition

The single-kernel approach used __syncthreads() for cross-CTA amax
reduction, but __syncthreads() only syncs within a CTA (same blockIdx).
CTA 0 reading s_amax[1] before CTA 1 writes = race condition = garbage gsa.

Result: residual |X| exploded to 10^37 by L0. F_attn and F_ffn were 0.0.

Fix: Two-kernel approach (correct, zero CPU syncs):
  Kernel 1: amax_gsa.cu — computes gsa on GPU, returns GPU tensor
  Kernel 2: quantize_nvfp4_from_buffer — reads gsa from GPU buffer

The fused_amax_quantize.cu now exports quantize_nvfp4_from_buffer and
deinterleave_quantize_from_buffer (gsa from GPU buffer, not kernel param).

Same P0 win: zero .item() syncs. Two kernel launches instead of one,
but correctness > shaving one launch.

											
										
										
											2026-06-01 21:26:51 +00:00
+								    if gsa_gpu.dim() == 0:
-												CUDA graph: Fix gsa broadcast — contiguous for prefill, reshape for decode

The stride-0 expand view for gsa_gpu caused illegal memory access
in quantize_nvfp4_from_buffer kernel. The CUDA kernel may not handle
stride-0 tensors correctly.

Fix:
- M=1 decode (graph-captured): just reshape scalar to (1,) — no alloc
- M>1 prefill (not graph-captured): expand + contiguous — allocation OK

											
										
										
											2026-06-03 18:08:18 +00:00
+								        gsa_gpu = gsa_gpu.reshape(1)  # scalar → (1,) — no allocation
 								    if M > 1:
 								        gsa_gpu = gsa_gpu.expand(M).contiguous()  # (M,) — allocation OK for prefill
 								    # For M=1: gsa_gpu is (1,) contiguous — zero allocation
-												CRITICAL FIX: fused_amax_quantize cross-CTA race condition

The single-kernel approach used __syncthreads() for cross-CTA amax
reduction, but __syncthreads() only syncs within a CTA (same blockIdx).
CTA 0 reading s_amax[1] before CTA 1 writes = race condition = garbage gsa.

Result: residual |X| exploded to 10^37 by L0. F_attn and F_ffn were 0.0.

Fix: Two-kernel approach (correct, zero CPU syncs):
  Kernel 1: amax_gsa.cu — computes gsa on GPU, returns GPU tensor
  Kernel 2: quantize_nvfp4_from_buffer — reads gsa from GPU buffer

The fused_amax_quantize.cu now exports quantize_nvfp4_from_buffer and
deinterleave_quantize_from_buffer (gsa from GPU buffer, not kernel param).

Same P0 win: zero .item() syncs. Two kernel launches instead of one,
but correctness > shaving one launch.

											
										
										
											2026-06-01 21:26:51 +00:00
+								    quant_mod = get_cuda_module("fused_amax_quantize", ["fused_amax_quantize.cu"])
 								    x_fp4, x_sf = quant_mod.quantize_nvfp4_from_buffer(x_bf16, gsa_gpu)
 								    return x_fp4, x_sf, gsa_gpu
-												Performance audit fixes: eliminate CPU-GPU syncs

PERFORMANCE_AUDIT.md validation results:
  1. Nvfp4Linear .item() sync (610/step) → FIXED: compute_amax_gsa_gpu kernel
  2. MoE .item() sync (183/step) → FIXED: same kernel
  3. SharedExpert .item() sync (122/step) → FIXED: same kernel
  4. FMHA V clone → FIXED: V=K, transpose creates copy implicitly
  5. torch.cuda.synchronize in moe_forward → FIXED: conditional on VERBOSE
  6. RoPE 8x duplication → INVALIDATED: necessary for per-GPU HBM access
  7. mHC BF16 bmm → INVALIDATED: 28K FLOPs, not a bottleneck
  8. Router .float() cast → INVALIDATED: needed for FP32 topk, ~1μs

New files:
  - dsv4/kernels/cuda/amax_gsa.cu: GPU-only amax→gsa kernel
  - dsv4/ops/quantize.py: compute_amax_gsa_gpu() wrapper

Net effect: ~915 fewer CPU-GPU syncs per decode step
Remaining syncs: ~10 per layer (quantize kernel parameter) + diagnostics

											
										
										
											2026-06-01 20:40:19 +00:00
-												NVFP4-1.1 integration: GPU-only quantize kernel + MoE pipeline wiring

- Add quantize_nvfp4.cu: BF16→FP4 GPU kernel (no CPU sync, warp shuffle amax)
- Add quantize_nvfp4_gpu() bridge in ops/quantize.py
- Fix deinterleave_quantize kernel path (dsv4/ops/kernels → dsv4/kernels/cuda)
- Wire GPU quantize into Nvfp4MoE._run_impl():
  - L1 input: quantize_nvfp4_gpu (replaces quantize_activation_nvfp4)
  - Fused SwiGLU L2: deinterleave_quantize_nvfp4_cuda (single kernel)
  - Non-fused L2: quantize_nvfp4_gpu
- Add test_nvfp4_gpu_quantize.py for both kernels

											
										
										
											2026-05-25 16:19:04 +00:00
def quantize_nvfp4_gpu(x_bf16, global_scale):
    """Quantize a BF16 tensor to NVFP4 using a custom CUDA kernel (GPU-only, no CPU sync).

    Replaces quantize_activation_nvfp4(), which uses .amax() (CPU sync).
    The global scale is NOT computed here: it must be pre-computed by the
    caller (from warmup or a known value).

    NOTE: Prefer quantize_nvfp4_gpu_fused(), which also computes gsa on GPU.
    This function is kept for cases where global_scale is already known.

    Args:
        x_bf16: (M, N) BF16 tensor. N must be a multiple of 16. A
            non-contiguous tensor (e.g. a column slice) is copied into
            contiguous memory before the kernel launch.
        global_scale: float32 scalar (pre-computed, NOT from .max()).

    Returns:
        x_fp4: (M, N//2) float4_e2m1fn_x2
        x_sf: (M, N//16) float8_e4m3fn
    """
    from dsv4.kernels.cuda.loader import get_cuda_module

    # Same guard the fused wrapper in this file applies before its kernel
    # launch. Assumption (to confirm against quantize_nvfp4.cu): this kernel
    # also indexes the input as dense row-major memory. The call below is a
    # no-op, with no allocation, when the tensor is already contiguous.
    if not x_bf16.is_contiguous():
        x_bf16 = x_bf16.contiguous()

    mod = get_cuda_module("quantize_nvfp4", ["quantize_nvfp4.cu"])
    return mod.quantize_nvfp4(x_bf16, global_scale)
-												P4: Fused RMSNorm + NVFP4 quantize kernel (2 launches vs 6+)

- fused_rmsnorm_quantize.cu: two-kernel approach
  Kernel 1: rmsnorm_amax_gsa — compute RMS + amax of normalized output → gsa per row
  Kernel 2: rmsnorm_quantize_nvfp4 — normalize + quantize using GPU-computed gsa
- Python bridge: rmsnorm_quantize_nvfp4() in ops/quantize.py
- Python bridge: dequantize_nvfp4() in ops/quantize.py
- Unit test: test_fused_rmsnorm_quantize.py (production shapes: 7168 hidden)
- Eliminates ~488 kernel launches per token (122 sites × 4 launches saved)

											
										
										
											2026-06-02 16:26:24 +00:00
-												P4: Add QuantizedActivation + Nvfp4Linear.run_from_quantized

- QuantizedActivation: carries (x_fp4, x_sf, gsa) for skip-quantize path
- Nvfp4Linear.run_from_quantized(): runs GEMM with pre-quantized input
- Enables fused RMSNorm+quantize to feed directly into all downstream
  linears (q_a, kv, o_proj, etc.) without re-quantizing

											
										
										
											2026-06-02 16:37:38 +00:00
class QuantizedActivation:
    """Container for an activation that has already been quantized to NVFP4.

    Bundles the packed FP4 payload, the block scale factors and the per-row
    global scale so that a downstream Nvfp4Linear can skip its own
    quantization step and go straight to the GEMM.

    Produced by rmsnorm_quantize_nvfp4() or quantize_nvfp4_gpu_fused().
    Consumed by Nvfp4Linear.run_from_quantized().

    Attributes (shapes as documented by the producers):
        x_fp4: (M, N//2) packed FP4 data.
        x_sf: (M, N//16) E4M3 block scales.
        gsa: (M,) FP32 per-row global scale.
        inv_rms: (M,) FP32 per-row 1/RMS, or None when not provided.
        num_tokens: leading dimension of x_fp4 (i.e. M).
    """

    __slots__ = ['x_fp4', 'x_sf', 'gsa', 'inv_rms', 'num_tokens']

    def __init__(self, x_fp4, x_sf, gsa, inv_rms=None):
        # Store the four payload references as given; nothing is copied.
        self.x_fp4, self.x_sf, self.gsa, self.inv_rms = x_fp4, x_sf, gsa, inv_rms
        # Row count is read once from the packed tensor's first dimension.
        self.num_tokens = x_fp4.shape[0]
-												P4: Fused RMSNorm + NVFP4 quantize kernel (2 launches vs 6+)

- fused_rmsnorm_quantize.cu: two-kernel approach
  Kernel 1: rmsnorm_amax_gsa — compute RMS + amax of normalized output → gsa per row
  Kernel 2: rmsnorm_quantize_nvfp4 — normalize + quantize using GPU-computed gsa
- Python bridge: rmsnorm_quantize_nvfp4() in ops/quantize.py
- Python bridge: dequantize_nvfp4() in ops/quantize.py
- Unit test: test_fused_rmsnorm_quantize.py (production shapes: 7168 hidden)
- Eliminates ~488 kernel launches per token (122 sites × 4 launches saved)

											
										
										
											2026-06-02 16:26:24 +00:00
def dequantize_nvfp4(x_fp4, x_sf, gsa, shape=None):
    """Convert NVFP4 data back to BF16 via the CUDA dequant kernel.

    Args:
        x_fp4: (M, N//2) packed FP4 values.
        x_sf: (M, N//16) E4M3 block scales.
        gsa: FP32 global scale per row, given as (M,), (M, 1) or (1,).
            A 2-D input has its second dimension squeezed away.
        shape: ignored; retained only so existing callers keep working.

    Returns:
        (M, N) BF16 tensor produced by the kernel.
    """
    from dsv4.kernels.cuda.loader import get_cuda_module

    kernel_mod = get_cuda_module("dequant_nvfp4", ["dequant_nvfp4.cu"])

    # (M, 1) -> (M,); any other rank is handed to the kernel untouched.
    row_scale = gsa.squeeze(1) if gsa.dim() == 2 else gsa

    # The dequant kernel expects uint8 for both the FP4 payload and the
    # scale factors, so reinterpret any other dtype (e.g. float8_e4m3fn)
    # as raw bytes. view() reinterprets in place; no data is copied.
    fp4_bytes = x_fp4 if x_fp4.dtype == torch.uint8 else x_fp4.view(torch.uint8)
    sf_bytes = x_sf if x_sf.dtype == torch.uint8 else x_sf.view(torch.uint8)

    return kernel_mod.dequant_nvfp4(fp4_bytes, sf_bytes, row_scale)
-												P5: Fix mhc_rmsnorm_quantize_nvfp4 — add proper function definition

											
										
										
											2026-06-02 17:57:33 +00:00
def mhc_rmsnorm_quantize_nvfp4(X_l, A_l, norm_weight, eps=1e-6, divisor=6.0 * 448.0):
    """Run the fused mHC pre_block + RMSNorm + NVFP4 quantize path (2 kernel launches).

    Stands in for the unfused sequence:
      bmm (1 launch) + rmsnorm (4+ launches) + quantize (2 launches)
    Unfused cost: 7+ launches per site x 122 sites = 854+ launches/token.
    Fused cost: 2 launches per site x 122 sites = 244 launches,
    i.e. 610 launches saved per token.

    Args:
        X_l: (M, n_hc, N) BF16 tensor. n_hc must be <= 4, N multiple of 16.
        A_l: (M, n_hc) BF16 tensor. Softmax weights from mHC._dynamic_params.
        norm_weight: (N,) FP32 RMSNorm weight.
        eps: RMSNorm epsilon (default 1e-6).
        divisor: gsa = amax / divisor. Default 6.0 * 448.0 = 2688.0.

    Returns:
        QuantizedActivation with x_fp4, x_sf, gsa, inv_rms
    """
    from dsv4.kernels.cuda.loader import get_cuda_module

    fused = get_cuda_module(
        "fused_mhc_rmsnorm_quantize", ["fused_mhc_rmsnorm_quantize.cu"]
    )
    # The extension hands back four tensors; wrap them so downstream linears
    # can consume the result without re-quantizing.
    packed, block_scales, row_gsa, row_inv_rms = fused.mhc_rmsnorm_quantize_nvfp4(
        X_l, A_l, norm_weight, eps, divisor
    )
    return QuantizedActivation(packed, block_scales, row_gsa, row_inv_rms)
-												P4: Fused RMSNorm + NVFP4 quantize kernel (2 launches vs 6+)

- fused_rmsnorm_quantize.cu: two-kernel approach
  Kernel 1: rmsnorm_amax_gsa — compute RMS + amax of normalized output → gsa per row
  Kernel 2: rmsnorm_quantize_nvfp4 — normalize + quantize using GPU-computed gsa
- Python bridge: rmsnorm_quantize_nvfp4() in ops/quantize.py
- Python bridge: dequantize_nvfp4() in ops/quantize.py
- Unit test: test_fused_rmsnorm_quantize.py (production shapes: 7168 hidden)
- Eliminates ~488 kernel launches per token (122 sites × 4 launches saved)

											
										
										
											2026-06-02 16:26:24 +00:00
def rmsnorm_quantize_nvfp4(x_bf16, norm_weight, eps=1e-6, divisor=6.0 * 448.0):
    """Fused RMSNorm + amax + NVFP4 quantize: 2 kernel launches total.

    Replaces the unfused path:
      rmsnorm(x, weight) -> 4+ BF16 launches
      quantize_nvfp4_gpu_fused(rmsnormed) -> 2 kernel launches + amax
    Total unfused: 6+ launches per call x 122 calls/layer-step = 732+ launches/token
    Fused: 2 kernel launches per call x 122 calls = 244 launches -> 488 launches saved/token.

    Two-kernel approach (correct cross-CTA reduction):
      Kernel 1: compute RMS + amax of normalized output -> gsa per row (GPU buffer)
      Kernel 2: normalize + quantize using gsa from GPU buffer (no CPU sync)

    Args:
        x_bf16: (M, N) BF16 tensor. N must be a multiple of 16.
        norm_weight: (N,) FP32 RMSNorm weight.
        eps: RMSNorm epsilon (default 1e-6).
        divisor: gsa = amax / divisor. Default 6.0 * 448.0 = 2688.0.

    Returns:
        QuantizedActivation (not a bare tuple) wrapping the kernel outputs:
            x_fp4: (M, N//2) FP4 packed (uint8 view of float4_e2m1fn_x2)
            x_sf: (M, N//16) E4M3 block scales
            gsa: (M,) FP32 per-row global scale for GEMM
            inv_rms: (M,) FP32 per-row 1/RMS (useful for downstream if needed)
            num_tokens: x_fp4.shape[0], filled in by QuantizedActivation
    """
    from dsv4.kernels.cuda.loader import get_cuda_module

    mod = get_cuda_module("fused_rmsnorm_quantize", ["fused_rmsnorm_quantize.cu"])
    x_fp4, x_sf, gsa, inv_rms = mod.rmsnorm_quantize_nvfp4(x_bf16, norm_weight, eps, divisor)
    # Wrap the four tensors so Nvfp4Linear.run_from_quantized() can take the
    # result directly.
    return QuantizedActivation(x_fp4, x_sf, gsa, inv_rms)