diff --git a/dsv4/layers/shared_expert.py b/dsv4/layers/shared_expert.py index 9a9e2ae2..1cef8373 100644 --- a/dsv4/layers/shared_expert.py +++ b/dsv4/layers/shared_expert.py @@ -365,18 +365,16 @@ class Nvfp4SharedExpert: from dsv4.ops.quantize import quantize_nvfp4_gpu_fused if not intermediate.is_contiguous(): intermediate = intermediate.contiguous() - # DEBUG: sync before quantize to isolate which kernel fails - torch.cuda.synchronize() + # DEBUG: isolate async CUDA error + torch.cuda.synchronize() # catch any prior async error + x_fp4, x_sf, gsa_l2_gpu = quantize_nvfp4_gpu_fused(intermediate) try: - x_fp4, x_sf, gsa_l2_gpu = quantize_nvfp4_gpu_fused(intermediate) + torch.cuda.synchronize() # catch error from quantize kernels except RuntimeError as e: - print(f" SE L2 quantize FAILED: {e}", flush=True) - print(f" intermediate: shape={tuple(intermediate.shape)} dtype={intermediate.dtype} dev={intermediate.device} contiguous={intermediate.is_contiguous()}", flush=True) + print(f" SE L2: quantize_nvfp4_gpu_fused FAILED after sync: {e}", flush=True) + print(f" intermediate: shape={tuple(intermediate.shape)} dtype={intermediate.dtype} dev={intermediate.device}", flush=True) raise - torch.cuda.synchronize() # DEBUG: catch async errors from quantize - # Copy first element of gsa (scalar for single-expert) to pre-allocated buffer. - # Using scalar assignment avoids copy_() from view which caused cudaErrorInvalidValue - # on non-contiguous gsa_gpu slices (gsa_gpu[:1].reshape(1) — view of expanded tensor). + # Copy first element of gsa to pre-allocated buffer. self._l2_gsa_buf[0] = gsa_l2_gpu[0] # scalar GPU → GPU, no sync, graph-capturable else: x_fp4, x_sf = quantize_activation_nvfp4( diff --git a/dsv4/ops/quantize.py b/dsv4/ops/quantize.py index 3a189555..10a51388 100644 --- a/dsv4/ops/quantize.py +++ b/dsv4/ops/quantize.py @@ -334,6 +334,8 @@ def quantize_nvfp4_gpu_fused(x_bf16, divisor=6.0 * 448.0): # For M=1: gsa_gpu is (1,) contiguous — zero allocation quant_mod = get_cuda_module("fused_amax_quantize", ["fused_amax_quantize.cu"]) x_fp4, x_sf = quant_mod.quantize_nvfp4_from_buffer(x_bf16, gsa_gpu) + # DEBUG: sync to catch async errors from the quantize kernels + torch.cuda.synchronize() return x_fp4, x_sf, gsa_gpu