DEBUG: add sync inside quantize_nvfp4_gpu_fused to catch async errors

2026-06-04 01:05:47 +00:00
parent 55def5eef9
commit e77455c3ba
2 changed files with 9 additions and 9 deletions
--- a/dsv4/layers/shared_expert.py
+++ b/dsv4/layers/shared_expert.py
@@ -365,18 +365,16 @@ class Nvfp4SharedExpert:
            from dsv4.ops.quantize import quantize_nvfp4_gpu_fused
            if not intermediate.is_contiguous():
                intermediate = intermediate.contiguous()
-            # DEBUG: sync before quantize to isolate which kernel fails
-            torch.cuda.synchronize()
+            # DEBUG: isolate async CUDA error
+            torch.cuda.synchronize()  # catch any prior async error
+            x_fp4, x_sf, gsa_l2_gpu = quantize_nvfp4_gpu_fused(intermediate)
            try:
-                x_fp4, x_sf, gsa_l2_gpu = quantize_nvfp4_gpu_fused(intermediate)
+                torch.cuda.synchronize()  # catch error from quantize kernels
            except RuntimeError as e:
-                print(f"  SE L2 quantize FAILED: {e}", flush=True)
-                print(f"  intermediate: shape={tuple(intermediate.shape)} dtype={intermediate.dtype} dev={intermediate.device} contiguous={intermediate.is_contiguous()}", flush=True)
+                print(f"  SE L2: quantize_nvfp4_gpu_fused FAILED after sync: {e}", flush=True)
+                print(f"  intermediate: shape={tuple(intermediate.shape)} dtype={intermediate.dtype} dev={intermediate.device}", flush=True)
                raise
-            torch.cuda.synchronize()  # DEBUG: catch async errors from quantize
-            # Copy first element of gsa (scalar for single-expert) to pre-allocated buffer.
-            # Using scalar assignment avoids copy_() from view which caused cudaErrorInvalidValue
-            # on non-contiguous gsa_gpu slices (gsa_gpu[:1].reshape(1) — view of expanded tensor).
+            # Copy first element of gsa to pre-allocated buffer.
            self._l2_gsa_buf[0] = gsa_l2_gpu[0]  # scalar GPU → GPU, no sync, graph-capturable
        else:
            x_fp4, x_sf = quantize_activation_nvfp4(
--- a/dsv4/ops/quantize.py
+++ b/dsv4/ops/quantize.py
@@ -334,6 +334,8 @@ def quantize_nvfp4_gpu_fused(x_bf16, divisor=6.0 * 448.0):
    # For M=1: gsa_gpu is (1,) contiguous — zero allocation
    quant_mod = get_cuda_module("fused_amax_quantize", ["fused_amax_quantize.cu"])
    x_fp4, x_sf = quant_mod.quantize_nvfp4_from_buffer(x_bf16, gsa_gpu)
+    # DEBUG: sync to catch async errors from the quantize kernels
+    torch.cuda.synchronize()
    return x_fp4, x_sf, gsa_gpu