DEBUG: isolate which kernel causes cudaErrorInvalidValue in SE L2 path

2026-06-04 00:41:28 +00:00
parent b314fde9b7
commit 5e3ced0b60
1 changed file with 9 additions and 1 deletion
--- a/dsv4/layers/shared_expert.py
+++ b/dsv4/layers/shared_expert.py
@@ -365,7 +365,15 @@ class Nvfp4SharedExpert:
            from dsv4.ops.quantize import quantize_nvfp4_gpu_fused
            if not intermediate.is_contiguous():
                intermediate = intermediate.contiguous()
-            x_fp4, x_sf, gsa_l2_gpu = quantize_nvfp4_gpu_fused(intermediate)
+            # DEBUG: sync before quantize to isolate which kernel fails
+            torch.cuda.synchronize()
+            try:
+                x_fp4, x_sf, gsa_l2_gpu = quantize_nvfp4_gpu_fused(intermediate)
+            except RuntimeError as e:
+                print(f"  SE L2 quantize FAILED: {e}", flush=True)
+                print(f"  intermediate: shape={tuple(intermediate.shape)} dtype={intermediate.dtype} dev={intermediate.device} contiguous={intermediate.is_contiguous()}", flush=True)
+                raise
+            torch.cuda.synchronize()  # DEBUG: catch async errors from quantize
            # Copy first element of gsa (scalar for single-expert) to pre-allocated buffer.
            # Using scalar assignment avoids copy_() from view which caused cudaErrorInvalidValue
            # on non-contiguous gsa_gpu slices (gsa_gpu[:1].reshape(1) — view of expanded tensor).