diff --git a/dsv4/layers/shared_expert.py b/dsv4/layers/shared_expert.py
index a8b5344c..9a9e2ae2 100644
--- a/dsv4/layers/shared_expert.py
+++ b/dsv4/layers/shared_expert.py
@@ -365,7 +365,15 @@ class Nvfp4SharedExpert:
             from dsv4.ops.quantize import quantize_nvfp4_gpu_fused
             if not intermediate.is_contiguous():
                 intermediate = intermediate.contiguous()
-            x_fp4, x_sf, gsa_l2_gpu = quantize_nvfp4_gpu_fused(intermediate)
+            # DEBUG: synchronize before quantizing so a pending async CUDA error from an earlier kernel surfaces here, not in the quantize call
+            torch.cuda.synchronize()
+            try:
+                x_fp4, x_sf, gsa_l2_gpu = quantize_nvfp4_gpu_fused(intermediate)
+            except RuntimeError as e:
+                print(f"  SE L2 quantize FAILED: {e}", flush=True)
+                print(f"  intermediate: shape={tuple(intermediate.shape)} dtype={intermediate.dtype} dev={intermediate.device} contiguous={intermediate.is_contiguous()}", flush=True)
+                raise
+            torch.cuda.synchronize()  # DEBUG: catch async errors from quantize
             # Copy first element of gsa (scalar for single-expert) to pre-allocated buffer.
             # Using scalar assignment avoids copy_() from view which caused cudaErrorInvalidValue
             # on non-contiguous gsa_gpu slices (gsa_gpu[:1].reshape(1) — view of expanded tensor).