diff --git a/dsv4/layers/shared_expert.py b/dsv4/layers/shared_expert.py
index 9a9e2ae2..1cef8373 100644
--- a/dsv4/layers/shared_expert.py
+++ b/dsv4/layers/shared_expert.py
@@ -365,18 +365,16 @@ class Nvfp4SharedExpert:
             from dsv4.ops.quantize import quantize_nvfp4_gpu_fused
             if not intermediate.is_contiguous():
                 intermediate = intermediate.contiguous()
-            # DEBUG: sync before quantize to isolate which kernel fails
-            torch.cuda.synchronize()
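+            # DEBUG (temporary): sync before and after quantize to isolate an async CUDA error; remove before CUDA-graph capture (host syncs are not capturable).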
+            torch.cuda.synchronize()  # catch any prior async error
+            x_fp4, x_sf, gsa_l2_gpu = quantize_nvfp4_gpu_fused(intermediate)
             try:
-                x_fp4, x_sf, gsa_l2_gpu = quantize_nvfp4_gpu_fused(intermediate)
+                torch.cuda.synchronize()  # catch error from quantize kernels
             except RuntimeError as e:
-                print(f"  SE L2 quantize FAILED: {e}", flush=True)
-                print(f"  intermediate: shape={tuple(intermediate.shape)} dtype={intermediate.dtype} dev={intermediate.device} contiguous={intermediate.is_contiguous()}", flush=True)
+                print(f"  SE L2: quantize_nvfp4_gpu_fused FAILED after sync: {e}", flush=True)
+                print(f"  intermediate: shape={tuple(intermediate.shape)} dtype={intermediate.dtype} dev={intermediate.device}", flush=True)
                 raise
-            torch.cuda.synchronize()  # DEBUG: catch async errors from quantize
-            # Copy first element of gsa (scalar for single-expert) to pre-allocated buffer.
-            # Using scalar assignment avoids copy_() from view which caused cudaErrorInvalidValue
-            # on non-contiguous gsa_gpu slices (gsa_gpu[:1].reshape(1) — view of expanded tensor).
+            # Copy the first element of gsa_l2_gpu into the pre-allocated _l2_gsa_buf.
             self._l2_gsa_buf[0] = gsa_l2_gpu[0]  # scalar GPU → GPU, no sync, graph-capturable
         else:
             x_fp4, x_sf = quantize_activation_nvfp4(
diff --git a/dsv4/ops/quantize.py b/dsv4/ops/quantize.py
index 3a189555..10a51388 100644
--- a/dsv4/ops/quantize.py
+++ b/dsv4/ops/quantize.py
@@ -334,6 +334,8 @@ def quantize_nvfp4_gpu_fused(x_bf16, divisor=6.0 * 448.0):
     # For M=1: gsa_gpu is (1,) contiguous — zero allocation
     quant_mod = get_cuda_module("fused_amax_quantize", ["fused_amax_quantize.cu"])
     x_fp4, x_sf = quant_mod.quantize_nvfp4_from_buffer(x_bf16, gsa_gpu)
+    # DEBUG (temporary): host sync so async quantize-kernel errors raise from this call itself, not from a later caller-side sync; remove once diagnosed.
+    torch.cuda.synchronize()
     return x_fp4, x_sf, gsa_gpu