diff --git a/dsv4/layers/shared_expert.py b/dsv4/layers/shared_expert.py index a8b5344c..9a9e2ae2 100644 --- a/dsv4/layers/shared_expert.py +++ b/dsv4/layers/shared_expert.py @@ -365,7 +365,15 @@ class Nvfp4SharedExpert: from dsv4.ops.quantize import quantize_nvfp4_gpu_fused if not intermediate.is_contiguous(): intermediate = intermediate.contiguous() - x_fp4, x_sf, gsa_l2_gpu = quantize_nvfp4_gpu_fused(intermediate) + # DEBUG: sync before quantize to isolate which kernel fails + torch.cuda.synchronize() + try: + x_fp4, x_sf, gsa_l2_gpu = quantize_nvfp4_gpu_fused(intermediate) + except RuntimeError as e: + print(f" SE L2 quantize FAILED: {e}", flush=True) + print(f" intermediate: shape={tuple(intermediate.shape)} dtype={intermediate.dtype} dev={intermediate.device} contiguous={intermediate.is_contiguous()}", flush=True) + raise + torch.cuda.synchronize() # DEBUG: catch async errors from quantize # Copy first element of gsa (scalar for single-expert) to pre-allocated buffer. # Using scalar assignment avoids copy_() from view which caused cudaErrorInvalidValue # on non-contiguous gsa_gpu slices (gsa_gpu[:1].reshape(1) — view of expanded tensor).