DEBUG: isolate which kernel causes cudaErrorInvalidValue in SE L2 path

This commit is contained in:
2026-06-04 00:41:28 +00:00
parent b314fde9b7
commit 5e3ced0b60

View File

@@ -365,7 +365,15 @@ class Nvfp4SharedExpert:
from dsv4.ops.quantize import quantize_nvfp4_gpu_fused
if not intermediate.is_contiguous():
intermediate = intermediate.contiguous()
x_fp4, x_sf, gsa_l2_gpu = quantize_nvfp4_gpu_fused(intermediate)
# DEBUG: sync before quantize to isolate which kernel fails
torch.cuda.synchronize()
try:
x_fp4, x_sf, gsa_l2_gpu = quantize_nvfp4_gpu_fused(intermediate)
except RuntimeError as e:
print(f" SE L2 quantize FAILED: {e}", flush=True)
print(f" intermediate: shape={tuple(intermediate.shape)} dtype={intermediate.dtype} dev={intermediate.device} contiguous={intermediate.is_contiguous()}", flush=True)
raise
torch.cuda.synchronize() # DEBUG: catch async errors from quantize
# Copy the first element of gsa (a scalar in the single-expert case) into the pre-allocated buffer.
# Using scalar assignment avoids copy_() from a view, which caused cudaErrorInvalidValue
# on non-contiguous gsa_gpu slices (gsa_gpu[:1].reshape(1) is a view of an expanded tensor).