DEBUG: add sync inside quantize_nvfp4_gpu_fused to catch async errors

This commit is contained in:
2026-06-04 01:05:47 +00:00
parent 55def5eef9
commit e77455c3ba
2 changed files with 9 additions and 9 deletions

View File

@@ -365,18 +365,16 @@ class Nvfp4SharedExpert:
from dsv4.ops.quantize import quantize_nvfp4_gpu_fused
if not intermediate.is_contiguous():
intermediate = intermediate.contiguous()
# DEBUG: sync before quantize to isolate which kernel fails
torch.cuda.synchronize()
# DEBUG: isolate async CUDA error
torch.cuda.synchronize() # catch any prior async error
x_fp4, x_sf, gsa_l2_gpu = quantize_nvfp4_gpu_fused(intermediate)
try:
x_fp4, x_sf, gsa_l2_gpu = quantize_nvfp4_gpu_fused(intermediate)
torch.cuda.synchronize() # catch error from quantize kernels
except RuntimeError as e:
print(f" SE L2 quantize FAILED: {e}", flush=True)
print(f" intermediate: shape={tuple(intermediate.shape)} dtype={intermediate.dtype} dev={intermediate.device} contiguous={intermediate.is_contiguous()}", flush=True)
print(f" SE L2: quantize_nvfp4_gpu_fused FAILED after sync: {e}", flush=True)
print(f" intermediate: shape={tuple(intermediate.shape)} dtype={intermediate.dtype} dev={intermediate.device}", flush=True)
raise
torch.cuda.synchronize() # DEBUG: catch async errors from quantize
# Copy first element of gsa (scalar for single-expert) to pre-allocated buffer.
# Using scalar assignment avoids copy_() from view which caused cudaErrorInvalidValue
# on non-contiguous gsa_gpu slices (gsa_gpu[:1].reshape(1) — view of expanded tensor).
# Copy first element of gsa to pre-allocated buffer.
self._l2_gsa_buf[0] = gsa_l2_gpu[0] # scalar GPU → GPU, no sync, graph-capturable
else:
x_fp4, x_sf = quantize_activation_nvfp4(

View File

@@ -334,6 +334,8 @@ def quantize_nvfp4_gpu_fused(x_bf16, divisor=6.0 * 448.0):
# For M=1: gsa_gpu is (1,) contiguous — zero allocation
quant_mod = get_cuda_module("fused_amax_quantize", ["fused_amax_quantize.cu"])
x_fp4, x_sf = quant_mod.quantize_nvfp4_from_buffer(x_bf16, gsa_gpu)
# DEBUG: sync to catch async errors from the quantize kernels
torch.cuda.synchronize()
return x_fp4, x_sf, gsa_gpu