DEBUG: add sync inside quantize_nvfp4_gpu_fused to catch async errors
This commit is contained in:
@@ -365,18 +365,16 @@ class Nvfp4SharedExpert:
|
||||
from dsv4.ops.quantize import quantize_nvfp4_gpu_fused
|
||||
if not intermediate.is_contiguous():
|
||||
intermediate = intermediate.contiguous()
|
||||
# DEBUG: sync before quantize to isolate which kernel fails
|
||||
torch.cuda.synchronize()
|
||||
# DEBUG: isolate async CUDA error
|
||||
torch.cuda.synchronize() # catch any prior async error
|
||||
x_fp4, x_sf, gsa_l2_gpu = quantize_nvfp4_gpu_fused(intermediate)
|
||||
try:
|
||||
x_fp4, x_sf, gsa_l2_gpu = quantize_nvfp4_gpu_fused(intermediate)
|
||||
torch.cuda.synchronize() # catch error from quantize kernels
|
||||
except RuntimeError as e:
|
||||
print(f" SE L2 quantize FAILED: {e}", flush=True)
|
||||
print(f" intermediate: shape={tuple(intermediate.shape)} dtype={intermediate.dtype} dev={intermediate.device} contiguous={intermediate.is_contiguous()}", flush=True)
|
||||
print(f" SE L2: quantize_nvfp4_gpu_fused FAILED after sync: {e}", flush=True)
|
||||
print(f" intermediate: shape={tuple(intermediate.shape)} dtype={intermediate.dtype} dev={intermediate.device}", flush=True)
|
||||
raise
|
||||
torch.cuda.synchronize() # DEBUG: catch async errors from quantize
|
||||
# Copy first element of gsa (scalar for single-expert) to pre-allocated buffer.
|
||||
# Using scalar assignment avoids copy_() from view which caused cudaErrorInvalidValue
|
||||
# on non-contiguous gsa_gpu slices (gsa_gpu[:1].reshape(1) — view of expanded tensor).
|
||||
# Copy first element of gsa to pre-allocated buffer.
|
||||
self._l2_gsa_buf[0] = gsa_l2_gpu[0] # scalar GPU → GPU, no sync, graph-capturable
|
||||
else:
|
||||
x_fp4, x_sf = quantize_activation_nvfp4(
|
||||
|
||||
@@ -334,6 +334,8 @@ def quantize_nvfp4_gpu_fused(x_bf16, divisor=6.0 * 448.0):
|
||||
# For M=1: gsa_gpu is (1,) contiguous — zero allocation
|
||||
quant_mod = get_cuda_module("fused_amax_quantize", ["fused_amax_quantize.cu"])
|
||||
x_fp4, x_sf = quant_mod.quantize_nvfp4_from_buffer(x_bf16, gsa_gpu)
|
||||
# DEBUG: sync to catch async errors from the quantize kernels
|
||||
torch.cuda.synchronize()
|
||||
return x_fp4, x_sf, gsa_gpu
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user