DEBUG: isolate which kernel causes cudaErrorInvalidValue in SE L2 path
This commit is contained in:
@@ -365,7 +365,15 @@ class Nvfp4SharedExpert:
|
||||
from dsv4.ops.quantize import quantize_nvfp4_gpu_fused
|
||||
if not intermediate.is_contiguous():
|
||||
intermediate = intermediate.contiguous()
|
||||
x_fp4, x_sf, gsa_l2_gpu = quantize_nvfp4_gpu_fused(intermediate)
|
||||
# DEBUG: sync before quantize so pending async errors from earlier kernels surface here, not inside quantize
|
||||
torch.cuda.synchronize()
|
||||
try:
|
||||
x_fp4, x_sf, gsa_l2_gpu = quantize_nvfp4_gpu_fused(intermediate)
|
||||
except RuntimeError as e:
|
||||
print(f" SE L2 quantize FAILED: {e}", flush=True)
|
||||
print(f" intermediate: shape={tuple(intermediate.shape)} dtype={intermediate.dtype} dev={intermediate.device} contiguous={intermediate.is_contiguous()}", flush=True)
|
||||
raise
|
||||
torch.cuda.synchronize() # DEBUG: catch async errors from quantize
|
||||
# Copy first element of gsa (scalar for single-expert) to pre-allocated buffer.
|
||||
# Using scalar assignment avoids copy_() from a view, which caused cudaErrorInvalidValue
|
||||
# on non-contiguous gsa_gpu slices (gsa_gpu[:1].reshape(1) — view of expanded tensor).
|
||||
|
||||
Reference in New Issue
Block a user