diff --git a/dsv4/layers/moe.py b/dsv4/layers/moe.py index 0c682805..38ed050a 100644 --- a/dsv4/layers/moe.py +++ b/dsv4/layers/moe.py @@ -456,6 +456,8 @@ class Nvfp4MoE: # Phase 2: Full-buffer swizzle (no CPU sync, no Python loops) # During graph capture, Python view ops (reshape, transpose) are not allowed. # Use CUDA swizzle kernel instead. + rows = padded_x_sf.shape[0] + cols = padded_x_sf.shape[1] if torch.cuda.is_current_stream_capturing(): from dsv4.kernels.cuda.loader import get_cuda_module mod = get_cuda_module("blackwell_swizzle", ["blackwell_swizzle.cu"])