From 5ac151d0a59598b6c68e0b3561929bdaf932efb3 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Tue, 12 May 2026 13:10:32 +0000 Subject: [PATCH] debug: print tensor dtypes/shapes at C++ call boundary in fp8_nvfp4_mega_moe --- deep_gemm/mega/__init__.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/deep_gemm/mega/__init__.py b/deep_gemm/mega/__init__.py index ef9a4a3..ca60573 100644 --- a/deep_gemm/mega/__init__.py +++ b/deep_gemm/mega/__init__.py @@ -317,6 +317,16 @@ def fp8_nvfp4_mega_moe(y: torch.Tensor, Activation format: E2M1 packed uint8 + UE4M3 scales (computed by staging kernel) Recipe: (1, 1, 16) — kGranK=16 for NVFP4 group_size=16. """ + for name, t in [("l1_w", l1_weights), ("l1_w_sf", l1_weights_sf), + ("l2_w", l2_weights), ("l2_w_sf", l2_weights_sf)]: + print(f"[debug] {name}: dtype={t.dtype} shape={tuple(t.shape)} contig={t.is_contiguous()}", flush=True) + + # Also check symm buffer views + for name, t in [("sym_x", sym_buffer.x), ("sym_x_sf", sym_buffer.x_sf), + ("sym_l1_acts", sym_buffer.l1_acts), ("sym_l1_acts_sf", sym_buffer.l1_acts_sf), + ("sym_l2_acts", sym_buffer.l2_acts), ("sym_l2_acts_sf", sym_buffer.l2_acts_sf)]: + print(f"[debug] {name}: dtype={t.dtype} shape={tuple(t.shape)} contig={t.is_contiguous()}", flush=True) + _C.fp8_nvfp4_mega_moe( y, l1_weights, l2_weights,