From 7b3a85346513dd0856be8859ffef06e602cf4dcc Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Fri, 15 May 2026 07:10:13 +0000
Subject: [PATCH] More debugging: drop MEGA_MOE_DEBUG gate on GEMM-IN prints, add one-time expert sanity dump

---
 src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/kernel.py | 2 +-
 src/nvfp4_megamoe_kernel/nvfp4_mega_moe.py            | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/kernel.py b/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/kernel.py
index 5c8b3ab2..f1262d77 100644
--- a/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/kernel.py
+++ b/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/kernel.py
@@ -76,7 +76,7 @@ def cutlass_grouped_nvfp4_gemm(
         M_expert = token_indices.shape[0]
         
         # DEBUG: verify data going into GEMM
-        if MEGA_MOE_DEBUG and e < 3 and M_expert > 0:
+        if e < 3 and M_expert > 0:
             print(f"[GEMM-IN] expert={e} M={M_expert} N={N} K={K} "
                   f"w shape={expert_w.shape} w_sf shape={expert_w_sf.shape} "
                   f"w absmax={expert_w.view(torch.int8).abs().max().item()} "
diff --git a/src/nvfp4_megamoe_kernel/nvfp4_mega_moe.py b/src/nvfp4_megamoe_kernel/nvfp4_mega_moe.py
index 63ab1c21..971f33a2 100644
--- a/src/nvfp4_megamoe_kernel/nvfp4_mega_moe.py
+++ b/src/nvfp4_megamoe_kernel/nvfp4_mega_moe.py
@@ -306,6 +306,15 @@ def nvfp4_mega_moe_full(
     l1_w, l1_sf = transformed_l1_weights
     l2_w, l2_sf = transformed_l2_weights
 
+    # One-time expert sanity dump — are experts actually distinct? NOTE(review): needs `self` in scope here; confirm.
+    if not getattr(self, '_expert_sanity', False):
+        self._expert_sanity = True
+        for e in range(min(4, l1_w.shape[0])):
+            w_sample = l1_w[e].view(torch.uint8)[:8, :8]
+            sf_sample = l1_sf[e].to(torch.float32)[:4, :4]
+            print(f"[EXPERT-SANITY e={e}] w_bytes[:8,:8]={w_sample.flatten().tolist()[:16]}")
+            print(f"[EXPERT-SANITY e={e}] sf[:4,:4]={sf_sample.flatten().tolist()[:8]}")
+
     # Step 1: Read staged activation from symm_buffer
     x_fp4 = symm_buffer.x[:num_tokens]
     x_sf = symm_buffer.x_sf[:num_tokens]