Use 16 experts for MoE runner test (fits in memory)

2026-05-19 18:35:40 +00:00
parent daa59a7c75
commit 152b0749df
1 changed file with 8 additions and 6 deletions
--- a/tests/test_moe_runner_nan_b200.py
+++ b/tests/test_moe_runner_nan_b200.py
@@ -38,11 +38,13 @@ def rms(x, w, eps=1e-6):
    return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype)


-def pack_expert_weights(wm, G, layer_id=2, num_local_experts=384):
-    """Pack per-expert weights into stacked format for CuTeDSLMoERunner."""
+def pack_expert_weights(wm, G, layer_id=2, num_local_experts=16):
+    """Pack per-expert weights into stacked format for CuTeDSLMoERunner.
+    Only loads the first num_local_experts to fit in memory.
+    """
    m = f"model.layers.{layer_id}.mlp"
    
-    # Load all expert weights and stack
+    # Load expert weights and stack (only first num_local_experts)
    gate_ws, gate_sfs, gate_gss = [], [], []
    up_ws, up_sfs, up_gss = [], [], []
    down_ws, down_sfs, down_gss = [], [], []
@@ -109,9 +111,8 @@ def test_moe_runner(layer_id=2):
    fnorm = G(f"{p}.post_attention_layernorm.weight")
    
    print(f"  Packing expert weights (384 experts)...")
-    # This will take a while and use a LOT of memory
-    # Let's use fewer experts for testing
-    num_local_experts = 384
+    # Test with fewer experts to fit in memory
+    num_local_experts = 16  # Use 16 experts (out of 384) for testing
    
    # Create the runner first, then prepare weights
    intermediate_size = INTERMEDIATE  # 3072
@@ -153,6 +154,7 @@ def test_moe_runner(layer_id=2):
        normed = rms(hidden, fnorm, EPS)
        
        topk_ids = torch.randint(0, num_local_experts, (num_tokens, TOPK), device=DEV)
+        print(f"  {num_tokens} tokens: input amax={normed.amax():.4f} NaN={torch.isnan(normed).any()}")
        topk_weights = torch.softmax(torch.randn(num_tokens, TOPK, device=DEV), dim=-1)
        
        print(f"  {num_tokens} tokens: input amax={normed.amax():.4f} NaN={torch.isnan(normed).any()}")