diff --git a/tests/test_moe_runner_nan_b200.py b/tests/test_moe_runner_nan_b200.py index 97e7ebe4..9f279bcc 100644 --- a/tests/test_moe_runner_nan_b200.py +++ b/tests/test_moe_runner_nan_b200.py @@ -38,11 +38,13 @@ def rms(x, w, eps=1e-6): return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype) -def pack_expert_weights(wm, G, layer_id=2, num_local_experts=384): - """Pack per-expert weights into stacked format for CuTeDSLMoERunner.""" +def pack_expert_weights(wm, G, layer_id=2, num_local_experts=16): + """Pack per-expert weights into stacked format for CuTeDSLMoERunner. + Only loads the first num_local_experts to fit in memory. + """ m = f"model.layers.{layer_id}.mlp" - # Load all expert weights and stack + # Load expert weights and stack (only first num_local_experts) gate_ws, gate_sfs, gate_gss = [], [], [] up_ws, up_sfs, up_gss = [], [], [] down_ws, down_sfs, down_gss = [], [], [] @@ -109,9 +111,8 @@ def test_moe_runner(layer_id=2): fnorm = G(f"{p}.post_attention_layernorm.weight") print(f" Packing expert weights (384 experts)...") - # This will take a while and use a LOT of memory - # Let's use fewer experts for testing - num_local_experts = 384 + # Test with fewer experts to fit in memory + num_local_experts = 16 # Use 16 experts (out of 384) for testing # Create the runner first, then prepare weights intermediate_size = INTERMEDIATE # 3072 @@ -153,6 +154,7 @@ def test_moe_runner(layer_id=2): normed = rms(hidden, fnorm, EPS) topk_ids = torch.randint(0, num_local_experts, (num_tokens, TOPK), device=DEV) + print(f" {num_tokens} tokens: input amax={normed.amax():.4f} NaN={torch.isnan(normed).any()}") topk_weights = torch.softmax(torch.randn(num_tokens, TOPK, device=DEV), dim=-1) print(f" {num_tokens} tokens: input amax={normed.amax():.4f} NaN={torch.isnan(normed).any()}")