diff --git a/tests/test_moe_runner_nan_b200.py b/tests/test_moe_runner_nan_b200.py
index 97e7ebe4..9f279bcc 100644
--- a/tests/test_moe_runner_nan_b200.py
+++ b/tests/test_moe_runner_nan_b200.py
@@ -38,11 +38,13 @@ def rms(x, w, eps=1e-6):
     return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype)
 
 
-def pack_expert_weights(wm, G, layer_id=2, num_local_experts=384):
-    """Pack per-expert weights into stacked format for CuTeDSLMoERunner."""
+def pack_expert_weights(wm, G, layer_id=2, num_local_experts=16):
+    """Pack per-expert weights into stacked format for CuTeDSLMoERunner.
+    Only the first ``num_local_experts`` experts are loaded so the stacked weights fit in memory.
+    """
     m = f"model.layers.{layer_id}.mlp"
     
-    # Load all expert weights and stack
+    # Load expert weights and stack (only first num_local_experts)
     gate_ws, gate_sfs, gate_gss = [], [], []
     up_ws, up_sfs, up_gss = [], [], []
     down_ws, down_sfs, down_gss = [], [], []
@@ -109,9 +111,8 @@ def test_moe_runner(layer_id=2):
     fnorm = G(f"{p}.post_attention_layernorm.weight")
     
     print(f"  Packing expert weights (384 experts)...")
-    # This will take a while and use a LOT of memory
-    # Let's use fewer experts for testing
-    num_local_experts = 384
+    # Test with fewer experts to fit in memory
+    num_local_experts = 16  # Use 16 experts (out of 384) for testing
     
     # Create the runner first, then prepare weights
     intermediate_size = INTERMEDIATE  # 3072
@@ -153,6 +154,7 @@ def test_moe_runner(layer_id=2):
         normed = rms(hidden, fnorm, EPS)
         
         topk_ids = torch.randint(0, num_local_experts, (num_tokens, TOPK), device=DEV)
+        print(f"  {num_tokens} tokens: input amax={normed.amax():.4f} NaN={torch.isnan(normed).any()}")
         topk_weights = torch.softmax(torch.randn(num_tokens, TOPK, device=DEV), dim=-1)
         
         print(f"  {num_tokens} tokens: input amax={normed.amax():.4f} NaN={torch.isnan(normed).any()}")