Use 16 experts for MoE runner test so that it fits in memory

This commit is contained in:
2026-05-19 18:35:40 +00:00
parent daa59a7c75
commit 152b0749df

View File

@@ -38,11 +38,13 @@ def rms(x, w, eps=1e-6):
return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype)
def pack_expert_weights(wm, G, layer_id=2, num_local_experts=384):
"""Pack per-expert weights into stacked format for CuTeDSLMoERunner."""
def pack_expert_weights(wm, G, layer_id=2, num_local_experts=16):
"""Pack per-expert weights into stacked format for CuTeDSLMoERunner.
Only loads the first num_local_experts to fit in memory.
"""
m = f"model.layers.{layer_id}.mlp"
# Load all expert weights and stack
# Load expert weights and stack (only first num_local_experts)
gate_ws, gate_sfs, gate_gss = [], [], []
up_ws, up_sfs, up_gss = [], [], []
down_ws, down_sfs, down_gss = [], [], []
@@ -109,9 +111,8 @@ def test_moe_runner(layer_id=2):
fnorm = G(f"{p}.post_attention_layernorm.weight")
print(f" Packing expert weights (384 experts)...")
# This will take a while and use a LOT of memory
# Let's use fewer experts for testing
num_local_experts = 384
# Test with fewer experts to fit in memory
num_local_experts = 16 # Use 16 experts (out of 384) for testing
# Create the runner first, then prepare weights
intermediate_size = INTERMEDIATE # 3072
@@ -153,6 +154,7 @@ def test_moe_runner(layer_id=2):
normed = rms(hidden, fnorm, EPS)
topk_ids = torch.randint(0, num_local_experts, (num_tokens, TOPK), device=DEV)
print(f" {num_tokens} tokens: input amax={normed.amax():.4f} NaN={torch.isnan(normed).any()}")
topk_weights = torch.softmax(torch.randn(num_tokens, TOPK, device=DEV), dim=-1)
print(f" {num_tokens} tokens: input amax={normed.amax():.4f} NaN={torch.isnan(normed).any()}")