Use 16 experts for MoE runner test (fits in memory)
This commit is contained in:
@@ -38,11 +38,13 @@ def rms(x, w, eps=1e-6):
|
||||
return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype)
|
||||
|
||||
|
||||
def pack_expert_weights(wm, G, layer_id=2, num_local_experts=384):
|
||||
"""Pack per-expert weights into stacked format for CuTeDSLMoERunner."""
|
||||
def pack_expert_weights(wm, G, layer_id=2, num_local_experts=16):
|
||||
"""Pack per-expert weights into stacked format for CuTeDSLMoERunner.
|
||||
Only loads the first num_local_experts to fit in memory.
|
||||
"""
|
||||
m = f"model.layers.{layer_id}.mlp"
|
||||
|
||||
# Load all expert weights and stack
|
||||
# Load expert weights and stack (only first num_local_experts)
|
||||
gate_ws, gate_sfs, gate_gss = [], [], []
|
||||
up_ws, up_sfs, up_gss = [], [], []
|
||||
down_ws, down_sfs, down_gss = [], [], []
|
||||
@@ -109,9 +111,8 @@ def test_moe_runner(layer_id=2):
|
||||
fnorm = G(f"{p}.post_attention_layernorm.weight")
|
||||
|
||||
print(f" Packing expert weights (384 experts)...")
|
||||
# This will take a while and use a LOT of memory
|
||||
# Let's use fewer experts for testing
|
||||
num_local_experts = 384
|
||||
# Test with fewer experts to fit in memory
|
||||
num_local_experts = 16 # Use 16 experts (out of 384) for testing
|
||||
|
||||
# Create the runner first, then prepare weights
|
||||
intermediate_size = INTERMEDIATE # 3072
|
||||
@@ -153,6 +154,7 @@ def test_moe_runner(layer_id=2):
|
||||
normed = rms(hidden, fnorm, EPS)
|
||||
|
||||
topk_ids = torch.randint(0, num_local_experts, (num_tokens, TOPK), device=DEV)
|
||||
print(f" {num_tokens} tokens: input amax={normed.amax():.4f} NaN={torch.isnan(normed).any()}")
|
||||
topk_weights = torch.softmax(torch.randn(num_tokens, TOPK, device=DEV), dim=-1)
|
||||
|
||||
print(f" {num_tokens} tokens: input amax={normed.amax():.4f} NaN={torch.isnan(normed).any()}")
|
||||
|
||||
Reference in New Issue
Block a user