actually handle expert param mapping

This commit is contained in:
2026-05-15 06:01:50 +00:00
parent f17efa340d
commit 685bce48b4

View File

@@ -211,18 +211,19 @@ class DeepseekV4FP8Config(Fp8Config):
def make_deepseek_v4_expert_params_mapping(
    num_experts: int,
) -> list[tuple[str, str, int, str]]:
    """Build the per-expert checkpoint-to-parameter name mapping.

    Checkpoints name the expert projections ``gate_proj`` / ``up_proj`` /
    ``down_proj``, while the model parameters use the ``w13_`` / ``w2_``
    prefixes. Shards ``w1`` (gate) and ``w3`` (up) both map onto the
    ``experts.w13_`` prefix; shard ``w2`` (down) maps onto ``experts.w2_``.

    Args:
        num_experts: Number of experts to generate entries for. Values
            ``<= 0`` yield an empty list.

    Returns:
        A list with three entries per expert, ordered by expert id and then
        by shard in ``w1``, ``w2``, ``w3`` order. Each entry is
        ``(param_prefix, checkpoint_name, expert_id, shard_id)``, e.g.
        ``("experts.w13_", "experts.0.gate_proj.", 0, "w1")``.
    """
    # NOTE(review): the tuple layout is presumably what the weight loader
    # consuming this mapping expects -- confirm against the caller.
    shard_to_ckpt_name = (
        ("w1", "gate_proj"),
        ("w2", "down_proj"),
        ("w3", "up_proj"),
    )
    return [
        (
            "experts.w13_" if shard_id in ("w1", "w3") else "experts.w2_",
            f"experts.{expert_id}.{ckpt_name}.",
            expert_id,
            shard_id,
        )
        for expert_id in range(num_experts)
        for shard_id, ckpt_name in shard_to_ckpt_name
    ]