actually handle expert param mapping
This commit is contained in:
@@ -211,18 +211,19 @@ class DeepseekV4FP8Config(Fp8Config):
|
||||
def make_deepseek_v4_expert_params_mapping(
    num_experts: int,
) -> list[tuple[str, str, int, str]]:
    """Build the per-expert checkpoint-name to fused-parameter mapping.

    The checkpoint names expert weights ``gate_proj`` / ``up_proj`` /
    ``down_proj``, while the model parameters use the ``w13_`` / ``w2_``
    prefixes. One entry is produced for every (expert, shard) pair, with
    experts in ascending order and shards in the order w1, w2, w3.

    Args:
        num_experts: Number of experts to generate entries for. Values
            ``<= 0`` yield an empty list.

    Returns:
        A list of ``(param_prefix, ckpt_fragment, expert_id, shard_id)``
        tuples where

        * ``param_prefix`` is ``"experts.w13_"`` for shards ``w1``/``w3``
          and ``"experts.w2_"`` for shard ``w2``;
        * ``ckpt_fragment`` is ``"experts.{expert_id}.{ckpt_name}."``;
        * ``expert_id`` is the expert index;
        * ``shard_id`` is one of ``"w1"``, ``"w2"``, ``"w3"``.
    """
    # Checkpoint uses gate_proj/up_proj/down_proj, model params use w13_/w2_
    shard_to_ckpt_name = (
        ("w1", "gate_proj"),
        ("w2", "down_proj"),
        ("w3", "up_proj"),
    )
    return [
        (
            # w1 and w3 share the w13_ parameter; w2 has its own.
            "experts.w13_" if shard_id in ("w1", "w3") else "experts.w2_",
            f"experts.{expert_id}.{ckpt_name}.",
            expert_id,
            shard_id,
        )
        for expert_id in range(num_experts)
        for shard_id, ckpt_name in shard_to_ckpt_name
    ]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user