Relax Transformers modeling backend MoE experts check (#28952)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
@@ -256,7 +256,14 @@ class MoEMixin(MixtureOfExperts):
         def _recursive_replace(module: nn.Module, prefix: str):
             for child_name, child_module in module.named_children():
                 qual_name = maybe_prefix(prefix, child_name)
-                if child_name == "experts" and isinstance(child_module, nn.ModuleList):
+                # Naive implementations will have experts as ModuleList
+                is_modulelist = isinstance(child_module, nn.ModuleList)
+                # Packed implementations will have experts as 3D tensors of shapes like:
+                # gate_up_proj = (num_experts, 2 * intermediate_size, hidden_size)
+                # down_proj = (num_experts, intermediate_size, hidden_size)
+                params = list(child_module.parameters())
+                is_3d = len(params) > 0 and all(p.ndim == 3 for p in params)
+                if child_name == "experts" and (is_modulelist or is_3d):
                     # Alias for readability
                     mlp = module
                     experts = child_module
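For context, below is a minimal, self-contained sketch of the two expert layouts the relaxed check now accepts. It is not part of the commit: the NaiveExperts and PackedExperts classes and their sizes are illustrative stand-ins, and looks_like_experts is a standalone copy of the detection logic from the diff.

import torch
from torch import nn


class NaiveExperts(nn.Module):
    """Experts held in an nn.ModuleList, one module per expert."""

    def __init__(self, num_experts=4, hidden_size=8, intermediate_size=16):
        super().__init__()
        self.experts = nn.ModuleList(
            nn.Linear(hidden_size, intermediate_size) for _ in range(num_experts)
        )


class PackedExperts(nn.Module):
    """Experts packed into 3D parameters with a leading num_experts dim."""

    def __init__(self, num_experts=4, hidden_size=8, intermediate_size=16):
        super().__init__()
        self.experts = nn.Module()
        self.experts.gate_up_proj = nn.Parameter(
            torch.empty(num_experts, 2 * intermediate_size, hidden_size)
        )
        self.experts.down_proj = nn.Parameter(
            torch.empty(num_experts, intermediate_size, hidden_size)
        )


def looks_like_experts(child_module: nn.Module) -> bool:
    # Mirrors the relaxed check in the diff above.
    is_modulelist = isinstance(child_module, nn.ModuleList)
    params = list(child_module.parameters())
    is_3d = len(params) > 0 and all(p.ndim == 3 for p in params)
    return is_modulelist or is_3d


print(looks_like_experts(NaiveExperts().experts))   # True: it is a ModuleList
print(looks_like_experts(PackedExperts().experts))  # True: every parameter is 3D

Note that the ModuleList case is not covered by the 3D test (each nn.Linear contributes a 2D weight and a 1D bias), which is why the check keeps the explicit isinstance branch alongside the new is_3d one.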