Fix: wrap dummy MoE weights in nn.Parameter

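PyTorch requires an attribute that is already registered as a module parameter
to be assigned an nn.Parameter or None. A plain tensor returned by torch.empty
can't be assigned to a registered parameter slot (nn.Module.__setattr__ raises
TypeError), so the dummy weights are wrapped in nn.Parameter with
requires_grad=False.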
This commit is contained in:
2026-05-19 04:21:35 +00:00
parent b06dcb40dc
commit f023b3b2c6

View File

@@ -148,12 +148,12 @@ class CuTeDSLMoEExperts(mk.FusedMoEExpertsModular):
 # apply() before delegating to our expert impl, so we can't set the
 # weights to None. Instead, replace with a shape-preserving dummy on CPU
 # to free GPU memory while keeping the shape metadata accessible.
-layer.w13_weight = torch.empty(
+layer.w13_weight = torch.nn.Parameter(torch.empty(
 num_experts, 2 * intermediate_size, hidden_size // 2,
-device='cpu', dtype=torch.uint8)
-layer.w2_weight = torch.empty(
+device='cpu', dtype=torch.uint8), requires_grad=False)
+layer.w2_weight = torch.nn.Parameter(torch.empty(
 num_experts, hidden_size, intermediate_size // 2,
-device='cpu', dtype=torch.uint8)
+device='cpu', dtype=torch.uint8), requires_grad=False)
 layer.w13_weight_scale = None
 layer.w2_weight_scale = None
 layer.w13_weight_scale_2 = None