diff --git a/vllm/patches/fused_moe/experts/cutedsl_moe.py b/vllm/patches/fused_moe/experts/cutedsl_moe.py index 0f56b3d4..5a8398ad 100644 --- a/vllm/patches/fused_moe/experts/cutedsl_moe.py +++ b/vllm/patches/fused_moe/experts/cutedsl_moe.py @@ -148,12 +148,12 @@ class CuTeDSLMoEExperts(mk.FusedMoEExpertsModular): # apply() before delegating to our expert impl, so we can't set the # weights to None. Instead, replace with a shape-preserving dummy on CPU # to free GPU memory while keeping the shape metadata accessible. - layer.w13_weight = torch.empty( + layer.w13_weight = torch.nn.Parameter(torch.empty( num_experts, 2 * intermediate_size, hidden_size // 2, - device='cpu', dtype=torch.uint8) - layer.w2_weight = torch.empty( + device='cpu', dtype=torch.uint8), requires_grad=False) + layer.w2_weight = torch.nn.Parameter(torch.empty( num_experts, hidden_size, intermediate_size // 2, - device='cpu', dtype=torch.uint8) + device='cpu', dtype=torch.uint8), requires_grad=False) layer.w13_weight_scale = None layer.w2_weight_scale = None layer.w13_weight_scale_2 = None