diff --git a/vllm/patches/fused_moe/experts/cutedsl_moe.py b/vllm/patches/fused_moe/experts/cutedsl_moe.py index 5c900e18..d8dfa93e 100644 --- a/vllm/patches/fused_moe/experts/cutedsl_moe.py +++ b/vllm/patches/fused_moe/experts/cutedsl_moe.py @@ -148,20 +148,15 @@ class CuTeDSLMoEExperts(mk.FusedMoEExpertsModular): # apply() before delegating to our expert impl, so we can't set the # weights to None. Instead, replace with a shape-preserving dummy on CPU # to free GPU memory while keeping the shape metadata accessible. + # Free the large weight tensors — they're now in the runner. + # Keep the scale tensors (small) because the framework's warmup + # and quant config construction need them. layer.w13_weight = torch.nn.Parameter(torch.empty( num_experts, 2 * intermediate_size, hidden_size // 2, device='cpu', dtype=torch.uint8), requires_grad=False) layer.w2_weight = torch.nn.Parameter(torch.empty( num_experts, hidden_size, intermediate_size // 2, device='cpu', dtype=torch.uint8), requires_grad=False) - layer.w13_weight_scale = None - layer.w2_weight_scale = None - layer.w13_weight_scale_2 = None - layer.w2_weight_scale_2 = None - if hasattr(layer, 'w13_input_scale'): - layer.w13_input_scale = None - if hasattr(layer, 'w2_input_scale'): - layer.w2_input_scale = None # Create the CuTeDSL runner self._runner = CuTeDSLMoERunner(