From 5c770c68caf44148cdd79dd9e7cea7535e088ef2 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Tue, 19 May 2026 04:50:31 +0000 Subject: [PATCH] Keep MoE scale tensors: framework warmup needs them The framework's deep_gemm_warmup calls get_fused_moe_quant_config which accesses w13_input_scale etc. Setting them to None caused TypeError: float / NoneType. Keep scales (small tensors) and only free the large weight tensors. --- vllm/patches/fused_moe/experts/cutedsl_moe.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/vllm/patches/fused_moe/experts/cutedsl_moe.py b/vllm/patches/fused_moe/experts/cutedsl_moe.py index 5c900e18..d8dfa93e 100644 --- a/vllm/patches/fused_moe/experts/cutedsl_moe.py +++ b/vllm/patches/fused_moe/experts/cutedsl_moe.py @@ -148,20 +148,15 @@ class CuTeDSLMoEExperts(mk.FusedMoEExpertsModular): # apply() before delegating to our expert impl, so we can't set the # weights to None. Instead, replace with a shape-preserving dummy on CPU # to free GPU memory while keeping the shape metadata accessible. + # Free the large weight tensors — they're now in the runner. + # Keep the scale tensors (small) because the framework's warmup + # and quant config construction need them. layer.w13_weight = torch.nn.Parameter(torch.empty( num_experts, 2 * intermediate_size, hidden_size // 2, device='cpu', dtype=torch.uint8), requires_grad=False) layer.w2_weight = torch.nn.Parameter(torch.empty( num_experts, hidden_size, intermediate_size // 2, device='cpu', dtype=torch.uint8), requires_grad=False) - layer.w13_weight_scale = None - layer.w2_weight_scale = None - layer.w13_weight_scale_2 = None - layer.w2_weight_scale_2 = None - if hasattr(layer, 'w13_input_scale'): - layer.w13_input_scale = None - if hasattr(layer, 'w2_input_scale'): - layer.w2_input_scale = None # Create the CuTeDSL runner self._runner = CuTeDSLMoERunner(