From 5c770c68caf44148cdd79dd9e7cea7535e088ef2 Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Tue, 19 May 2026 04:50:31 +0000
Subject: [PATCH] Keep MoE scale tensors: framework warmup needs them

The framework's deep_gemm_warmup calls get_fused_moe_quant_config,
which accesses w13_input_scale etc. Setting the scale tensors to None
caused a TypeError (float / NoneType). Keep the scales (small tensors)
and free only the large weight tensors.
---
 vllm/patches/fused_moe/experts/cutedsl_moe.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/vllm/patches/fused_moe/experts/cutedsl_moe.py b/vllm/patches/fused_moe/experts/cutedsl_moe.py
index 5c900e18..d8dfa93e 100644
--- a/vllm/patches/fused_moe/experts/cutedsl_moe.py
+++ b/vllm/patches/fused_moe/experts/cutedsl_moe.py
@@ -148,20 +148,15 @@ class CuTeDSLMoEExperts(mk.FusedMoEExpertsModular):
         # apply() before delegating to our expert impl, so we can't set the
         # weights to None. Instead, replace with a shape-preserving dummy on CPU
         # to free GPU memory while keeping the shape metadata accessible.
+        # Free the large weight tensors — they're now in the runner.
+        # Keep the scale tensors (small) because the framework's warmup
+        # and quant config construction need them.
         layer.w13_weight = torch.nn.Parameter(torch.empty(
             num_experts, 2 * intermediate_size, hidden_size // 2,
             device='cpu', dtype=torch.uint8), requires_grad=False)
         layer.w2_weight = torch.nn.Parameter(torch.empty(
             num_experts, hidden_size, intermediate_size // 2,
             device='cpu', dtype=torch.uint8), requires_grad=False)
-        layer.w13_weight_scale = None
-        layer.w2_weight_scale = None
-        layer.w13_weight_scale_2 = None
-        layer.w2_weight_scale_2 = None
-        if hasattr(layer, 'w13_input_scale'):
-            layer.w13_input_scale = None
-        if hasattr(layer, 'w2_input_scale'):
-            layer.w2_input_scale = None
 
         # Create the CuTeDSL runner
         self._runner = CuTeDSLMoERunner(