From b06dcb40dc7251ae5e55a09b63f4813d44f8a7e3 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Tue, 19 May 2026 04:17:10 +0000 Subject: [PATCH] Fix MoE w1=None crash: keep shape-preserving dummy weights on CPU The modular kernel framework reads w1.shape[0] in its outer apply() before delegating to our expert impl. Setting layer.w13_weight = None caused AttributeError. Replace with shape-preserving CPU dummy tensors to free GPU memory while keeping shape metadata accessible. --- vllm/patches/fused_moe/experts/cutedsl_moe.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/vllm/patches/fused_moe/experts/cutedsl_moe.py b/vllm/patches/fused_moe/experts/cutedsl_moe.py index 7580efc6..0f56b3d4 100644 --- a/vllm/patches/fused_moe/experts/cutedsl_moe.py +++ b/vllm/patches/fused_moe/experts/cutedsl_moe.py @@ -144,8 +144,16 @@ class CuTeDSLMoEExperts(mk.FusedMoEExpertsModular): # We have views into the same memory (l1_fp4, l2_fp4), but the runner # will create its own copies in _ensure_stacked. Free the layer refs # now so the memory can be reclaimed when the views are no longer held. - layer.w13_weight = None - layer.w2_weight = None + # NOTE: The modular kernel framework reads w1.shape[0] in its outer + # apply() before delegating to our expert impl, so we can't set the + # weights to None. Instead, replace with a shape-preserving dummy on CPU + # to free GPU memory while keeping the shape metadata accessible. + layer.w13_weight = torch.empty( + num_experts, 2 * intermediate_size, hidden_size // 2, + device='cpu', dtype=torch.uint8) + layer.w2_weight = torch.empty( + num_experts, hidden_size, intermediate_size // 2, + device='cpu', dtype=torch.uint8) layer.w13_weight_scale = None layer.w2_weight_scale = None layer.w13_weight_scale_2 = None