Fix MoE w1=None crash: keep shape-preserving dummy weights on CPU

The modular kernel framework reads w1.shape[0] in its outer apply()
before delegating to our expert impl, so setting layer.w13_weight = None
caused an AttributeError (None has no .shape). Replace w13_weight and
w2_weight with shape-preserving dummy tensors on CPU instead, to free
GPU memory while keeping the shape metadata accessible.
This commit is contained in:
2026-05-19 04:17:10 +00:00
parent c289c44920
commit b06dcb40dc

View File

@@ -144,8 +144,16 @@ class CuTeDSLMoEExperts(mk.FusedMoEExpertsModular):
# We have views into the same memory (l1_fp4, l2_fp4), but the runner
# will create its own copies in _ensure_stacked. Free the layer refs
# now so the memory can be reclaimed when the views are no longer held.
layer.w13_weight = None
layer.w2_weight = None
# NOTE: The modular kernel framework reads w1.shape[0] in its outer
# apply() before delegating to our expert impl, so we can't set the
# weights to None. Instead, replace them with shape-preserving dummy
# tensors on CPU to free GPU memory while keeping the shape metadata
# accessible.
layer.w13_weight = torch.empty(
num_experts, 2 * intermediate_size, hidden_size // 2,
device='cpu', dtype=torch.uint8)
layer.w2_weight = torch.empty(
num_experts, hidden_size, intermediate_size // 2,
device='cpu', dtype=torch.uint8)
layer.w13_weight_scale = None
layer.w2_weight_scale = None
layer.w13_weight_scale_2 = None