Keep MoE scale tensors: framework warmup needs them

The framework's deep_gemm_warmup calls get_fused_moe_quant_config,
which accesses w13_input_scale etc. Setting them to None caused a
TypeError (float / NoneType). Keep the scales (small tensors) and
free only the large weight tensors.
This commit is contained in:
2026-05-19 04:50:31 +00:00
parent e0f385ac45
commit 5c770c68ca

View File

@@ -148,20 +148,15 @@ class CuTeDSLMoEExperts(mk.FusedMoEExpertsModular):
# apply() before delegating to our expert impl, so we can't set the
# weights to None. Instead, replace with a shape-preserving dummy on CPU
# to free GPU memory while keeping the shape metadata accessible.
# Free the large weight tensors — they're now in the runner.
# Keep the scale tensors (small) because the framework's warmup
# and quant config construction need them.
layer.w13_weight = torch.nn.Parameter(torch.empty(
num_experts, 2 * intermediate_size, hidden_size // 2,
device='cpu', dtype=torch.uint8), requires_grad=False)
layer.w2_weight = torch.nn.Parameter(torch.empty(
num_experts, hidden_size, intermediate_size // 2,
device='cpu', dtype=torch.uint8), requires_grad=False)
layer.w13_weight_scale = None
layer.w2_weight_scale = None
layer.w13_weight_scale_2 = None
layer.w2_weight_scale_2 = None
if hasattr(layer, 'w13_input_scale'):
layer.w13_input_scale = None
if hasattr(layer, 'w2_input_scale'):
layer.w2_input_scale = None
# Create the CuTeDSL runner
self._runner = CuTeDSLMoERunner(