[BugFix] add select_gemm_impl on CompressedTensorsWNA16MoEMethod to support LoRA (#31453)

Signed-off-by: JartX <sagformas@epdcenter.es>
This commit is contained in:
JartX
2025-12-30 20:20:15 +01:00
committed by GitHub
parent 3f52fa5aa2
commit 07728bf5cd

View File

@@ -1996,6 +1996,29 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
block_shape=[0, self.group_size],
)
def select_gemm_impl(
    self,
    prepare_finalize: mk.FusedMoEPrepareAndFinalize,
    layer: torch.nn.Module,
) -> mk.FusedMoEPermuteExpertsUnpermute:
    """Select the experts implementation; only the LoRA path is handled.

    When ``self.moe.is_lora_enabled`` is true and Triton is available, the
    layer's packed weights are re-exposed as ``w13_weight`` / ``w2_weight``
    and a ``TritonExperts`` built from ``self.moe_quant_config`` is returned.

    Args:
        prepare_finalize: Not consulted by this implementation.
        layer: Module carrying ``w13_weight_packed`` and
            ``w2_weight_packed``; mutated in place on the success path.

    Returns:
        A ``TritonExperts`` configured with ``self.moe_quant_config``.

    Raises:
        NotImplementedError: If LoRA is not enabled, or if LoRA is enabled
            but Triton is not available.
        AssertionError: If LoRA is enabled while ``self.moe_quant_config``
            is ``None``.
    """
    # Nothing is provided for the non-LoRA case.
    if not self.moe.is_lora_enabled:
        raise NotImplementedError

    assert self.moe_quant_config is not None
    # Imported lazily, and only once the LoRA path is known to be taken.
    from vllm.triton_utils import HAS_TRITON

    if not HAS_TRITON:
        raise NotImplementedError(
            "TritonExperts requires Triton. "
            "Install triton or disable LoRA for MoE."
        )

    from vllm.model_executor.layers.fused_moe import TritonExperts

    # Alias the packed tensors under the un-suffixed attribute names
    # (presumably the names TritonExperts / the LoRA path reads -- confirm).
    layer.w13_weight = layer.w13_weight_packed
    layer.w2_weight = layer.w2_weight_packed
    return TritonExperts(quant_config=self.moe_quant_config)
def apply(
self,
layer: FusedMoE,