From 07728bf5cd7165972f89e52e8b31ca28576262ec Mon Sep 17 00:00:00 2001
From: JartX
Date: Tue, 30 Dec 2025 20:20:15 +0100
Subject: [PATCH] [BugFix] add select_gemm_impl on CompressedTensorsWNA16MoEMethod to support LoRA (#31453)

Signed-off-by: JartX
---
 .../compressed_tensors_moe.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 9b9a0858a..1094d9d55 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -1996,6 +1996,29 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
             block_shape=[0, self.group_size],
         )
 
+    def select_gemm_impl(
+        self,
+        prepare_finalize: mk.FusedMoEPrepareAndFinalize,
+        layer: torch.nn.Module,
+    ) -> mk.FusedMoEPermuteExpertsUnpermute:
+        """Return TritonExperts when LoRA is on and Triton is present, else raise."""
+        if not self.moe.is_lora_enabled:
+            raise NotImplementedError
+        assert self.moe_quant_config is not None
+        from vllm.triton_utils import HAS_TRITON
+
+        if not HAS_TRITON:
+            raise NotImplementedError(
+                "TritonExperts requires Triton. "
+                "Install triton or disable LoRA for MoE."
+            )
+        from vllm.model_executor.layers.fused_moe import TritonExperts
+
+        # Alias the packed weights under the w13_weight / w2_weight attribute names.
+        layer.w13_weight = layer.w13_weight_packed
+        layer.w2_weight = layer.w2_weight_packed
+        return TritonExperts(quant_config=self.moe_quant_config)
+
     def apply(
         self,
         layer: FusedMoE,