[BugFix] add select_gemm_impl on CompressedTensorsWNA16MoEMethod to support LoRA (#31453)
Signed-off-by: JartX <sagformas@epdcenter.es>
This commit is contained in:
@@ -1996,6 +1996,29 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
|
||||
block_shape=[0, self.group_size],
|
||||
)
|
||||
|
||||
def select_gemm_impl(
    self,
    prepare_finalize: mk.FusedMoEPrepareAndFinalize,
    layer: torch.nn.Module,
) -> mk.FusedMoEPermuteExpertsUnpermute:
    """Select the expert GEMM implementation for a LoRA-enabled MoE layer.

    Only the LoRA-enabled configuration is handled here; in that case a
    ``TritonExperts`` instance built from ``self.moe_quant_config`` is
    returned.

    Args:
        prepare_finalize: Not used by this implementation.
        layer: Module holding ``w13_weight_packed`` and ``w2_weight_packed``.
            As a side effect these are also bound to ``layer.w13_weight``
            and ``layer.w2_weight``.

    Returns:
        A ``TritonExperts`` instance configured with ``self.moe_quant_config``.

    Raises:
        NotImplementedError: If LoRA is not enabled for this MoE method, or
            if LoRA is enabled but Triton is not available.
        AssertionError: If LoRA is enabled and ``self.moe_quant_config`` is
            ``None``.
    """
    if not self.moe.is_lora_enabled:
        # Nothing but the LoRA path is implemented; say so explicitly
        # instead of raising an empty NotImplementedError.
        raise NotImplementedError(
            f"{type(self).__name__}.select_gemm_impl is only implemented "
            "for LoRA-enabled MoE layers."
        )

    # Internal invariant: the quant config must exist before experts are built.
    assert self.moe_quant_config is not None

    # Imported lazily, as in the original code.
    from vllm.triton_utils import HAS_TRITON

    if not HAS_TRITON:
        raise NotImplementedError(
            "TritonExperts requires Triton. "
            "Install triton or disable LoRA for MoE."
        )

    from vllm.model_executor.layers.fused_moe import TritonExperts

    # Expose the packed weights under the plain attribute names.
    # NOTE(review): presumably the Triton/LoRA path looks the weights up as
    # `w13_weight` / `w2_weight` — confirm against TritonExperts' callers.
    layer.w13_weight = layer.w13_weight_packed
    layer.w2_weight = layer.w2_weight_packed
    return TritonExperts(quant_config=self.moe_quant_config)
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: FusedMoE,
|
||||
|
||||
Reference in New Issue
Block a user