[Feature] EPLB on Qwen3VLMoe and CompressedTensorsWNA16MoEMethod (#28849)

JartX
2025-11-20 00:30:08 +01:00
committed by GitHub
parent 0075bfffd4
commit 8e38e99829
2 changed files with 82 additions and 7 deletions
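This commit replaces the previous NotImplementedError path in CompressedTensorsWNA16MoEMethod.apply with explicit validation of the EPLB inputs, forwards them to expert selection, and advertises the capability through a new supports_eplb property. For orientation, below is a minimal, hypothetical sketch of the three EPLB metadata tensors the method now validates; only the argument names come from the diff, while the shapes and dtypes are assumptions for a toy layer with 4 logical experts where expert 2 is replicated onto a second physical slot.

import torch

num_logical_experts = 4
num_physical_experts = 5  # one extra slot holds the replica of logical expert 2
max_replicas = 2          # assumed padding width of the mapping table

# Per-physical-expert load counter that EPLB routing updates (assumed dtype).
expert_load_view = torch.zeros(num_physical_experts, dtype=torch.int64)

# Logical expert -> physical slot(s); -1 pads unused replica entries (assumed layout).
logical_to_physical_map = torch.full((num_logical_experts, max_replicas), -1, dtype=torch.int64)
logical_to_physical_map[0, 0] = 0
logical_to_physical_map[1, 0] = 1
logical_to_physical_map[2, 0] = 2
logical_to_physical_map[2, 1] = 4  # replica of logical expert 2
logical_to_physical_map[3, 0] = 3

# Number of physical replicas backing each logical expert.
logical_replica_count = torch.tensor([1, 1, 2, 1], dtype=torch.int64)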


@@ -1921,9 +1921,20 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
if enable_eplb:
raise NotImplementedError(
"EPLB not supported for `CompressedTensorsWNA16MoEMethod` yet."
)
if expert_load_view is None:
raise ValueError("enable_eplb=True requiere expert_load_view != None")
if logical_to_physical_map is None:
raise ValueError(
"enable_eplb=True requiere logical_to_physical_map != None"
)
if logical_replica_count is None:
raise ValueError(
"enable_eplb=True requiere logical_replica_count != None"
)
if not isinstance(layer, FusedMoE):
raise TypeError(
"EPLB is only supported when `layer` is a instance of FusedMoE."
)
from vllm.model_executor.layers.fused_moe import fused_experts
@@ -1940,6 +1951,12 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
routed_scaling_factor=routed_scaling_factor,
e_score_correction_bias=e_score_correction_bias,
indices_type=self.topk_indices_dtype,
num_fused_shared_experts=getattr(layer, "num_fused_shared_experts", 0),
enable_eplb=enable_eplb,
expert_map=expert_map,
expert_load_view=expert_load_view,
logical_to_physical_map=logical_to_physical_map,
logical_replica_count=logical_replica_count,
)
return fused_experts(
@@ -1956,6 +1973,10 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
quant_config=self.moe_quant_config,
)
@property
def supports_eplb(self) -> bool:
return True
class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod):
"""