[Model] Apply shared experts overlap optimization to all models with shared experts (#26145)

Signed-off-by: Bill Nell <bnell@redhat.com>
bnellnm authored 2025-10-09 11:31:04 -04:00, committed by GitHub
parent 3b736e1c38
commit 47e66c24e2
15 changed files with 285 additions and 297 deletions
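
The diff below swaps FusedMoE for SharedFusedMoE at the model level. For orientation, here is a minimal sketch of what that wiring looks like in a decoder layer: the shared expert is handed to SharedFusedMoE so its computation can overlap with the routed-expert dispatch/combine. The constructor keywords mirror the common FusedMoE arguments, but the exact SharedFusedMoE signature and the order of its returned outputs are assumptions here, and build_shared_mlp is a hypothetical helper; check the vLLM source for the real interface.

    import torch
    from torch import nn

    from vllm.model_executor.layers.fused_moe import SharedFusedMoE


    class MoEBlockSketch(nn.Module):
        """Illustrative wiring of a shared expert into SharedFusedMoE so the
        shared-expert work can overlap with routed-expert dispatch/combine."""

        def __init__(self, config, quant_config=None, prefix: str = ""):
            super().__init__()
            # The model builds its shared expert as usual (a small MLP);
            # build_shared_mlp is a hypothetical helper standing in for that.
            self.shared_expert = build_shared_mlp(config)
            # Instead of FusedMoE, pass the shared expert to SharedFusedMoE.
            # Keyword names below mirror common FusedMoE arguments; the exact
            # SharedFusedMoE signature should be checked against vLLM.
            self.experts = SharedFusedMoE(
                shared_experts=self.shared_expert,
                num_experts=config.num_local_experts,
                top_k=config.num_experts_per_tok,
                hidden_size=config.hidden_size,
                intermediate_size=config.intermediate_size,
                reduce_results=True,
                quant_config=quant_config,
                prefix=f"{prefix}.experts",
            )

        def forward(self, hidden_states: torch.Tensor,
                    router_logits: torch.Tensor) -> torch.Tensor:
            # SharedFusedMoE is treated here as returning the shared-expert and
            # routed-expert outputs together; the (shared, routed) tuple order
            # is an assumption, not taken from this diff.
            shared_out, routed_out = self.experts(hidden_states, router_logits)
            return shared_out + routed_out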


@@ -33,7 +33,7 @@ from vllm.distributed import (
get_tensor_model_parallel_world_size,
tensor_model_parallel_all_gather,
)
-from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (
QKVParallelLinear,
@@ -42,7 +42,6 @@ from vllm.model_executor.layers.linear import (
)
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE
from vllm.model_executor.model_loader.weight_utils import (
default_weight_loader,
maybe_remap_kv_scale_name,
@@ -399,7 +398,7 @@ class Llama4Model(LlamaModel):
params_dict: The dictionary of module parameters.
loaded_params: The set of already loaded parameters.
expert_params_mapping: The mapping of expert parameters. Must be
-generated by FusedMoE.make_expert_params_mapping().
+generated by SharedFusedMoE.make_expert_params_mapping().
fused: Whether the expert weights are fused into a single weight
tensor or are separate weight tensors for each expert.
When fused is True, loaded_weight should have shape of:
@@ -522,7 +521,7 @@ class Llama4Model(LlamaModel):
fused_experts_params = False
# Expert parameter mapping for the case where the expert weights are
# not fused into a single weight tensor.
-expert_params_mapping = FusedMoE.make_expert_params_mapping(
+expert_params_mapping = SharedFusedMoE.make_expert_params_mapping(
ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj",
@@ -530,7 +529,7 @@ class Llama4Model(LlamaModel):
)
# Expert parameter mapping for the case where the expert weights are
# fused into a single weight tensor.
-expert_params_mapping_fused = FusedMoE.make_expert_params_mapping(
+expert_params_mapping_fused = SharedFusedMoE.make_expert_params_mapping(
ckpt_gate_proj_name="gate_up_proj",
ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="gate_up_proj",
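
For orientation, this is roughly how the mapping produced by make_expert_params_mapping() is consumed in a model's load_weights() loop: each entry pairs a target parameter-name prefix with a checkpoint weight-name fragment plus the expert and shard ids. The loop below is a simplified sketch of that common vLLM pattern, not the exact Llama4 code; weights, params_dict, and expert_params_mapping are assumed to come from the surrounding load_weights() context.

    # Simplified sketch of the usual consumption pattern; `weights`,
    # `params_dict`, and `expert_params_mapping` are assumed to come from
    # the surrounding load_weights() implementation.
    for name, loaded_weight in weights:
        for param_name, weight_name, expert_id, shard_id in expert_params_mapping:
            if weight_name not in name:
                continue
            # Map the per-expert checkpoint name onto the fused parameter name,
            # e.g. "...experts.3.gate_proj.weight" -> "...experts.w13_weight".
            mapped_name = name.replace(weight_name, param_name)
            param = params_dict[mapped_name]
            # The parameter's weight_loader scatters this expert's shard into
            # the fused tensor at the given expert_id / shard_id.
            param.weight_loader(
                param,
                loaded_weight,
                mapped_name,
                shard_id=shard_id,
                expert_id=expert_id,
            )
            break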