[Model] Apply shared experts overlap optimization to all models with shared experts (#26145)
Signed-off-by: Bill Nell <bnell@redhat.com>
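For context, the change swaps Llama4's routed-expert layer from FusedMoE to SharedFusedMoE, which also owns the shared-expert MLP so that computation can be overlapped with the routed experts' dispatch and combine. A minimal sketch of the pattern follows; the constructor arguments beyond shared_experts and the tuple-valued forward output are assumptions based on FusedMoE's interface, not the exact Llama4 code:

import torch
from torch import nn

from vllm.model_executor.layers.fused_moe import SharedFusedMoE


class MoEBlockSketch(nn.Module):
    # Hypothetical MoE block showing the shared-experts overlap pattern.

    def __init__(self, hidden_size: int, intermediate_size: int,
                 num_experts: int, top_k: int):
        super().__init__()
        self.router = nn.Linear(hidden_size, num_experts, bias=False)
        # Stand-in shared-expert MLP; real models use their own MLP class.
        shared_experts = nn.Sequential(
            nn.Linear(hidden_size, intermediate_size, bias=False),
            nn.SiLU(),
            nn.Linear(intermediate_size, hidden_size, bias=False),
        )
        # SharedFusedMoE owns both the routed experts and the shared
        # experts, letting the backend overlap the two computations.
        self.experts = SharedFusedMoE(
            shared_experts=shared_experts,
            num_experts=num_experts,
            top_k=top_k,
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        router_logits = self.router(hidden_states)
        # Assumed to return (shared_output, routed_output) when shared
        # experts are attached.
        shared_out, routed_out = self.experts(hidden_states, router_logits)
        return shared_out + routed_out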
@@ -33,7 +33,7 @@ from vllm.distributed import (
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_gather,
 )
-from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     QKVParallelLinear,
@@ -42,7 +42,6 @@ from vllm.model_executor.layers.linear import (
 )
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader,
     maybe_remap_kv_scale_name,
@@ -399,7 +398,7 @@ class Llama4Model(LlamaModel):
             params_dict: The dictionary of module parameters.
             loaded_params: The set of already loaded parameters.
             expert_params_mapping: The mapping of expert parameters. Must be
-                generated by FusedMoE.make_expert_params_mapping().
+                generated by SharedFusedMoE.make_expert_params_mapping().
             fused: Whether the expert weights are fused into a single weight
                 tensor or are separate weight tensors for each expert.
                 When fused is True, loaded_weight should have shape of:
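The mapping this docstring refers to is a list of tuples that translate per-expert checkpoint weight names into the layer's stacked parameters. A rough illustration for two experts with the unfused checkpoint names gate_proj/down_proj/up_proj; the exact prefixes and shard ids are assumptions based on FusedMoE's w13/w2 naming conventions:

# Hypothetical output of SharedFusedMoE.make_expert_params_mapping() for
# num_experts=2; each entry is assumed to be
# (param_name_prefix, checkpoint_weight_name, expert_id, shard_id).
expert_params_mapping = [
    ("experts.w13_", "experts.0.gate_proj.", 0, "w1"),
    ("experts.w2_", "experts.0.down_proj.", 0, "w2"),
    ("experts.w13_", "experts.0.up_proj.", 0, "w3"),
    ("experts.w13_", "experts.1.gate_proj.", 1, "w1"),
    ("experts.w2_", "experts.1.down_proj.", 1, "w2"),
    ("experts.w13_", "experts.1.up_proj.", 1, "w3"),
]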
@@ -522,7 +521,7 @@ class Llama4Model(LlamaModel):
         fused_experts_params = False
         # Expert parameter mapping for the case where the expert weights are
         # not fused into a single weight tensor.
-        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+        expert_params_mapping = SharedFusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
@@ -530,7 +529,7 @@ class Llama4Model(LlamaModel):
         )
         # Expert parameter mapping for the case where the expert weights are
         # fused into a single weight tensor.
-        expert_params_mapping_fused = FusedMoE.make_expert_params_mapping(
+        expert_params_mapping_fused = SharedFusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="gate_up_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="gate_up_proj",
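For reference, load_weights() consumes these mappings along the following lines. This is a simplified sketch of the common vLLM expert-loading loop, not the exact Llama4 code, which adds handling for fused checkpoints, stacked parameters, and KV-scale remapping:

# Simplified sketch: rewrite each per-expert checkpoint name to its
# stacked parameter name, then delegate to that parameter's weight_loader.
for name, loaded_weight in weights:
    for param_name, weight_name, expert_id, shard_id in expert_params_mapping:
        if weight_name not in name:
            continue
        mapped_name = name.replace(weight_name, param_name)
        param = params_dict[mapped_name]
        param.weight_loader(
            param,
            loaded_weight,
            mapped_name,
            shard_id=shard_id,
            expert_id=expert_id,
        )
        break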