[Model] Add MoE support for NemotronH (#25863)
Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com>
@@ -411,11 +411,15 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         params_dtype: torch.dtype,
         **extra_weight_attrs,
     ):
+        if self.moe.is_act_and_mul:
+            w13_up_dim = 2 * intermediate_size_per_partition
+        else:
+            w13_up_dim = intermediate_size_per_partition
         # Fused gate_up_proj (column parallel)
         w13_weight = torch.nn.Parameter(
             torch.empty(
                 num_experts,
-                2 * intermediate_size_per_partition,
+                w13_up_dim,
                 hidden_size,
                 dtype=params_dtype,
             ),
@@ -425,9 +429,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         set_weight_attrs(w13_weight, extra_weight_attrs)
         if self.moe.has_bias:
             w13_bias = torch.nn.Parameter(
-                torch.zeros(
-                    num_experts, 2 * intermediate_size_per_partition, dtype=params_dtype
-                ),
+                torch.zeros(num_experts, w13_up_dim, dtype=params_dtype),
                 requires_grad=False,
             )
             layer.register_parameter("w13_bias", w13_bias)
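For context, the shape change above can be illustrated outside vLLM: when `is_act_and_mul` is true, the fused w13 weight packs the gate and up projections (hence the `2 *` factor) and the activation gates the up branch; when false, w13 holds only the up projection and the activation is applied directly. A minimal PyTorch sketch with illustrative names and sizes (not vLLM code):

```python
import torch
import torch.nn.functional as F

# Illustrative sizes only.
hidden_size, intermediate_size = 16, 32
x = torch.randn(4, hidden_size)

# is_act_and_mul=True: w13 packs gate_proj and up_proj -> 2 * intermediate rows,
# and the activation gates the up projection (e.g. SiLU-and-mul).
w13_gated = torch.randn(2 * intermediate_size, hidden_size)
gate, up = torch.split(x @ w13_gated.t(), intermediate_size, dim=-1)
h_gated = F.silu(gate) * up

# is_act_and_mul=False (NemotronH-style): w13 holds only up_proj -> intermediate rows,
# and the activation is applied directly (squared ReLU is assumed here for illustration).
w13_plain = torch.randn(intermediate_size, hidden_size)
h_plain = F.relu(x @ w13_plain.t()) ** 2

print(h_gated.shape, h_plain.shape)  # both (4, intermediate_size)
```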
@@ -1073,6 +1075,7 @@ class FusedMoE(CustomOp):
         e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
+        is_act_and_mul: bool = True,
         enable_eplb: bool = False,
         num_redundant_experts: int = 0,
         has_bias: bool = False,
@@ -1263,6 +1266,7 @@ class FusedMoE(CustomOp):
             in_dtype=moe_in_dtype,
             max_num_tokens=envs.VLLM_MOE_DP_CHUNK_SIZE,
             has_bias=has_bias,
+            is_act_and_mul=is_act_and_mul,
         )
         self.moe_config = moe
         self.moe_quant_config: FusedMoEQuantConfig | None = None
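A hedged usage sketch of how a NemotronH-style model definition might pass the new flag through the `FusedMoE` constructor. The parameter values and the `"relu2"` activation string are assumptions for illustration and are not taken from this PR:

```python
from vllm.model_executor.layers.fused_moe import FusedMoE

# Hypothetical layer construction inside a model's MoE block.
experts = FusedMoE(
    num_experts=8,
    top_k=2,
    hidden_size=4096,
    intermediate_size=2048,
    reduce_results=True,
    renormalize=False,
    is_act_and_mul=False,  # w13 holds only up_proj; no gating branch
    activation="relu2",    # assumed non-gated activation (Nemotron-H uses squared ReLU)
    quant_config=None,
    prefix="mixer.experts",
)
```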
@@ -1283,6 +1287,24 @@ class FusedMoE(CustomOp):
         assert isinstance(quant_method, FusedMoEMethodBase)
         self.quant_method = quant_method
 
+        if not self.moe_config.is_act_and_mul:
+            # Avoid circular import
+            from vllm.model_executor.layers.quantization.modelopt import (
+                ModelOptFp8MoEMethod,
+            )
+
+            if not isinstance(
+                quant_method, (UnquantizedFusedMoEMethod, ModelOptFp8MoEMethod)
+            ):
+                raise NotImplementedError(
+                    "is_act_and_mul=False is supported only for unquantized "
+                    "and ModelOpt FP8 moe for now"
+                )
+            if not current_platform.is_cuda():
+                raise NotImplementedError(
+                    "is_act_and_mul=False is supported only for CUDA for now"
+                )
+
         if self.enable_eplb:
             from vllm.model_executor.layers.quantization.fp8 import Fp8MoEMethod
 
@@ -1531,7 +1553,10 @@ class FusedMoE(CustomOp):
     ):
         # Index the loaded weight for tp sharding.
         # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim
-        shard_size = expert_data.shape[shard_dim] // 2
+        if self.moe_config.is_act_and_mul:
+            shard_size = expert_data.shape[shard_dim] // 2
+        else:
+            shard_size = expert_data.shape[shard_dim]
         if not load_full:
             loaded_weight = loaded_weight.narrow(
                 shard_dim, shard_size * tp_rank, shard_size
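The tensor-parallel sharding change can likewise be sketched in isolation: when w13 packs two projections, each projection occupies half of the per-rank buffer, so the checkpoint weight is narrowed by half the buffer dimension; when it packs only the up projection, the full buffer dimension is the shard size. A standalone sketch with assumed shapes (not vLLM code):

```python
import torch

# Assumed toy sizes.
intermediate_size, hidden_size, tp_size = 8, 4, 2

for is_act_and_mul in (True, False):
    # Per-rank buffer for one expert: 2 * (intermediate // tp) rows when gate and
    # up projections are fused into w13, (intermediate // tp) rows otherwise.
    per_partition = intermediate_size // tp_size
    rows = (2 if is_act_and_mul else 1) * per_partition
    expert_data = torch.empty(rows, hidden_size)

    # Checkpoint tensor for a single projection (gate OR up): full intermediate rows.
    loaded_weight = torch.randn(intermediate_size, hidden_size)

    shard_dim = 0
    # Gated: the buffer holds two projections, so each one occupies half of it.
    if is_act_and_mul:
        shard_size = expert_data.shape[shard_dim] // 2
    else:
        shard_size = expert_data.shape[shard_dim]

    for tp_rank in range(tp_size):
        tp_slice = loaded_weight.narrow(shard_dim, shard_size * tp_rank, shard_size)
        # Each rank ends up with its (intermediate // tp)-row slice of the projection,
        # which would then be copied into the matching region of expert_data.
        assert tp_slice.shape == (per_partition, hidden_size)
```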