Add support for LoRA adapters in Nemotron-H models (#30802)
Signed-off-by: Daniel Serebrenik <daserebrenik@nvidia.com>
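For context on what this enables, the sketch below shows how a LoRA adapter is typically attached through vLLM's offline `LLM` API. The model id, adapter path, adapter name, and rank are placeholders and assumptions, not values from this commit.

```python
# Minimal sketch (not part of this commit): serving a Nemotron-H checkpoint
# with a LoRA adapter via vLLM's offline API. Paths and names are placeholders.
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

llm = LLM(
    model="nvidia/Nemotron-H-8B-Base-8K",  # placeholder model id
    enable_lora=True,                      # turn on the LoRA code path
    max_lora_rank=32,                      # assumption; match the adapter's rank
)

outputs = llm.generate(
    ["Explain state space models in one sentence."],
    SamplingParams(max_tokens=64),
    # Adapter name, integer id, and local path are all hypothetical.
    lora_request=LoRARequest("my_adapter", 1, "/path/to/nemotron_h_lora"),
)
print(outputs[0].outputs[0].text)
```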
@@ -227,6 +227,11 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
                 use_deepseek_fp8_block_scale=self.use_deepseek_fp8_block_scale,
             )
 
+    def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
+        # No support for LoRA in flashinfer_cutlass_fused_moe.
+        # See TODOs in flashinfer functions runMoe and runMoeMinLatency.
+        raise NotImplementedError("LoRA is not supported for flashinfer_cutlass_moe")
+
 
 def flashinfer_cutlass_moe_fp4(
     hidden_states: torch.Tensor,
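Because `moe_sum` now raises when the flashinfer cutlass path is combined with LoRA, a caller has to either catch the error or steer away from this backend up front. The sketch below illustrates that guard pattern only; `select_moe_backend` and the backend names are hypothetical, not vLLM APIs.

```python
# Illustration of the guard pattern only; the function and backend names
# here are hypothetical and do not exist in vLLM.
def select_moe_backend(use_flashinfer_cutlass: bool, lora_enabled: bool) -> str:
    if use_flashinfer_cutlass and lora_enabled:
        # Mirrors the NotImplementedError above: the flashinfer cutlass
        # fused-MoE kernel has no LoRA hooks yet, so fall back.
        return "triton_fused_moe"
    return "flashinfer_cutlass_moe" if use_flashinfer_cutlass else "triton_fused_moe"

print(select_moe_backend(use_flashinfer_cutlass=True, lora_enabled=True))
# -> triton_fused_moe
```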
@@ -376,6 +376,7 @@ class SupportsLoRA(Protocol):
     MRO of your model class.
     """
     is_3d_moe_weight: ClassVar[bool] = False
+    is_non_gated_moe: ClassVar[bool] = False
     # The `embedding_modules` and `embedding_padding_modules`
     # are empty by default.
     embedding_modules: ClassVar[dict[str, str]] = {}
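To show how a flag like `is_non_gated_moe` is consumed, here is a small, self-contained sketch of the `ClassVar` opt-in pattern. The `MiniMoEModel` class, the stand-in protocol, and the wiring function are illustrative assumptions, not code from vLLM.

```python
from typing import ClassVar, Protocol

class SupportsLoRALike(Protocol):
    """Stand-in for the SupportsLoRA protocol above (attribute names match the hunk)."""
    is_3d_moe_weight: ClassVar[bool]
    is_non_gated_moe: ClassVar[bool]

class MiniMoEModel:
    """Hypothetical model that opts in the same way NemotronHForCausalLM does."""
    is_3d_moe_weight: ClassVar[bool] = False
    is_non_gated_moe: ClassVar[bool] = True  # non-gated experts: no separate gate projection

def expert_lora_slices(model: SupportsLoRALike) -> list[str]:
    # Purely illustrative: a non-gated MoE only has up/down projections,
    # so LoRA wiring would skip the gate projection.
    names = ["w_up", "w_down"]
    if not model.is_non_gated_moe:
        names.insert(0, "w_gate")
    return names

print(expert_lora_slices(MiniMoEModel()))  # -> ['w_up', 'w_down']
```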
@@ -747,6 +747,9 @@ class NemotronHForCausalLM(
     MixtureOfExperts,
     SupportsMambaPrefixCaching,
 ):
+    # Relevant only if self.has_moe is True
+    is_non_gated_moe: bool = True
+
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={"backbone": "model"},
         orig_to_new_substr={"A_log": "A", "embeddings": "embed_tokens"},
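The practical effect of the `hf_to_vllm_mapper` entries is a checkpoint-name rewrite. The standalone helper below (not vLLM's `WeightsMapper`, just its observable effect) reproduces that rewrite for a couple of representative parameter names, which are themselves illustrative.

```python
# Standalone re-implementation of the rename rules declared above
# (this is not vLLM's WeightsMapper; it only mimics the declared mapping).
def map_checkpoint_name(name: str) -> str:
    prefix_map = {"backbone": "model"}
    substr_map = {"A_log": "A", "embeddings": "embed_tokens"}
    for old, new in prefix_map.items():
        if name.startswith(old):
            name = new + name[len(old):]
    for old, new in substr_map.items():
        name = name.replace(old, new)
    return name

# Parameter names below are illustrative, not copied from the checkpoint.
print(map_checkpoint_name("backbone.embeddings.weight"))
# -> model.embed_tokens.weight
print(map_checkpoint_name("backbone.layers.0.mixer.A_log"))
# -> model.layers.0.mixer.A
```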