Add support for LoRA adapters in Nemotron-H models (#30802)
Signed-off-by: Daniel Serebrenik <daserebrenik@nvidia.com>
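For context on what this enables, the sketch below shows how a LoRA adapter is typically attached through vLLM's offline `LLM` API. The model id, adapter path, adapter name, and rank are placeholders and assumptions, not values from this commit.

```python
# Minimal sketch (not part of this commit): serving a Nemotron-H checkpoint
# with a LoRA adapter via vLLM's offline API. Paths and names are placeholders.
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

llm = LLM(
    model="nvidia/Nemotron-H-8B-Base-8K",  # placeholder model id
    enable_lora=True,                      # turn on the LoRA code path
    max_lora_rank=32,                      # assumption; match the adapter's rank
)

outputs = llm.generate(
    ["Explain state space models in one sentence."],
    SamplingParams(max_tokens=64),
    # Adapter name, integer id, and local path are all hypothetical.
    lora_request=LoRARequest("my_adapter", 1, "/path/to/nemotron_h_lora"),
)
print(outputs[0].outputs[0].text)
```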
@@ -227,6 +227,11 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
                 use_deepseek_fp8_block_scale=self.use_deepseek_fp8_block_scale,
             )
 
+    def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
+        # No support for LoRA in flashinfer_cutlass_fused_moe.
+        # See TODOs in flashinfer functions runMoe and runMoeMinLatency.
+        raise NotImplementedError("LoRA is not supported for flashinfer_cutlass_moe")
+
 
 def flashinfer_cutlass_moe_fp4(
     hidden_states: torch.Tensor,
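Because `moe_sum` now raises when the flashinfer cutlass path is combined with LoRA, a caller has to either catch the error or steer away from this backend up front. The sketch below illustrates that guard pattern only; `select_moe_backend` and the backend names are hypothetical, not vLLM APIs.

```python
# Illustration of the guard pattern only; the function and backend names
# here are hypothetical and do not exist in vLLM.
def select_moe_backend(use_flashinfer_cutlass: bool, lora_enabled: bool) -> str:
    if use_flashinfer_cutlass and lora_enabled:
        # Mirrors the NotImplementedError above: the flashinfer cutlass
        # fused-MoE kernel has no LoRA hooks yet, so fall back.
        return "triton_fused_moe"
    return "flashinfer_cutlass_moe" if use_flashinfer_cutlass else "triton_fused_moe"

print(select_moe_backend(use_flashinfer_cutlass=True, lora_enabled=True))
# -> triton_fused_moe
```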
@@ -376,6 +376,7 @@ class SupportsLoRA(Protocol):
     MRO of your model class.
     """
     is_3d_moe_weight: ClassVar[bool] = False
+    is_non_gated_moe: ClassVar[bool] = False
     # The `embedding_modules` and `embedding_padding_modules`
     # are empty by default.
     embedding_modules: ClassVar[dict[str, str]] = {}
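To show how a flag like `is_non_gated_moe` is consumed, here is a small, self-contained sketch of the `ClassVar` opt-in pattern. The `MiniMoEModel` class, the stand-in protocol, and the wiring function are illustrative assumptions, not code from vLLM.

```python
from typing import ClassVar, Protocol

class SupportsLoRALike(Protocol):
    """Stand-in for the SupportsLoRA protocol above (attribute names match the hunk)."""
    is_3d_moe_weight: ClassVar[bool]
    is_non_gated_moe: ClassVar[bool]

class MiniMoEModel:
    """Hypothetical model that opts in the same way NemotronHForCausalLM does."""
    is_3d_moe_weight: ClassVar[bool] = False
    is_non_gated_moe: ClassVar[bool] = True  # non-gated experts: no separate gate projection

def expert_lora_slices(model: SupportsLoRALike) -> list[str]:
    # Purely illustrative: a non-gated MoE only has up/down projections,
    # so LoRA wiring would skip the gate projection.
    names = ["w_up", "w_down"]
    if not model.is_non_gated_moe:
        names.insert(0, "w_gate")
    return names

print(expert_lora_slices(MiniMoEModel()))  # -> ['w_up', 'w_down']
```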
@@ -747,6 +747,9 @@ class NemotronHForCausalLM(
     MixtureOfExperts,
     SupportsMambaPrefixCaching,
 ):
+    # Relevant only if self.has_moe is True
+    is_non_gated_moe: bool = True
+
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={"backbone": "model"},
         orig_to_new_substr={"A_log": "A", "embeddings": "embed_tokens"},
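The practical effect of the `hf_to_vllm_mapper` entries is a checkpoint-name rewrite. The standalone helper below (not vLLM's `WeightsMapper`, just its observable effect) reproduces that rewrite for a couple of representative parameter names, which are themselves illustrative.

```python
# Standalone re-implementation of the rename rules declared above
# (this is not vLLM's WeightsMapper; it only mimics the declared mapping).
def map_checkpoint_name(name: str) -> str:
    prefix_map = {"backbone": "model"}
    substr_map = {"A_log": "A", "embeddings": "embed_tokens"}
    for old, new in prefix_map.items():
        if name.startswith(old):
            name = new + name[len(old):]
    for old, new in substr_map.items():
        name = name.replace(old, new)
    return name

# Parameter names below are illustrative, not copied from the checkpoint.
print(map_checkpoint_name("backbone.embeddings.weight"))
# -> model.embed_tokens.weight
print(map_checkpoint_name("backbone.layers.0.mixer.A_log"))
# -> model.layers.0.mixer.A
```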