[Quantization] Enable BNB support for InternS1 (#21953)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
This commit is contained in:
Jee Jee Li
2025-08-01 19:09:54 +08:00
committed by GitHub
parent 4931486988
commit 28b18cc741
2 changed files with 43 additions and 16 deletions

View File

@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Utils for model executor."""
import copy
from typing import Any, Optional
@@ -9,6 +10,7 @@ import torch
def set_random_seed(seed: int) -> None:
from vllm.platforms import current_platform
current_platform.seed_everything(seed)
@@ -29,7 +31,7 @@ def set_weight_attrs(
return
for key, value in weight_attrs.items():
assert not hasattr(
weight, key), (f"Overwriting existing tensor attribute: {key}")
weight, key), f"Overwriting existing tensor attribute: {key}"
# NOTE(woosuk): During weight loading, we often do something like:
# narrowed_tensor = param.data.narrow(0, offset, len)
@@ -41,6 +43,7 @@ def set_weight_attrs(
# we sync the param tensor after its weight loader is called.
# TODO(woosuk): Remove this hack once we have a better solution.
from vllm.platforms import current_platform
if current_platform.is_tpu() and key == "weight_loader":
value = _make_synced_weight_loader(value)
setattr(weight, key, value)
@@ -77,4 +80,17 @@ def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]:
f"safely because of conflicts from {type(child).__name__}.")
else:
parent_map.update(child_map)
return parent_map
return parent_map
def get_moe_expert_mapping(
        model: torch.nn.Module, ) -> list[tuple[str, str, int, str]]:
    """Return the MoE expert mapping exposed by *model*, if any.

    The model's own ``get_expert_mapping`` hook takes precedence when
    present. Otherwise the direct children (main components only, not
    the full submodule tree) are scanned and the first child providing
    the hook wins. An empty list signals that no mapping was found.
    """
    own_hook = getattr(model, "get_expert_mapping", None)
    if own_hook:
        return own_hook()
    # Fall back to the top-level components; we deliberately do not
    # recurse into the whole submodule hierarchy.
    for component in model.children():
        child_hook = getattr(component, "get_expert_mapping", None)
        if child_hook is not None:
            return child_hook()
    return []