From f451b4558b2bb42dafcdd19f7b5c0fc58194af5a Mon Sep 17 00:00:00 2001
From: xuebwang-amd <xuebwang@amd.com>
Date: Sat, 31 Jan 2026 01:50:23 +0800
Subject: [PATCH] [Quantization][ROCm] Fix MoE weight loading to be robust
 (Qwen3_MoE/Qwen3_next as example models) (#33173)

Signed-off-by: xuebwang-amd <xuebwang@amd.com>
---
 vllm/model_executor/layers/fused_moe/layer.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)
 mode change 100644 => 100755 vllm/model_executor/layers/fused_moe/layer.py

diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
old mode 100644
new mode 100755
index 538089882..d095e4074
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -996,7 +996,9 @@ class FusedMoE(CustomOp):
             shard_size = expert_data.shape[shard_dim] // 2
         else:
             shard_size = expert_data.shape[shard_dim]
-        if not load_full:
+        # Narrow only sharded loads of tensors that have at least one dim:
+        # Tensor.narrow() cannot be applied to a 0-dim (scalar) tensor.
+        if not load_full and loaded_weight.ndim > 0:
             loaded_weight = loaded_weight.narrow(
                 shard_dim, shard_size * tp_rank, shard_size
             )
@@ -1022,7 +1024,9 @@ class FusedMoE(CustomOp):
         # down_proj: "RowParallel" so tp sharding on input_dim
         # Narrow parameter and load.
         shard_size = expert_data.shape[shard_dim]
-        if not load_full:
+        # Narrow only sharded loads of tensors that have at least one dim:
+        # Tensor.narrow() cannot be applied to a 0-dim (scalar) tensor.
+        if not load_full and loaded_weight.ndim > 0:
             loaded_weight = loaded_weight.narrow(
                 shard_dim, shard_size * tp_rank, shard_size
             )