From f451b4558b2bb42dafcdd19f7b5c0fc58194af5a Mon Sep 17 00:00:00 2001 From: xuebwang-amd Date: Sat, 31 Jan 2026 01:50:23 +0800 Subject: [PATCH] [Quantization][ROCm] Fix MoE weight loading to be robust (Qwen3_MoE/Qwen3_next as example models) (#33173) Signed-off-by: xuebwang-amd --- vllm/model_executor/layers/fused_moe/layer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) mode change 100644 => 100755 vllm/model_executor/layers/fused_moe/layer.py diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py old mode 100644 new mode 100755 index 538089882..d095e4074 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -996,7 +996,9 @@ class FusedMoE(CustomOp): shard_size = expert_data.shape[shard_dim] // 2 else: shard_size = expert_data.shape[shard_dim] - if not load_full: + # Only narrow if the loaded_weight is not a scalar (0-dim tensor) + # and we're not loading the full weight + if not load_full and loaded_weight.ndim > 0: loaded_weight = loaded_weight.narrow( shard_dim, shard_size * tp_rank, shard_size ) @@ -1022,7 +1024,9 @@ class FusedMoE(CustomOp): # down_proj: "RowParallel" so tp sharding on input_dim # Narrow parameter and load. shard_size = expert_data.shape[shard_dim] - if not load_full: + # Only narrow if the loaded_weight is not a scalar (0-dim tensor) + # and we're not loading the full weight + if not load_full and loaded_weight.ndim > 0: loaded_weight = loaded_weight.narrow( shard_dim, shard_size * tp_rank, shard_size )