[Quantization][ROCm] Fix MoE weight loading to be robust (Qwen3_MoE/Qwen3_next as example models) (#33173)

Signed-off-by: xuebwang-amd <xuebwang@amd.com>
commit f451b4558b (parent 3f96fcf646)
Author: xuebwang-amd <xuebwang@amd.com>
Date:   2026-01-31 01:50:23 +08:00
Committed via GitHub

vllm/model_executor/layers/fused_moe/layer.py Normal file → Executable file

@@ -996,7 +996,9 @@ class FusedMoE(CustomOp):
             shard_size = expert_data.shape[shard_dim] // 2
         else:
             shard_size = expert_data.shape[shard_dim]
-        if not load_full:
+        # Only narrow if the loaded_weight is not a scalar (0-dim tensor)
+        # and we're not loading the full weight
+        if not load_full and loaded_weight.ndim > 0:
             loaded_weight = loaded_weight.narrow(
                 shard_dim, shard_size * tp_rank, shard_size
             )
@@ -1022,7 +1024,9 @@ class FusedMoE(CustomOp):
         # down_proj: "RowParallel" so tp sharding on input_dim
         # Narrow parameter and load.
         shard_size = expert_data.shape[shard_dim]
-        if not load_full:
+        # Only narrow if the loaded_weight is not a scalar (0-dim tensor)
+        # and we're not loading the full weight
+        if not load_full and loaded_weight.ndim > 0:
             loaded_weight = loaded_weight.narrow(
                 shard_dim, shard_size * tp_rank, shard_size
             )
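
For context on the failure mode the guard addresses: per-tensor quantization scales can be stored as 0-dim (scalar) tensors, and torch.Tensor.narrow raises an IndexError when called on a tensor with no dimensions. Below is a minimal, self-contained sketch of the fixed condition; maybe_narrow is a hypothetical helper for illustration only, not part of vLLM's API.

import torch

def maybe_narrow(loaded_weight, shard_dim, shard_size, tp_rank, load_full=False):
    # Mirrors the fixed condition: only narrow when we are not loading the
    # full weight AND the tensor actually has a dimension to narrow along.
    # A 0-dim per-tensor scale would make narrow() raise, so it passes through.
    if not load_full and loaded_weight.ndim > 0:
        loaded_weight = loaded_weight.narrow(
            shard_dim, shard_size * tp_rank, shard_size
        )
    return loaded_weight

weight = torch.randn(8, 16)  # sharded expert weight: narrowed per TP rank
scale = torch.tensor(1.0)    # 0-dim per-tensor scale: must not be narrowed

print(maybe_narrow(weight, shard_dim=0, shard_size=4, tp_rank=1).shape)  # torch.Size([4, 16])
print(maybe_narrow(scale, shard_dim=0, shard_size=4, tp_rank=1))         # tensor(1.)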