[Quantization][ROCm] Fix MoE weight loading to be robust (Qwen3_MoE/Qwen3_next as example models) (#33173)
Signed-off-by: xuebwang-amd <xuebwang@amd.com>
This commit is contained in:
Changed files (1):
  vllm/model_executor/layers/fused_moe/layer.py — 8 lines changed; file mode: Normal file → Executable file
@@ -996,7 +996,9 @@ class FusedMoE(CustomOp):
             shard_size = expert_data.shape[shard_dim] // 2
         else:
             shard_size = expert_data.shape[shard_dim]
-        if not load_full:
+        # Only narrow if the loaded_weight is not a scalar (0-dim tensor)
+        # and we're not loading the full weight
+        if not load_full and loaded_weight.ndim > 0:
             loaded_weight = loaded_weight.narrow(
                 shard_dim, shard_size * tp_rank, shard_size
             )
@@ -1022,7 +1024,9 @@ class FusedMoE(CustomOp):
         # down_proj: "RowParallel" so tp sharding on input_dim
         # Narrow parameter and load.
         shard_size = expert_data.shape[shard_dim]
-        if not load_full:
+        # Only narrow if the loaded_weight is not a scalar (0-dim tensor)
+        # and we're not loading the full weight
+        if not load_full and loaded_weight.ndim > 0:
             loaded_weight = loaded_weight.narrow(
                 shard_dim, shard_size * tp_rank, shard_size
             )
Reference in New Issue
Block a user