[Quantization][ROCm] Fix MoE weight loading to be robust (Qwen3_MoE/Qwen3_next as example models) (#33173)
Signed-off-by: xuebwang-amd <xuebwang@amd.com>
This commit changes 1 file (8 lines changed):
vllm/model_executor/layers/fused_moe/layer.py (mode changed: Normal file → Executable file)
@@ -996,7 +996,9 @@ class FusedMoE(CustomOp):
             shard_size = expert_data.shape[shard_dim] // 2
         else:
             shard_size = expert_data.shape[shard_dim]
-        if not load_full:
+        # Only narrow if the loaded_weight is not a scalar (0-dim tensor)
+        # and we're not loading the full weight
+        if not load_full and loaded_weight.ndim > 0:
             loaded_weight = loaded_weight.narrow(
                 shard_dim, shard_size * tp_rank, shard_size
             )
@@ -1022,7 +1024,9 @@ class FusedMoE(CustomOp):
         # down_proj: "RowParallel" so tp sharding on input_dim
         # Narrow parameter and load.
         shard_size = expert_data.shape[shard_dim]
-        if not load_full:
+        # Only narrow if the loaded_weight is not a scalar (0-dim tensor)
+        # and we're not loading the full weight
+        if not load_full and loaded_weight.ndim > 0:
             loaded_weight = loaded_weight.narrow(
                 shard_dim, shard_size * tp_rank, shard_size
             )
Reference in New Issue
Block a user