[MoE Refactor] Oracle Select FP8+NVFP4 Kernels In Priority (#32414)
This commit is contained in:
@@ -8,6 +8,7 @@ import pytest
|
||||
import torch
|
||||
|
||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from tests.kernels.moe.utils import make_dummy_moe_config
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.model_executor.layers.fused_moe import fused_experts, fused_topk
|
||||
@@ -193,16 +194,18 @@ def run_with_expert_maps(
|
||||
|
||||
out_tensor = torch.zeros_like(cutlass_moe_kwargs["hidden_states"])
|
||||
for kwargs, new_quant_config in slice_experts():
|
||||
w2 = kwargs["w2"]
|
||||
a = kwargs["hidden_states"]
|
||||
kernel = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(),
|
||||
CutlassExpertsFp8(
|
||||
out_dtype=kwargs["hidden_states"].dtype,
|
||||
# NOTE(rob): w2 is shaped as [E, hidden, intermediate]
|
||||
e=kwargs["w2"].shape[0], # type: ignore[union-attr]
|
||||
n=kwargs["w2"].shape[2], # type: ignore[union-attr]
|
||||
k=kwargs["w2"].shape[1], # type: ignore[union-attr]
|
||||
moe_config=make_dummy_moe_config(
|
||||
num_experts=w2.shape[0],
|
||||
hidden_dim=w2.shape[1],
|
||||
intermediate_size_per_partition=w2.shape[2],
|
||||
in_dtype=a.dtype,
|
||||
),
|
||||
quant_config=new_quant_config,
|
||||
device="cuda",
|
||||
),
|
||||
)
|
||||
out_tensor = out_tensor + kernel(**kwargs)
|
||||
@@ -249,19 +252,19 @@ def run_8_bit(
|
||||
"topk_ids": topk_ids,
|
||||
}
|
||||
|
||||
num_experts = moe_tensors.w1.size(0)
|
||||
num_experts = moe_tensors.w1.size(0) # type: ignore[attr-defined]
|
||||
with_ep = num_local_experts is not None or num_local_experts == num_experts
|
||||
if not with_ep:
|
||||
kernel = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(),
|
||||
CutlassExpertsFp8(
|
||||
out_dtype=moe_tensors.a.dtype,
|
||||
# NOTE(rob): w2 is shaped as [E, hidden, intermediate]
|
||||
e=moe_tensors.w2_q.shape[0], # type: ignore[union-attr]
|
||||
n=moe_tensors.w2_q.shape[2], # type: ignore[union-attr]
|
||||
k=moe_tensors.w2_q.shape[1], # type: ignore[union-attr]
|
||||
moe_config=make_dummy_moe_config(
|
||||
num_experts=moe_tensors.w2_q.shape[0], # type: ignore[union-attr]
|
||||
hidden_dim=moe_tensors.w2_q.shape[1], # type: ignore[union-attr]
|
||||
intermediate_size_per_partition=moe_tensors.w2_q.shape[2], # type: ignore[union-attr]
|
||||
in_dtype=moe_tensors.a.dtype,
|
||||
),
|
||||
quant_config=quant_config,
|
||||
device="cuda",
|
||||
),
|
||||
)
|
||||
return kernel(**kwargs)
|
||||
|
||||
Reference in New Issue
Block a user