[XPU] Bump vllm-xpu-kernels to v0.1.5 and transpose MoE weights (#38342)
Signed-off-by: mayuyuace <qiming1.zhang@intel.com>
Signed-off-by: Qiming Zhang <qiming1.zhang@intel.com>
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
This commit is contained in:
@@ -15,4 +15,4 @@ torch==2.10.0+xpu
|
||||
torchaudio
|
||||
torchvision
|
||||
|
||||
vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.4/vllm_xpu_kernels-0.1.4-cp38-abi3-manylinux_2_28_x86_64.whl
|
||||
vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.5/vllm_xpu_kernels-0.1.5-cp38-abi3-manylinux_2_28_x86_64.whl
|
||||
|
||||
@@ -222,6 +222,18 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
|
||||
self.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer)
|
||||
else:
|
||||
self.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer)
|
||||
elif current_platform.is_xpu():
|
||||
w13 = layer.w13_weight
|
||||
w2 = layer.w2_weight
|
||||
|
||||
w13.data = w13.transpose(-1, -2).contiguous()
|
||||
w2.data = w2.transpose(-1, -2).contiguous()
|
||||
|
||||
self._setup_kernel(
|
||||
layer=layer,
|
||||
w13=w13,
|
||||
w2=w2,
|
||||
)
|
||||
else:
|
||||
self._setup_kernel(
|
||||
layer=layer,
|
||||
|
||||
@@ -1028,6 +1028,10 @@ class Fp8OnlineMoEMethod(Fp8MoEMethod):
|
||||
layer.w2_weight[expert, :, :]
|
||||
)
|
||||
|
||||
if current_platform.is_xpu():
|
||||
w13.data = w13.transpose(-1, -2).contiguous()
|
||||
w2.data = w2.transpose(-1, -2).contiguous()
|
||||
|
||||
# Shuffle weights to runtime format and setup kernel.
|
||||
self._setup_kernel(
|
||||
layer,
|
||||
|
||||
Reference in New Issue
Block a user