[Kernel] [Quantization] Add MXFP4 and bias support for marlin kernel (#22428)

Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
Signed-off-by: Huzaifa Sidhpurwala <huzaifas@redhat.com>
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Animesh Jain <anijain@umich.edu>
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: kf <kuanfu.liu@embeddedllm.com>
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
Signed-off-by: Sage Moore <sage@neuralmagic.com>
Signed-off-by: tjtanaavllm <tunjian.tan@amd.com>
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
Signed-off-by: Roger Wang <hey@rogerw.me>
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@centml.ai>
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
Signed-off-by: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Signed-off-by: yan <yan.ma@intel.com>
Signed-off-by: Yan Ma <yan.ma@intel.com>
Signed-off-by: Xiao Liu <xiszishu@gmail.com>
Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Signed-off-by: LopezCastroRoberto <roberto.lopez.castro@udc.es>
Signed-off-by: Andy Xie <andy.xning@gmail.com>
Signed-off-by: Haibin Lin <haibin.lin@bytedance.com>
Signed-off-by: David Ben-David <davidb@pliops.com>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
Signed-off-by: zitian.zhao <zitian.zhao@tencentmusic.com>
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Signed-off-by: Abirdcfly <fp544037857@gmail.com>
Signed-off-by: Giancarlo Delfin <gdelfin@meta.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Signed-off-by: huangweixiao <huangweixiao@msh.team>
Signed-off-by: alyosha-swamy <raghav@arcee.ai>
Signed-off-by: Eric Hanley <ericehanley@google.com>
Signed-off-by: Abatom <abzhonghua@gmail.com>
Signed-off-by: CLFutureX <775523362@qq.com>
Signed-off-by: Linkun Chen <github@lkchen.net>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Signed-off-by: tlipoca9 <tlipoca9@gmail.com>
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
Signed-off-by: zitian zhao <zitian.zhao@tencentmusic.com>
Signed-off-by: mgoin <michael@neuralmagic.com>
Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: Benji Beck <benjibeck@meta.com>
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
Signed-off-by: isotr0py <2037008807@qq.com>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Signed-off-by: simon-mo <xmo@berkeley.edu>
Signed-off-by: LucasWilkinson <lwilkinson@neuralmagic.com>
Signed-off-by: Zhang Jason <ning.zhang2@amd.com>
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Signed-off-by: asafg <asafg@ai21.com>
Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
Signed-off-by: Lain <fusiyuan2000@hotmail.com>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Signed-off-by: QscQ <qscqesze@gmail.com>
Signed-off-by: qingjun <qingjun@minimaxi.com>
Signed-off-by: Syed Muhammad Bin Asif <syedmba7@connect.hku.hk>
Signed-off-by: Lionel Villard <villard@us.ibm.com>
Signed-off-by: ycyaw66 <497410282@qq.com>
Signed-off-by: David Chen <530634352@qq.com>
Signed-off-by: Linkun <github@lkchen.net>
Signed-off-by: Moritz Sanft <58110325+msanft@users.noreply.github.com>
Signed-off-by: Ming Yang <minos.future@gmail.com>
Signed-off-by: Adrian Garcia <adrian.garcia@inceptionai.ai>
Signed-off-by: shaojunqi <shaojunqi.sjq@alibaba-inc.com>
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
Signed-off-by: Andrew Chan <andrewkchan.akc@gmail.com>
Signed-off-by: Felix Marty <Felix.Marty@amd.com>
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
Signed-off-by: Shu Wang <shuw@nvidia.com>
Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
Signed-off-by: Shu Wang. <shuw@nvidia.com>
Signed-off-by: XIn Li <xinli@nvidia.com>
Signed-off-by: Junhao Li <junhao@ubicloud.com>
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Signed-off-by: iAmir97 <Amir.balwel@embeddedllm.com>
Signed-off-by: iAmir97 <71513472+iAmir97@users.noreply.github.com>
Signed-off-by: <zyy1102000@gmail.com>
Signed-off-by: Guy Stone <guys@spotify.com>
Signed-off-by: <yyweiss@gmail.com>
Signed-off-by: yyw <yyweiss@gmail.com>
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Signed-off-by: Pradyun Ramadorai <pradyunr@amazon.com>
Signed-off-by: Pradyun92 <142861237+Pradyun92@users.noreply.github.com>
Signed-off-by: Jinzhen Lin <jinzhen.ljz@antgroup.com>
Co-authored-by: rongfu.leng <rongfu.leng@daocloud.io>
Co-authored-by: Huzaifa Sidhpurwala <huzaifas@redhat.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Animesh Jain <jainanimesh2305@yahoo.com>
Co-authored-by: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
Co-authored-by: XiongfeiWei <isaacwxf23@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: JartX <sagformas@gmail.com>
Co-authored-by: fhl2000 <63384265+fhl2000@users.noreply.github.com>
Co-authored-by: vllmellm <vllm.ellm@embeddedllm.com>
Co-authored-by: kf <kuanfu.liu@embeddedllm.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: Dipika Sikka <dipikasikka1@gmail.com>
Co-authored-by: Sage Moore <sage@neuralmagic.com>
Co-authored-by: tjtanaavllm <tunjian.tan@amd.com>
Co-authored-by: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com>
Co-authored-by: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com>
Co-authored-by: Roger Wang <hey@rogerw.me>
Co-authored-by: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com>
Co-authored-by: Yuxuan Zhang <2448370773@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Yan Ma <yan.ma@intel.com>
Co-authored-by: Xiao <xiszishu@gmail.com>
Co-authored-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
Co-authored-by: Roberto L. Castro <38211239+LopezCastroRoberto@users.noreply.github.com>
Co-authored-by: Ning Xie <andy.xning@gmail.com>
Co-authored-by: H <linhaibin.eric@gmail.com>
Co-authored-by: David Ben-David <sdavidbd@gmail.com>
Co-authored-by: David Ben-David <davidb@pliops.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
Co-authored-by: TankNee <nee@tanknee.cn>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com>
Co-authored-by: ZiTian.Zhao <zitian.zhao@tencentmusic.com>
Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Co-authored-by: Abirdcfly <fp544037857@gmail.com>
Co-authored-by: Giancarlo Delfin <32987265+TheEpicDolphin@users.noreply.github.com>
Co-authored-by: Chenxi Yang <cxyang@cs.utexas.edu>
Co-authored-by: Chenxi Yang <cxyang@meta.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Weixiao Huang <hwx.simle@gmail.com>
Co-authored-by: Raghav Ravishankar <113712354+alyosha-swamy@users.noreply.github.com>
Co-authored-by: ericehanley <ericehanley@google.com>
Co-authored-by: Zhonghua Deng <abzhonghua@gmail.com>
Co-authored-by: Po-Han Huang (NVIDIA) <53919306+nvpohanh@users.noreply.github.com>
Co-authored-by: PiteXChen <44110731+CLFutureX@users.noreply.github.com>
Co-authored-by: lkchen <github@lkchen.net>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
Co-authored-by: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
Co-authored-by: tlipoca9 <160737620+tlipoca9@users.noreply.github.com>
Co-authored-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
Co-authored-by: wang.yuqi <noooop@126.com>
Co-authored-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Siyuan Liu <lsiyuan@google.com>
Co-authored-by: Benjamin Chislett <chislett.ben@gmail.com>
Co-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com>
Co-authored-by: Yongye Zhu <zyy1102000@gmail.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Zhang Jason <ning.zhang2@amd.com>
Co-authored-by: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com>
Co-authored-by: asafg <asafg@ai21.com>
Co-authored-by: Lain <siyuanf@nvidia.com>
Co-authored-by: tc-mb <157115220+tc-mb@users.noreply.github.com>
Co-authored-by: imning3 <hbning@pku.edu.cn>
Co-authored-by: Maximilien de Bayser <mbayser@br.ibm.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
Co-authored-by: Tao He <linzhu.ht@alibaba-inc.com>
Co-authored-by: qscqesze <qingjun@minimaxi.com>
Co-authored-by: Syed Muhammad Bin Asif <92625830+syedmba@users.noreply.github.com>
Co-authored-by: Lionel Villard <villard@us.ibm.com>
Co-authored-by: WeiQing Chen <40507679+david6666666@users.noreply.github.com>
Co-authored-by: ycyaw66 <497410282@qq.com>
Co-authored-by: Moritz Sanft <58110325+msanft@users.noreply.github.com>
Co-authored-by: Ming Yang <minos.future@gmail.com>
Co-authored-by: Adrián García García <adrigarvk8@gmail.com>
Co-authored-by: Michael Goin <mgoin@redhat.com>
Co-authored-by: JaceyShao <65159281+JaceyShao@users.noreply.github.com>
Co-authored-by: shaojunqi <shaojunqi.sjq@alibaba-inc.com>
Co-authored-by: Ricardo Decal <crypdick@users.noreply.github.com>
Co-authored-by: Andrew Chan <andrewkchan.akc@gmail.com>
Co-authored-by: fxmarty-amd <felmarty@amd.com>
Co-authored-by: Andrew Sansom <andrew@protopia.ai>
Co-authored-by: Zhiyu <zhiyuc@nvidia.com>
Co-authored-by: Shu Wang <shuw@nvidia.com>
Co-authored-by: XIn Li <xinli@nvidia.com>
Co-authored-by: Junhao Li <streaver91@gmail.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
Co-authored-by: iAmir97 <71513472+iAmir97@users.noreply.github.com>
Co-authored-by: iAmir97 <Amir.balwel@embeddedllm.com>
Co-authored-by: Hong Hanh <hanh.usth@gmail.com>
Co-authored-by: Daniel Serebrenik <74646983+pliops-daniels@users.noreply.github.com>
Co-authored-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: Guy Stone <guys@spotify.com>
Co-authored-by: yyweiss <70619747+yyweiss@users.noreply.github.com>
Co-authored-by: Pradyun92 <142861237+Pradyun92@users.noreply.github.com>
Co-authored-by: Pradyun Ramadorai <pradyunr@amazon.com>
Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com>
Author: Jinzhen Lin
Date: 2025-08-15 02:23:22 +08:00
Committed by: GitHub
Parent: ab9f2cfd19
Commit: 33c63e9547
34 changed files with 1128 additions and 324 deletions


@@ -25,7 +25,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     apply_awq_marlin_linear, awq_to_marlin_zero_points, check_marlin_supported,
     check_marlin_supports_layer, check_moe_marlin_supports_layer,
     marlin_make_empty_g_idx, marlin_make_workspace_new,
-    marlin_moe_permute_scales, marlin_permute_scales,
+    marlin_moe_permute_scales, marlin_permute_bias, marlin_permute_scales,
     moe_awq_to_marlin_zero_points, verify_marlin_supported,
     verify_marlin_supports_shape)
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
@@ -303,6 +303,9 @@ class AWQMarlinLinearMethod(LinearMethodBase):
         layer.g_idx = marlin_make_empty_g_idx(device)
         layer.g_idx_sort_indices = marlin_make_empty_g_idx(device)
 
+        if hasattr(layer, "bias") and layer.bias is not None:
+            layer.bias.data = marlin_permute_bias(layer.bias)
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -469,6 +472,12 @@ class AWQMoEMethod(FusedMoEMethodBase):
             num_bits=self.quant_config.weight_bits)
         replace_parameter(layer, "w2_qzeros", marlin_w2_zp)
 
+        if hasattr(layer, "w13_bias") and layer.w13_bias is not None:
+            layer.w13_bias.data = marlin_permute_bias(layer.w13_bias)
+
+        if hasattr(layer, "w2_bias") and layer.w2_bias is not None:
+            layer.w2_bias.data = marlin_permute_bias(layer.w2_bias)
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -513,6 +522,8 @@ class AWQMoEMethod(FusedMoEMethodBase):
             x,
             layer.w13_qweight,
             layer.w2_qweight,
+            getattr(layer, "w13_bias", None),
+            getattr(layer, "w2_bias", None),
             layer.w13_scales,
             layer.w2_scales,
             router_logits,


@@ -324,6 +324,8 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
             x,
             layer.w13_weight,
             layer.w2_weight,
+            None,
+            None,
             layer.w13_weight_scale,
             layer.w2_weight_scale,
             router_logits,
@@ -795,6 +797,8 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             x,
             layer.w13_weight,
             layer.w2_weight,
+            None,
+            None,
             layer.w13_weight_scale,
             layer.w2_weight_scale,
             router_logits,
@@ -1253,6 +1257,8 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
             x,
             layer.w13_weight_packed,
             layer.w2_weight_packed,
+            None,
+            None,
             layer.w13_weight_scale,
             layer.w2_weight_scale,
             router_logits,


@@ -983,6 +983,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             x,
             layer.w13_weight,
             layer.w2_weight,
+            None,
+            None,
             layer.w13_weight_scale,
             layer.w2_weight_scale,
             router_logits,


@@ -24,7 +24,7 @@ from vllm.model_executor.layers.quantization.utils.gptq_utils import (
     get_dynamic_override, get_linear_quant_method, override_config)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     check_marlin_supported, check_moe_marlin_supports_layer,
-    marlin_make_workspace_new, marlin_moe_permute_scales,
+    marlin_make_workspace_new, marlin_moe_permute_scales, marlin_permute_bias,
     marlin_repeat_scales_on_all_ranks, verify_marlin_supported)
 from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
                                            GroupQuantScaleParameter,
@@ -618,6 +618,12 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
         )
         replace_parameter(layer, "w2_scales", marlin_w2_scales)
 
+        if hasattr(layer, "w13_bias") and layer.w13_bias is not None:
+            layer.w13_bias.data = marlin_permute_bias(layer.w13_bias)
+
+        if hasattr(layer, "w2_bias") and layer.w2_bias is not None:
+            layer.w2_bias.data = marlin_permute_bias(layer.w2_bias)
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -662,6 +668,8 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
             x,
             layer.w13_qweight,
             layer.w2_qweight,
+            getattr(layer, "w13_bias", None),
+            getattr(layer, "w2_bias", None),
             layer.w13_scales,
             layer.w2_scales,
             router_logits,


@@ -14,7 +14,7 @@ from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
-    marlin_make_empty_g_idx, marlin_permute_scales)
+    marlin_make_empty_g_idx, marlin_permute_bias, marlin_permute_scales)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
     MarlinWorkspace)
 from vllm.model_executor.layers.quantization.utils.quant_utils import gptq_pack
@@ -284,6 +284,9 @@ class HQQMarlinMethod(LinearMethodBase):
         layer.marlin_zeros = marlin_zp
         layer.marlin_scales = marlin_s
 
+        if hasattr(layer, "bias") and layer.bias is not None:
+            layer.bias.data = marlin_permute_bias(layer.bias)
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -307,6 +310,7 @@ class HQQMarlinMethod(LinearMethodBase):
             x,
             None,
             layer.marlin_qweight,
+            bias,
             scales,
             None,
             zeros,
@@ -326,7 +330,4 @@ class HQQMarlinMethod(LinearMethodBase):
         if orig_type != torch.float16:
             marlin_out = marlin_out.to(orig_type)
 
-        if bias is not None:
-            marlin_out.add_(bias)
-
         return marlin_out


@@ -9,8 +9,9 @@ from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     MARLIN_SUPPORTED_GROUP_SIZES, apply_gptq_marlin_linear,
     check_marlin_supports_shape, marlin_is_k_full, marlin_make_empty_g_idx,
-    marlin_make_workspace_new, marlin_permute_scales, marlin_sort_g_idx,
-    marlin_zero_points, query_marlin_supported_quant_types, unpack_cols)
+    marlin_make_workspace_new, marlin_permute_bias, marlin_permute_scales,
+    marlin_sort_g_idx, marlin_zero_points, query_marlin_supported_quant_types,
+    unpack_cols)
 from vllm.model_executor.parameter import (BasevLLMParameter,
                                            permute_param_layout_)
 from vllm.platforms import current_platform
@@ -111,6 +112,9 @@ class MarlinLinearKernel(MPLinearKernel):
         self._transform_param(layer, self.w_q_name, transform_w_q)
         self._transform_param(layer, self.w_s_name, transform_w_s)
 
+        if hasattr(layer, "bias") and layer.bias is not None:
+            layer.bias.data = marlin_permute_bias(layer.bias)
+
     def apply_weights(self,
                       layer: torch.nn.Module,
                       x: torch.Tensor,


@@ -1330,6 +1330,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
             x,
             layer.w13_weight,
             layer.w2_weight,
+            None,
+            None,
             layer.w13_weight_scale,
             layer.w2_weight_scale,
             router_logits,


@@ -15,13 +15,17 @@ from vllm.model_executor.layers.linear import (LinearBase,
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
+    prepare_moe_fp4_layer_for_marlin)
 from vllm.model_executor.layers.quantization.utils.mxfp4_utils import (
     _can_support_mxfp4, _swizzle_mxfp4)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     is_layer_skipped)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
-from vllm.utils import next_power_of_2, round_up
+from vllm.scalar_type import scalar_types
+from vllm.utils import (has_triton_kernels, is_torch_equal_or_newer,
+                        next_power_of_2, round_up)
 
 if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
         or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16):
@@ -81,6 +85,21 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
         super().__init__()
         self.topk_indices_dtype = None
         self.moe = moe
+        self.use_marlin = self._should_use_marlin()
+
+    def _should_use_marlin(self):
+        if envs.VLLM_MXFP4_USE_MARLIN is not None:
+            return envs.VLLM_MXFP4_USE_MARLIN
+        if current_platform.is_cuda() and \
+                not current_platform.has_device_capability(100):
+            if not current_platform.is_device_capability(90):
+                # the marlin kernel has better performance on Ampere
+                return True
+            if not has_triton_kernels():
+                return True
+            if not is_torch_equal_or_newer("2.8.0"):
+                return True
+        return False
 
     def create_weights(self, layer: torch.nn.Module, num_experts: int,
                        hidden_size: int, intermediate_size_per_partition: int,
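Editor's note: the heuristic in the hunk above can be overridden per run. A minimal sketch, assuming a vLLM build that contains this PR; the model id is a placeholder for any MXFP4 checkpoint, and the env var must be set before vLLM reads it:

import os

# force the marlin MXFP4 path on; "0" forces it off, unset keeps the heuristic
os.environ["VLLM_MXFP4_USE_MARLIN"] = "1"

from vllm import LLM  # import after setting the env var

llm = LLM(model="your-org/your-mxfp4-model")  # placeholder MXFP4 checkpoint
print(llm.generate("Hello")[0].outputs[0].text)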
@@ -101,11 +120,29 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
         intermediate_size_per_partition_after_pad = \
             intermediate_size_per_partition
 
-        # pad the intermediate size to be a multiple of 2 * mxfp4_block
-        # for to hold non-uniform sharded tensor as well as swizzling
-        # other padding to increase performance
-        if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
-                or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16):
+        if self.use_marlin:
+            # The moe marlin kernel requires that for each linear
+            # n % 256 == 0 and k % 128 == 0.
+            # In gate_up_proj:
+            #   n = 2 * intermediate_size_per_partition_after_pad
+            #   k = hidden_size
+            # In down_proj:
+            #   n = hidden_size
+            #   k = intermediate_size_per_partition_after_pad
+            intermediate_size_per_partition_after_pad = round_up(
+                intermediate_size_per_partition, 128)
+            hidden_size = round_up(hidden_size, 256)
+
+            layer.params_dtype = params_dtype
+            layer.num_experts = num_experts
+            layer.hidden_size = hidden_size
+            layer.intermediate_size_per_partition = \
+                intermediate_size_per_partition_after_pad
+        elif (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
+              or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16):
+            # pad the intermediate size to a multiple of 2 * mxfp4_block,
+            # both to hold non-uniformly sharded tensors and for swizzling;
+            # the extra padding also improves performance
             intermediate_size_per_partition_after_pad = round_up(
                 intermediate_size_per_partition, 256)
             hidden_size = round_up(hidden_size, 256)
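Editor's note: to make the padding rule in the comment above concrete, a small worked sketch in plain Python (the shard sizes are made-up, and round_up mirrors the contract of vllm.utils.round_up used above):

def round_up(x: int, m: int) -> int:
    # smallest multiple of m that is >= x
    return ((x + m - 1) // m) * m

intermediate_size_per_partition = 1440   # hypothetical shard size
hidden_size = 2880                       # hypothetical model width

i_pad = round_up(intermediate_size_per_partition, 128)  # -> 1536
h_pad = round_up(hidden_size, 256)                      # -> 3072

# gate_up_proj: n = 2 * i_pad (a multiple of 256, since 2 * 128 = 256), k = h_pad
# down_proj:    n = h_pad (a multiple of 256),             k = i_pad
assert (2 * i_pad) % 256 == 0 and h_pad % 256 == 0
assert h_pad % 128 == 0 and i_pad % 128 == 0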
@@ -191,8 +228,10 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
         set_weight_attrs(w2_bias, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer):
-        if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
-                or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16):
+        if self.use_marlin:
+            prepare_moe_fp4_layer_for_marlin(layer)
+        elif (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
+              or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16):
             layer.gemm1_alpha = Parameter(torch.tensor(
                 [1.702] * self.num_experts, dtype=torch.float32).cuda(),
                 requires_grad=False)
@@ -399,13 +438,45 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
         if enable_eplb:
             raise NotImplementedError("EPLB is not supported for mxfp4")
 
+        if self.use_marlin:
+            topk_weights, topk_ids = FusedMoE.select_experts(
+                hidden_states=x,
+                router_logits=router_logits,
+                use_grouped_topk=use_grouped_topk,
+                top_k=top_k,
+                renormalize=renormalize,
+                topk_group=topk_group,
+                num_expert_group=num_expert_group,
+                custom_routing_function=custom_routing_function,
+                scoring_func=scoring_func,
+                e_score_correction_bias=e_score_correction_bias)
+
+            return torch.ops.vllm.fused_marlin_moe(
+                x,
+                layer.w13_weight,
+                layer.w2_weight,
+                layer.w13_bias,
+                layer.w2_bias,
+                layer.w13_weight_scale,
+                layer.w2_weight_scale,
+                router_logits,
+                topk_weights,
+                topk_ids,
+                global_scale1=None,
+                global_scale2=None,
+                quant_type_id=scalar_types.float4_e2m1f.id,
+                apply_router_weight_on_input=apply_router_weight_on_input,
+                global_num_experts=global_num_experts,
+                activation=activation,
+                expert_map=expert_map)
+
         assert _can_support_mxfp4(
             use_grouped_topk, topk_group, num_expert_group, expert_map,
             custom_routing_function, e_score_correction_bias,
             apply_router_weight_on_input, scoring_func, activation,
             expert_load_view, logical_to_physical_map,
-            logical_replica_count), ("MXFP4 are not supported\
-                with this configuration.")
+            logical_replica_count), (
+                "MXFP4 is not supported with this configuration.")
 
         if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
                 or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16):


@@ -261,6 +261,13 @@ def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
     return s
 
 
+def marlin_permute_bias(s: torch.Tensor) -> torch.Tensor:
+    origin_shape = s.shape
+    _, scale_perm_single = get_scale_perms()
+    s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single]
+    return s.reshape(*origin_shape).contiguous()
+
+
 def marlin_moe_permute_scales(
     s: torch.Tensor,
     size_k: int,
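Editor's note: a quick property check of the new helper, as a sketch. It assumes a vLLM build with this patch and only exercises what the five lines above show, namely that the bias is reindexed by the single-group scale permutation with its shape preserved (512 is an arbitrary multiple of the permutation width):

import torch
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
    marlin_permute_bias)

bias = torch.arange(512, dtype=torch.float32)
permuted = marlin_permute_bias(bias)

assert permuted.shape == bias.shape                # shape is preserved
assert torch.equal(permuted.sort().values, bias)   # a pure permutation of values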
@@ -410,6 +417,7 @@ def apply_gptq_marlin_linear(
     output = ops.gptq_marlin_gemm(reshaped_x,
                                   None,
                                   weight,
+                                  bias,
                                   weight_scale,
                                   None,
                                   weight_zp,
@@ -425,9 +433,6 @@ def apply_gptq_marlin_linear(
                                   use_fp32_reduce=use_fp32_reduce,
                                   is_zp_float=False)
 
-    if bias is not None:
-        output.add_(bias)  # In-place add
-
     return output.reshape(out_shape)
@@ -456,6 +461,7 @@ def apply_awq_marlin_linear(
     output = ops.gptq_marlin_gemm(reshaped_x,
                                   None,
                                   weight,
+                                  bias,
                                   weight_scale,
                                   None,
                                   weight_zp,
@@ -470,7 +476,4 @@ def apply_awq_marlin_linear(
                                   use_fp32_reduce=use_fp32_reduce,
                                   is_zp_float=False)
 
-    if bias is not None:
-        output.add_(bias)  # In-place add
-
     return output.reshape(out_shape)


@@ -8,8 +8,8 @@ import torch
 import vllm._custom_ops as ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    USE_FP32_REDUCE_DEFAULT, marlin_make_workspace_new, marlin_permute_scales,
-    should_use_atomic_add_reduce)
+    USE_FP32_REDUCE_DEFAULT, marlin_make_workspace_new, marlin_permute_bias,
+    marlin_permute_scales, should_use_atomic_add_reduce)
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
@@ -22,7 +22,7 @@ def is_fp4_marlin_supported():
     return current_platform.has_device_capability(80)
 
 
-def fp4_marlin_process_scales(marlin_scales):
+def nvfp4_marlin_process_scales(marlin_scales):
     if not (marlin_scales >= 0).all():
         logger.warning_once(
             "NVFP4 Marlin assumes the scales to be >=0, but has encountered "
@@ -56,7 +56,20 @@ def fp4_marlin_process_scales(marlin_scales):
     return marlin_scales
 
 
-def fp4_marlin_process_global_scale(global_scale):
+def mxfp4_marlin_process_scales(marlin_scales):
+    # 8 is the number of scales handled by one thread
+    marlin_scales = marlin_scales.view(marlin_scales.size(0) // 2, 2, -1, 8)
+    marlin_scales = marlin_scales.permute(0, 2, 1, 3).reshape(
+        marlin_scales.size(0) * 2, -1)
+
+    # fit the layout of fp8 dequantization
+    marlin_scales = marlin_scales.view(-1, 4)[:, [0, 2, 1, 3]].view(
+        marlin_scales.size(0), -1)
+
+    marlin_scales = marlin_scales.to(torch.float8_e8m0fnu)
+    return marlin_scales
+
+
+def nvfp4_marlin_process_global_scale(global_scale):
     assert global_scale.dtype in [torch.half, torch.bfloat16]
     fp4_exponent = 2
     if global_scale.dtype == torch.half:
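Editor's note: the index gymnastics in mxfp4_marlin_process_scales are easier to see on a toy tensor. A self-contained sketch in plain torch (the final e8m0 cast is left out so it runs anywhere; the sizes are illustrative only):

import torch

s = torch.arange(4 * 16).view(4, 16)  # stand-in for marlin-permuted scales

# pair adjacent rows and interleave their 8-wide chunks; note the reshape
# uses the viewed size(0), so the overall row count is unchanged
t = s.view(s.size(0) // 2, 2, -1, 8)               # (2, 2, 2, 8)
t = t.permute(0, 2, 1, 3).reshape(s.size(0), -1)   # back to (4, 16)

# new row 0 holds the first 8 scales of old rows 0 and 1, side by side
assert torch.equal(t[0], torch.cat([s[0, :8], s[1, :8]]))

# then swap the middle pair inside every run of 4 values ([0, 2, 1, 3]),
# matching the fp8 dequantization layout mentioned above
u = t.reshape(-1, 4)[:, [0, 2, 1, 3]].view(t.size(0), -1)
assert u[0, :4].tolist() == [0, 2, 1, 3]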
@@ -73,7 +86,7 @@ def apply_fp4_marlin_linear(
         input: torch.Tensor,
         weight: torch.Tensor,
         weight_scale: torch.Tensor,
-        weight_scale_2: torch.Tensor,
+        weight_scale_2: Optional[torch.Tensor],
         workspace: torch.Tensor,
         size_n: int,
         size_k: int,
@@ -94,6 +107,7 @@ def apply_fp4_marlin_linear(
     output = ops.gptq_marlin_gemm(a=reshaped_x,
                                   c=None,
                                   b_q_weight=weight,
+                                  b_bias=bias,
                                   b_scales=weight_scale,
                                   global_scale=weight_scale_2,
                                   b_zeros=None,
@@ -107,9 +121,6 @@ def apply_fp4_marlin_linear(
                                   use_atomic_add=use_atomic_add,
                                   use_fp32_reduce=use_fp32_reduce)
 
-    if bias is not None:
-        output.add_(bias)  # In-place add
-
     return output.reshape(out_shape)
@@ -120,6 +131,9 @@ def prepare_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
                 "be used leveraging the Marlin kernel. This may degrade "
                 "performance for compute-heavy workloads.")
 
+    is_nvfp4 = hasattr(layer, "weight_scale_2")
+    group_size = 16 if is_nvfp4 else 32
+
     part_size_n = layer.output_size_per_partition
     part_size_k = layer.input_size_per_partition
     param_dtype = layer.params_dtype
@@ -145,18 +159,35 @@ def prepare_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
     # WEIGHT SCALES
     # Permute scales
-    weight_scale = layer.weight_scale.T.to(param_dtype)
+    weight_scale = layer.weight_scale.T.contiguous()
+    if not is_nvfp4:
+        weight_scale = weight_scale.view(torch.float8_e8m0fnu)
+    weight_scale = weight_scale.to(param_dtype)
+
     weight_scale = marlin_permute_scales(s=weight_scale,
                                          size_k=part_size_k,
                                          size_n=part_size_n,
-                                         group_size=16)
-    weight_scale = fp4_marlin_process_scales(weight_scale)
-    layer.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)
+                                         group_size=group_size)
 
-    weight_scale_2 = layer.weight_scale_2.to(param_dtype)
-    weight_scale_2 = fp4_marlin_process_global_scale(weight_scale_2)
-    layer.weight_scale_2 = torch.nn.Parameter(weight_scale_2,
-                                              requires_grad=False)
+    if is_nvfp4:
+        weight_scale = nvfp4_marlin_process_scales(weight_scale)
+        layer.weight_scale = torch.nn.Parameter(weight_scale,
+                                                requires_grad=False)
+
+        weight_scale_2 = layer.weight_scale_2.to(param_dtype)
+        weight_scale_2 = nvfp4_marlin_process_global_scale(weight_scale_2)
+        layer.weight_scale_2 = torch.nn.Parameter(weight_scale_2,
+                                                  requires_grad=False)
+    else:
+        weight_scale = mxfp4_marlin_process_scales(weight_scale)
+        layer.weight_scale = torch.nn.Parameter(weight_scale,
+                                                requires_grad=False)
+
+    if hasattr(layer, "bias") and layer.bias is not None:
+        assert layer.bias.shape == (part_size_n, )
+        bias = marlin_permute_bias(layer.bias)
+        layer.bias = torch.nn.Parameter(bias, requires_grad=False)
 
     return
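Editor's note: the .view(torch.float8_e8m0fnu) above reinterprets each raw scale byte as an exponent-only value. A minimal decoding sketch, assuming a PyTorch recent enough to define the e8m0 dtype and its float conversion (an e8m0 byte b represents 2**(b - 127); there are no sign or mantissa bits):

import torch

raw = torch.tensor([127, 128, 126, 121], dtype=torch.uint8)
scales = raw.view(torch.float8_e8m0fnu).to(torch.float32)

# 2**0, 2**1, 2**-1, 2**-6
assert scales.tolist() == [1.0, 2.0, 0.5, 0.015625]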
@@ -168,6 +199,9 @@ def prepare_moe_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
                 "be used leveraging the Marlin kernel. This may degrade "
                 "performance for compute-heavy workloads.")
 
+    is_nvfp4 = hasattr(layer, "w13_weight_scale_2")
+    group_size = 16 if is_nvfp4 else 32
+
     e = layer.num_experts
     k = layer.hidden_size
     n = layer.intermediate_size_per_partition
@@ -208,8 +242,13 @@ def prepare_moe_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
     # WEIGHT SCALES
     # Permute scales
     for name in ["w13", "w2"]:
-        scales = getattr(layer, name + "_weight_scale").to(param_dtype)
-        global_scale = getattr(layer, name + "_weight_scale_2").to(param_dtype)
+        scales = getattr(layer, name + "_weight_scale")
+        if not is_nvfp4:
+            scales = scales.view(torch.float8_e8m0fnu)
+        scales = scales.to(param_dtype)
+        if is_nvfp4:
+            global_scale = getattr(layer,
+                                   name + "_weight_scale_2").to(param_dtype)
 
         tensor_list = []
         if "w13" in name:
@@ -218,23 +257,47 @@ def prepare_moe_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
             size_n, size_k = k, n
 
         for i in range(e):
-            marlin_scales = marlin_permute_scales(s=scales[i].T,
+            scale = scales[i].T
+            marlin_scales = marlin_permute_scales(s=scale,
                                                   size_k=size_k,
                                                   size_n=size_n,
-                                                  group_size=16)
-            marlin_scales = fp4_marlin_process_scales(marlin_scales)
+                                                  group_size=group_size)
+            if is_nvfp4:
+                marlin_scales = nvfp4_marlin_process_scales(marlin_scales)
+            else:
+                marlin_scales = mxfp4_marlin_process_scales(marlin_scales)
             tensor_list.append(marlin_scales)
 
         scales = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
         scales = torch.nn.Parameter(scales, requires_grad=False)
         setattr(layer, name + "_weight_scale", scales)
 
-        global_scale = fp4_marlin_process_global_scale(global_scale)
-        global_scale = torch.nn.Parameter(global_scale, requires_grad=False)
-        setattr(layer, name + "_weight_scale_2", global_scale)
+        if is_nvfp4:
+            global_scale = nvfp4_marlin_process_global_scale(global_scale)
+            global_scale = torch.nn.Parameter(global_scale,
+                                              requires_grad=False)
+            setattr(layer, name + "_weight_scale_2", global_scale)
+
+    # BIAS
+    # Permute bias
+    for name in ["w13_bias", "w2_bias"]:
+        if not hasattr(layer, name):
+            continue
+        bias = getattr(layer, name).to(param_dtype)
+
+        tensor_list = []
+        for i in range(e):
+            expert_bias = bias[i]
+            tensor_list.append(marlin_permute_bias(expert_bias))
+
+        bias = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
+        bias = torch.nn.Parameter(bias, requires_grad=False)
+        setattr(layer, name, bias)
 
 
-def rand_marlin_weight_fp4_like(weight, group_size):
+def rand_marlin_weight_nvfp4_like(weight, group_size):
     assert group_size > 0
     size_n, size_k = weight.shape
     device = weight.device
@@ -276,8 +339,58 @@ def rand_marlin_weight_fp4_like(weight, group_size):
                                           size_k=size_k,
                                           size_n=size_n,
                                           group_size=group_size)
-    marlin_scales = fp4_marlin_process_scales(marlin_scales)
+    marlin_scales = nvfp4_marlin_process_scales(marlin_scales)
 
-    global_scale = fp4_marlin_process_global_scale(global_scale)
+    global_scale = nvfp4_marlin_process_global_scale(global_scale)
 
     return weight_ref.T, marlin_qweight, marlin_scales, global_scale
 
 
+def rand_marlin_weight_mxfp4_like(weight, group_size):
+    assert group_size > 0
+    size_n, size_k = weight.shape
+    device = weight.device
+
+    scales = torch.randint(100,
+                           125, (size_n, size_k // group_size),
+                           dtype=torch.uint8,
+                           device=weight.device)
+    scales = scales.view(torch.float8_e8m0fnu)
+
+    fp4_weight = torch.randint(0,
+                               256, (size_n, size_k // 2),
+                               dtype=torch.uint8,
+                               device=weight.device)
+
+    fp4_weight_part_1 = ((fp4_weight & 0b10000000) |
+                         ((fp4_weight & 0b01110000) >> 2))
+    fp4_weight_part_1 = fp4_weight_part_1.view(torch.float8_e4m3fn)
+    fp4_weight_part_1 = fp4_weight_part_1.to(weight.dtype) * (2**6)
+
+    fp4_weight2 = fp4_weight << 4
+    fp4_weight_part_2 = ((fp4_weight2 & 0b10000000) |
+                         ((fp4_weight2 & 0b01110000) >> 2))
+    fp4_weight_part_2 = fp4_weight_part_2.view(torch.float8_e4m3fn)
+    fp4_weight_part_2 = fp4_weight_part_2.to(weight.dtype) * (2**6)
+
+    weight_ref = torch.cat(
+        [fp4_weight_part_2.unsqueeze(2),
+         fp4_weight_part_1.unsqueeze(2)], 2).view(size_n, size_k)
+    weight_ref = weight_ref * \
+        scales.repeat_interleave(group_size, 1).to(weight.dtype)
+
+    marlin_qweight = ops.gptq_marlin_repack(
+        b_q_weight=fp4_weight.view(torch.int32).T.contiguous(),
+        perm=torch.empty(0, dtype=torch.int, device=device),
+        size_k=size_k,
+        size_n=size_n,
+        num_bits=4,
+    )
+
+    marlin_scales = marlin_permute_scales(s=scales.T.to(weight.dtype),
+                                          size_k=size_k,
+                                          size_n=size_n,
+                                          group_size=group_size)
+    marlin_scales = mxfp4_marlin_process_scales(marlin_scales)
+
+    return weight_ref.T, marlin_qweight, marlin_scales.to(torch.float8_e8m0fnu)
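Editor's note on the bit trick in the helper above: an e2m1 nibble `s e e m` is re-homed into an e4m3 byte as `s 0 0 e e m 0 0`, which decodes to (1 + m/2) * 2**(e - 7); multiplying by 2**6 then recovers the true fp4 value (1 + m/2) * 2**(e - 1), and the subnormal e = 0 case lands on m/2 the same way. A standalone check of that identity (pure torch; only torch.float8_e4m3fn is assumed):

import torch

def fp4_to_float(nibble: int) -> float:
    # same remapping as above, applied to a single e2m1 nibble:
    # the sign bit moves to bit 7, the three e/m bits to bits 4..2
    byte = ((nibble & 0b1000) << 4) | ((nibble & 0b0111) << 2)
    as_e4m3 = torch.tensor([byte], dtype=torch.uint8).view(torch.float8_e4m3fn)
    return float(as_e4m3.to(torch.float32)) * 2**6

# e2m1 represents +/- {0, 0.5, 1, 1.5, 2, 3, 4, 6}
expected = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]
assert [fp4_to_float(n) for n in range(8)] == expected
assert [fp4_to_float(n | 0b1000) for n in range(8)] == [-v for v in expected]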


@@ -8,8 +8,8 @@ import torch
 import vllm._custom_ops as ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    USE_FP32_REDUCE_DEFAULT, marlin_make_workspace_new, marlin_permute_scales,
-    should_use_atomic_add_reduce)
+    USE_FP32_REDUCE_DEFAULT, marlin_make_workspace_new, marlin_permute_bias,
+    marlin_permute_scales, should_use_atomic_add_reduce)
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
@@ -58,6 +58,7 @@ def apply_fp8_marlin_linear(
     output = ops.gptq_marlin_gemm(a=reshaped_x,
                                   c=None,
                                   b_q_weight=weight,
+                                  b_bias=bias,
                                   b_scales=weight_scale,
                                   global_scale=None,
                                   b_zeros=None,
@@ -71,9 +72,6 @@ def apply_fp8_marlin_linear(
                                   use_atomic_add=use_atomic_add,
                                   use_fp32_reduce=use_fp32_reduce)
 
-    if bias is not None:
-        output.add_(bias)  # In-place add
-
     return output.reshape(out_shape)
@@ -160,6 +158,11 @@ def prepare_fp8_layer_for_marlin(layer: torch.nn.Module,
     marlin_scales = fp8_fused_exponent_bias_into_scales(marlin_scales)
     layer.weight_scale = torch.nn.Parameter(marlin_scales, requires_grad=False)
 
+    if hasattr(layer, "bias") and layer.bias is not None:
+        assert layer.bias.shape == (part_size_n, )
+        bias = marlin_permute_bias(layer.bias)
+        layer.bias = torch.nn.Parameter(bias, requires_grad=False)
+
 
 def prepare_moe_fp8_layer_for_marlin(layer: torch.nn.Module,
                                      size_k_first: bool = True) -> None:
@@ -274,6 +277,23 @@ def prepare_moe_fp8_layer_for_marlin(layer: torch.nn.Module,
         setattr(layer, name + "_weight_scale", scales)
 
+    # BIAS
+    # Permute bias
+    for name in ["w13_bias", "w2_bias"]:
+        if not hasattr(layer, name):
+            continue
+        bias = getattr(layer, name).to(layer.orig_dtype)
+
+        tensor_list = []
+        for i in range(e):
+            expert_bias = bias[i]
+            tensor_list.append(marlin_permute_bias(expert_bias))
+
+        bias = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
+        bias = torch.nn.Parameter(bias, requires_grad=False)
+        setattr(layer, name, bias)
+
 
 def pack_fp8_to_int32(fp8_tensor: torch.Tensor,
                       size_k_first: bool = True) -> torch.Tensor:


@@ -61,7 +61,7 @@ def _can_support_mxfp4(use_grouped_topk: bool = False,
                        e_score_correction_bias: Optional[torch.Tensor] = None,
                        apply_router_weight_on_input: bool = False,
                        scoring_func: str = "softmax",
-                       activation: str = "silu",
+                       activation: str = "swiglu_oai",
                        expert_load_view: Optional[torch.Tensor] = None,
                        logical_to_physical_map: Optional[torch.Tensor] = None,
                        logical_replica_count: Optional[torch.Tensor] = None):