[Kernel] [Quantization] Add MXFP4 and bias support for marlin kernel (#22428)

Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
Signed-off-by: Huzaifa Sidhpurwala <huzaifas@redhat.com>
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Animesh Jain <anijain@umich.edu>
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: kf <kuanfu.liu@embeddedllm.com>
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
Signed-off-by: Sage Moore <sage@neuralmagic.com>
Signed-off-by: tjtanaavllm <tunjian.tan@amd.com>
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
Signed-off-by: Roger Wang <hey@rogerw.me>
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@centml.ai>
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
Signed-off-by: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Signed-off-by: yan <yan.ma@intel.com>
Signed-off-by: Yan Ma <yan.ma@intel.com>
Signed-off-by: Xiao Liu <xiszishu@gmail.com>
Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Signed-off-by: LopezCastroRoberto <roberto.lopez.castro@udc.es>
Signed-off-by: Andy Xie <andy.xning@gmail.com>
Signed-off-by: Haibin Lin <haibin.lin@bytedance.com>
Signed-off-by: David Ben-David <davidb@pliops.com>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
Signed-off-by: zitian.zhao <zitian.zhao@tencentmusic.com>
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Signed-off-by: Abirdcfly <fp544037857@gmail.com>
Signed-off-by: Giancarlo Delfin <gdelfin@meta.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Signed-off-by: huangweixiao <huangweixiao@msh.team>
Signed-off-by: alyosha-swamy <raghav@arcee.ai>
Signed-off-by: Eric Hanley <ericehanley@google.com>
Signed-off-by: Abatom <abzhonghua@gmail.com>
Signed-off-by: CLFutureX <775523362@qq.com>
Signed-off-by: Linkun Chen <github@lkchen.net>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Signed-off-by: tlipoca9 <tlipoca9@gmail.com>
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
Signed-off-by: zitian zhao <zitian.zhao@tencentmusic.com>
Signed-off-by: mgoin <michael@neuralmagic.com>
Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: Benji Beck <benjibeck@meta.com>
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
Signed-off-by: isotr0py <2037008807@qq.com>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Signed-off-by: simon-mo <xmo@berkeley.edu>
Signed-off-by: LucasWilkinson <lwilkinson@neuralmagic.com>
Signed-off-by: Zhang Jason <ning.zhang2@amd.com>
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Signed-off-by: asafg <asafg@ai21.com>
Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
Signed-off-by: Lain <fusiyuan2000@hotmail.com>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Signed-off-by: QscQ <qscqesze@gmail.com>
Signed-off-by: qingjun <qingjun@minimaxi.com>
Signed-off-by: Syed Muhammad Bin Asif <syedmba7@connect.hku.hk>
Signed-off-by: Lionel Villard <villard@us.ibm.com>
Signed-off-by: ycyaw66 <497410282@qq.com>
Signed-off-by: David Chen <530634352@qq.com>
Signed-off-by: Linkun <github@lkchen.net>
Signed-off-by: Moritz Sanft <58110325+msanft@users.noreply.github.com>
Signed-off-by: Ming Yang <minos.future@gmail.com>
Signed-off-by: Adrian Garcia <adrian.garcia@inceptionai.ai>
Signed-off-by: shaojunqi <shaojunqi.sjq@alibaba-inc.com>
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
Signed-off-by: Andrew Chan <andrewkchan.akc@gmail.com>
Signed-off-by: Felix Marty <Felix.Marty@amd.com>
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
Signed-off-by: Shu Wang <shuw@nvidia.com>
Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
Signed-off-by: Shu Wang. <shuw@nvidia.com>
Signed-off-by: XIn Li <xinli@nvidia.com>
Signed-off-by: Junhao Li <junhao@ubicloud.com>
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Signed-off-by: iAmir97 <Amir.balwel@embeddedllm.com>
Signed-off-by: iAmir97 <71513472+iAmir97@users.noreply.github.com>
Signed-off-by: <zyy1102000@gmail.com>
Signed-off-by: Guy Stone <guys@spotify.com>
Signed-off-by: <yyweiss@gmail.com>
Signed-off-by: yyw <yyweiss@gmail.com>
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Signed-off-by: Pradyun Ramadorai <pradyunr@amazon.com>
Signed-off-by: Pradyun92 <142861237+Pradyun92@users.noreply.github.com>
Signed-off-by: Jinzhen Lin <jinzhen.ljz@antgroup.com>
Co-authored-by: rongfu.leng <rongfu.leng@daocloud.io>
Co-authored-by: Huzaifa Sidhpurwala <huzaifas@redhat.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Animesh Jain <jainanimesh2305@yahoo.com>
Co-authored-by: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
Co-authored-by: XiongfeiWei <isaacwxf23@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: JartX <sagformas@gmail.com>
Co-authored-by: fhl2000 <63384265+fhl2000@users.noreply.github.com>
Co-authored-by: vllmellm <vllm.ellm@embeddedllm.com>
Co-authored-by: kf <kuanfu.liu@embeddedllm.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: Dipika Sikka <dipikasikka1@gmail.com>
Co-authored-by: Sage Moore <sage@neuralmagic.com>
Co-authored-by: tjtanaavllm <tunjian.tan@amd.com>
Co-authored-by: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com>
Co-authored-by: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com>
Co-authored-by: Roger Wang <hey@rogerw.me>
Co-authored-by: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com>
Co-authored-by: Yuxuan Zhang <2448370773@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Yan Ma <yan.ma@intel.com>
Co-authored-by: Xiao <xiszishu@gmail.com>
Co-authored-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
Co-authored-by: Roberto L. Castro <38211239+LopezCastroRoberto@users.noreply.github.com>
Co-authored-by: Ning Xie <andy.xning@gmail.com>
Co-authored-by: H <linhaibin.eric@gmail.com>
Co-authored-by: David Ben-David <sdavidbd@gmail.com>
Co-authored-by: David Ben-David <davidb@pliops.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
Co-authored-by: TankNee <nee@tanknee.cn>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com>
Co-authored-by: ZiTian.Zhao <zitian.zhao@tencentmusic.com>
Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Co-authored-by: Abirdcfly <fp544037857@gmail.com>
Co-authored-by: Giancarlo Delfin <32987265+TheEpicDolphin@users.noreply.github.com>
Co-authored-by: Chenxi Yang <cxyang@cs.utexas.edu>
Co-authored-by: Chenxi Yang <cxyang@meta.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Weixiao Huang <hwx.simle@gmail.com>
Co-authored-by: Raghav Ravishankar <113712354+alyosha-swamy@users.noreply.github.com>
Co-authored-by: ericehanley <ericehanley@google.com>
Co-authored-by: Zhonghua Deng <abzhonghua@gmail.com>
Co-authored-by: Po-Han Huang (NVIDIA) <53919306+nvpohanh@users.noreply.github.com>
Co-authored-by: PiteXChen <44110731+CLFutureX@users.noreply.github.com>
Co-authored-by: lkchen <github@lkchen.net>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
Co-authored-by: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
Co-authored-by: tlipoca9 <160737620+tlipoca9@users.noreply.github.com>
Co-authored-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
Co-authored-by: wang.yuqi <noooop@126.com>
Co-authored-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Siyuan Liu <lsiyuan@google.com>
Co-authored-by: Benjamin Chislett <chislett.ben@gmail.com>
Co-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com>
Co-authored-by: Yongye Zhu <zyy1102000@gmail.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Zhang Jason <ning.zhang2@amd.com>
Co-authored-by: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com>
Co-authored-by: asafg <asafg@ai21.com>
Co-authored-by: Lain <siyuanf@nvidia.com>
Co-authored-by: tc-mb <157115220+tc-mb@users.noreply.github.com>
Co-authored-by: imning3 <hbning@pku.edu.cn>
Co-authored-by: Maximilien de Bayser <mbayser@br.ibm.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
Co-authored-by: Tao He <linzhu.ht@alibaba-inc.com>
Co-authored-by: qscqesze <qingjun@minimaxi.com>
Co-authored-by: Syed Muhammad Bin Asif <92625830+syedmba@users.noreply.github.com>
Co-authored-by: Lionel Villard <villard@us.ibm.com>
Co-authored-by: WeiQing Chen <40507679+david6666666@users.noreply.github.com>
Co-authored-by: ycyaw66 <497410282@qq.com>
Co-authored-by: Moritz Sanft <58110325+msanft@users.noreply.github.com>
Co-authored-by: Ming Yang <minos.future@gmail.com>
Co-authored-by: Adrián García García <adrigarvk8@gmail.com>
Co-authored-by: Michael Goin <mgoin@redhat.com>
Co-authored-by: JaceyShao <65159281+JaceyShao@users.noreply.github.com>
Co-authored-by: shaojunqi <shaojunqi.sjq@alibaba-inc.com>
Co-authored-by: Ricardo Decal <crypdick@users.noreply.github.com>
Co-authored-by: Andrew Chan <andrewkchan.akc@gmail.com>
Co-authored-by: fxmarty-amd <felmarty@amd.com>
Co-authored-by: Andrew Sansom <andrew@protopia.ai>
Co-authored-by: Zhiyu <zhiyuc@nvidia.com>
Co-authored-by: Shu Wang <shuw@nvidia.com>
Co-authored-by: XIn Li <xinli@nvidia.com>
Co-authored-by: Junhao Li <streaver91@gmail.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
Co-authored-by: iAmir97 <71513472+iAmir97@users.noreply.github.com>
Co-authored-by: iAmir97 <Amir.balwel@embeddedllm.com>
Co-authored-by: Hong Hanh <hanh.usth@gmail.com>
Co-authored-by: Daniel Serebrenik <74646983+pliops-daniels@users.noreply.github.com>
Co-authored-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: Guy Stone <guys@spotify.com>
Co-authored-by: yyweiss <70619747+yyweiss@users.noreply.github.com>
Co-authored-by: Pradyun92 <142861237+Pradyun92@users.noreply.github.com>
Co-authored-by: Pradyun Ramadorai <pradyunr@amazon.com>
Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com>
Author: Jinzhen Lin
Date: 2025-08-15 02:23:22 +08:00
Committed by: GitHub
Parent: ab9f2cfd19
Commit: 33c63e9547
34 changed files with 1128 additions and 324 deletions


@@ -25,7 +25,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     apply_awq_marlin_linear, awq_to_marlin_zero_points, check_marlin_supported,
     check_marlin_supports_layer, check_moe_marlin_supports_layer,
     marlin_make_empty_g_idx, marlin_make_workspace_new,
-    marlin_moe_permute_scales, marlin_permute_scales,
+    marlin_moe_permute_scales, marlin_permute_bias, marlin_permute_scales,
     moe_awq_to_marlin_zero_points, verify_marlin_supported,
     verify_marlin_supports_shape)
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
@@ -303,6 +303,9 @@ class AWQMarlinLinearMethod(LinearMethodBase):
         layer.g_idx = marlin_make_empty_g_idx(device)
         layer.g_idx_sort_indices = marlin_make_empty_g_idx(device)
 
+        if hasattr(layer, "bias") and layer.bias is not None:
+            layer.bias.data = marlin_permute_bias(layer.bias)
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -469,6 +472,12 @@ class AWQMoEMethod(FusedMoEMethodBase):
             num_bits=self.quant_config.weight_bits)
         replace_parameter(layer, "w2_qzeros", marlin_w2_zp)
 
+        if hasattr(layer, "w13_bias") and layer.w13_bias is not None:
+            layer.w13_bias.data = marlin_permute_bias(layer.w13_bias)
+
+        if hasattr(layer, "w2_bias") and layer.w2_bias is not None:
+            layer.w2_bias.data = marlin_permute_bias(layer.w2_bias)
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -513,6 +522,8 @@ class AWQMoEMethod(FusedMoEMethodBase):
             x,
             layer.w13_qweight,
             layer.w2_qweight,
+            getattr(layer, "w13_bias", None),
+            getattr(layer, "w2_bias", None),
             layer.w13_scales,
             layer.w2_scales,
             router_logits,


@@ -324,6 +324,8 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
             x,
             layer.w13_weight,
             layer.w2_weight,
+            None,
+            None,
             layer.w13_weight_scale,
             layer.w2_weight_scale,
             router_logits,
@@ -795,6 +797,8 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             x,
             layer.w13_weight,
             layer.w2_weight,
+            None,
+            None,
             layer.w13_weight_scale,
             layer.w2_weight_scale,
             router_logits,
@@ -1253,6 +1257,8 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
             x,
             layer.w13_weight_packed,
             layer.w2_weight_packed,
+            None,
+            None,
             layer.w13_weight_scale,
             layer.w2_weight_scale,
             router_logits,


@@ -983,6 +983,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             x,
             layer.w13_weight,
             layer.w2_weight,
+            None,
+            None,
             layer.w13_weight_scale,
             layer.w2_weight_scale,
             router_logits,


@@ -24,7 +24,7 @@ from vllm.model_executor.layers.quantization.utils.gptq_utils import (
     get_dynamic_override, get_linear_quant_method, override_config)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     check_marlin_supported, check_moe_marlin_supports_layer,
-    marlin_make_workspace_new, marlin_moe_permute_scales,
+    marlin_make_workspace_new, marlin_moe_permute_scales, marlin_permute_bias,
     marlin_repeat_scales_on_all_ranks, verify_marlin_supported)
 from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
                                            GroupQuantScaleParameter,
@@ -618,6 +618,12 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
         )
         replace_parameter(layer, "w2_scales", marlin_w2_scales)
 
+        if hasattr(layer, "w13_bias") and layer.w13_bias is not None:
+            layer.w13_bias.data = marlin_permute_bias(layer.w13_bias)
+
+        if hasattr(layer, "w2_bias") and layer.w2_bias is not None:
+            layer.w2_bias.data = marlin_permute_bias(layer.w2_bias)
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -662,6 +668,8 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
             x,
             layer.w13_qweight,
             layer.w2_qweight,
+            getattr(layer, "w13_bias", None),
+            getattr(layer, "w2_bias", None),
             layer.w13_scales,
             layer.w2_scales,
             router_logits,


@@ -14,7 +14,7 @@ from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
-    marlin_make_empty_g_idx, marlin_permute_scales)
+    marlin_make_empty_g_idx, marlin_permute_bias, marlin_permute_scales)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
     MarlinWorkspace)
 from vllm.model_executor.layers.quantization.utils.quant_utils import gptq_pack
@@ -284,6 +284,9 @@ class HQQMarlinMethod(LinearMethodBase):
         layer.marlin_zeros = marlin_zp
         layer.marlin_scales = marlin_s
 
+        if hasattr(layer, "bias") and layer.bias is not None:
+            layer.bias.data = marlin_permute_bias(layer.bias)
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -307,6 +310,7 @@ class HQQMarlinMethod(LinearMethodBase):
             x,
             None,
             layer.marlin_qweight,
+            bias,
             scales,
             None,
             zeros,
@@ -326,7 +330,4 @@ class HQQMarlinMethod(LinearMethodBase):
         if orig_type != torch.float16:
             marlin_out = marlin_out.to(orig_type)
 
-        if bias is not None:
-            marlin_out.add_(bias)
-
         return marlin_out


@@ -9,8 +9,9 @@ from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     MARLIN_SUPPORTED_GROUP_SIZES, apply_gptq_marlin_linear,
     check_marlin_supports_shape, marlin_is_k_full, marlin_make_empty_g_idx,
-    marlin_make_workspace_new, marlin_permute_scales, marlin_sort_g_idx,
-    marlin_zero_points, query_marlin_supported_quant_types, unpack_cols)
+    marlin_make_workspace_new, marlin_permute_bias, marlin_permute_scales,
+    marlin_sort_g_idx, marlin_zero_points, query_marlin_supported_quant_types,
+    unpack_cols)
 from vllm.model_executor.parameter import (BasevLLMParameter,
                                            permute_param_layout_)
 from vllm.platforms import current_platform
@@ -111,6 +112,9 @@ class MarlinLinearKernel(MPLinearKernel):
         self._transform_param(layer, self.w_q_name, transform_w_q)
         self._transform_param(layer, self.w_s_name, transform_w_s)
 
+        if hasattr(layer, "bias") and layer.bias is not None:
+            layer.bias.data = marlin_permute_bias(layer.bias)
+
     def apply_weights(self,
                       layer: torch.nn.Module,
                       x: torch.Tensor,


@@ -1330,6 +1330,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
             x,
             layer.w13_weight,
             layer.w2_weight,
+            None,
+            None,
             layer.w13_weight_scale,
             layer.w2_weight_scale,
             router_logits,


@@ -15,13 +15,17 @@ from vllm.model_executor.layers.linear import (LinearBase,
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
+    prepare_moe_fp4_layer_for_marlin)
 from vllm.model_executor.layers.quantization.utils.mxfp4_utils import (
     _can_support_mxfp4, _swizzle_mxfp4)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     is_layer_skipped)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
-from vllm.utils import next_power_of_2, round_up
+from vllm.scalar_type import scalar_types
+from vllm.utils import (has_triton_kernels, is_torch_equal_or_newer,
+                        next_power_of_2, round_up)
 
 if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
         or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16):
@@ -81,6 +85,21 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
         super().__init__()
         self.topk_indices_dtype = None
         self.moe = moe
+        self.use_marlin = self._should_use_marlin()
+
+    def _should_use_marlin(self):
+        if envs.VLLM_MXFP4_USE_MARLIN is not None:
+            return envs.VLLM_MXFP4_USE_MARLIN
+        if current_platform.is_cuda() and \
+                not current_platform.has_device_capability(100):
+            if not current_platform.is_device_capability(90):
+                # the marlin kernel has better performance on Ampere
+                return True
+            if not has_triton_kernels():
+                return True
+            if not is_torch_equal_or_newer("2.8.0"):
+                return True
+        return False
 
     def create_weights(self, layer: torch.nn.Module, num_experts: int,
                        hidden_size: int, intermediate_size_per_partition: int,
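Editor's note: the heuristic in the hunk above can be overridden per run. A minimal sketch, assuming a vLLM build that contains this PR; the model id is a placeholder for any MXFP4 checkpoint, and the env var must be set before vLLM reads it:

import os

# force the marlin MXFP4 path on; "0" forces it off, unset keeps the heuristic
os.environ["VLLM_MXFP4_USE_MARLIN"] = "1"

from vllm import LLM  # import after setting the env var

llm = LLM(model="your-org/your-mxfp4-model")  # placeholder MXFP4 checkpoint
print(llm.generate("Hello")[0].outputs[0].text)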
@@ -101,11 +120,29 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
         intermediate_size_per_partition_after_pad = \
             intermediate_size_per_partition
 
-        # pad the intermediate size to be a multiple of 2 * mxfp4_block
-        # for to hold non-uniform sharded tensor as well as swizzling
-        # other padding to increase performance
-        if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
-                or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16):
+        if self.use_marlin:
+            # The moe marlin kernel requires that for each linear
+            # n % 256 == 0 and k % 128 == 0.
+            # In gate_up_proj:
+            #   n = 2 * intermediate_size_per_partition_after_pad
+            #   k = hidden_size
+            # In down_proj:
+            #   n = hidden_size
+            #   k = intermediate_size_per_partition_after_pad
+            intermediate_size_per_partition_after_pad = round_up(
+                intermediate_size_per_partition, 128)
+            hidden_size = round_up(hidden_size, 256)
+
+            layer.params_dtype = params_dtype
+            layer.num_experts = num_experts
+            layer.hidden_size = hidden_size
+            layer.intermediate_size_per_partition = \
+                intermediate_size_per_partition_after_pad
+        elif (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
+              or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16):
+            # pad the intermediate size to a multiple of 2 * mxfp4_block,
+            # both to hold non-uniformly sharded tensors and for swizzling;
+            # the extra padding also improves performance
             intermediate_size_per_partition_after_pad = round_up(
                 intermediate_size_per_partition, 256)
             hidden_size = round_up(hidden_size, 256)
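Editor's note: to make the padding rule in the comment above concrete, a small worked sketch in plain Python (the shard sizes are made-up, and round_up mirrors the contract of vllm.utils.round_up used above):

def round_up(x: int, m: int) -> int:
    # smallest multiple of m that is >= x
    return ((x + m - 1) // m) * m

intermediate_size_per_partition = 1440   # hypothetical shard size
hidden_size = 2880                       # hypothetical model width

i_pad = round_up(intermediate_size_per_partition, 128)  # -> 1536
h_pad = round_up(hidden_size, 256)                      # -> 3072

# gate_up_proj: n = 2 * i_pad (a multiple of 256, since 2 * 128 = 256), k = h_pad
# down_proj:    n = h_pad (a multiple of 256),             k = i_pad
assert (2 * i_pad) % 256 == 0 and h_pad % 256 == 0
assert h_pad % 128 == 0 and i_pad % 128 == 0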
@@ -191,8 +228,10 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
         set_weight_attrs(w2_bias, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer):
-        if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
-                or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16):
+        if self.use_marlin:
+            prepare_moe_fp4_layer_for_marlin(layer)
+        elif (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
+              or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16):
             layer.gemm1_alpha = Parameter(torch.tensor(
                 [1.702] * self.num_experts, dtype=torch.float32).cuda(),
                 requires_grad=False)
@@ -399,13 +438,45 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
         if enable_eplb:
             raise NotImplementedError("EPLB is not supported for mxfp4")
 
+        if self.use_marlin:
+            topk_weights, topk_ids = FusedMoE.select_experts(
+                hidden_states=x,
+                router_logits=router_logits,
+                use_grouped_topk=use_grouped_topk,
+                top_k=top_k,
+                renormalize=renormalize,
+                topk_group=topk_group,
+                num_expert_group=num_expert_group,
+                custom_routing_function=custom_routing_function,
+                scoring_func=scoring_func,
+                e_score_correction_bias=e_score_correction_bias)
+
+            return torch.ops.vllm.fused_marlin_moe(
+                x,
+                layer.w13_weight,
+                layer.w2_weight,
+                layer.w13_bias,
+                layer.w2_bias,
+                layer.w13_weight_scale,
+                layer.w2_weight_scale,
+                router_logits,
+                topk_weights,
+                topk_ids,
+                global_scale1=None,
+                global_scale2=None,
+                quant_type_id=scalar_types.float4_e2m1f.id,
+                apply_router_weight_on_input=apply_router_weight_on_input,
+                global_num_experts=global_num_experts,
+                activation=activation,
+                expert_map=expert_map)
+
         assert _can_support_mxfp4(
             use_grouped_topk, topk_group, num_expert_group, expert_map,
             custom_routing_function, e_score_correction_bias,
             apply_router_weight_on_input, scoring_func, activation,
             expert_load_view, logical_to_physical_map,
-            logical_replica_count), ("MXFP4 are not supported\
-                with this configuration.")
+            logical_replica_count), (
+                "MXFP4 is not supported with this configuration.")
 
         if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
                 or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16):


@@ -261,6 +261,13 @@ def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
     return s
 
 
+def marlin_permute_bias(s: torch.Tensor) -> torch.Tensor:
+    origin_shape = s.shape
+    _, scale_perm_single = get_scale_perms()
+    s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single]
+    return s.reshape(*origin_shape).contiguous()
+
+
 def marlin_moe_permute_scales(
     s: torch.Tensor,
     size_k: int,
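Editor's note: a quick property check of the new helper, as a sketch. It assumes a vLLM build with this patch and only exercises what the five lines above show, namely that the bias is reindexed by the single-group scale permutation with its shape preserved (512 is an arbitrary multiple of the permutation width):

import torch
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
    marlin_permute_bias)

bias = torch.arange(512, dtype=torch.float32)
permuted = marlin_permute_bias(bias)

assert permuted.shape == bias.shape                # shape is preserved
assert torch.equal(permuted.sort().values, bias)   # a pure permutation of values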
@@ -410,6 +417,7 @@ def apply_gptq_marlin_linear(
     output = ops.gptq_marlin_gemm(reshaped_x,
                                   None,
                                   weight,
+                                  bias,
                                   weight_scale,
                                   None,
                                   weight_zp,
@@ -425,9 +433,6 @@ def apply_gptq_marlin_linear(
                                   use_fp32_reduce=use_fp32_reduce,
                                   is_zp_float=False)
 
-    if bias is not None:
-        output.add_(bias)  # In-place add
-
     return output.reshape(out_shape)
@@ -456,6 +461,7 @@ def apply_awq_marlin_linear(
     output = ops.gptq_marlin_gemm(reshaped_x,
                                   None,
                                   weight,
+                                  bias,
                                   weight_scale,
                                   None,
                                   weight_zp,
@@ -470,7 +476,4 @@ def apply_awq_marlin_linear(
                                   use_fp32_reduce=use_fp32_reduce,
                                   is_zp_float=False)
 
-    if bias is not None:
-        output.add_(bias)  # In-place add
-
     return output.reshape(out_shape)


@@ -8,8 +8,8 @@ import torch
 import vllm._custom_ops as ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    USE_FP32_REDUCE_DEFAULT, marlin_make_workspace_new, marlin_permute_scales,
-    should_use_atomic_add_reduce)
+    USE_FP32_REDUCE_DEFAULT, marlin_make_workspace_new, marlin_permute_bias,
+    marlin_permute_scales, should_use_atomic_add_reduce)
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
@@ -22,7 +22,7 @@ def is_fp4_marlin_supported():
     return current_platform.has_device_capability(80)
 
 
-def fp4_marlin_process_scales(marlin_scales):
+def nvfp4_marlin_process_scales(marlin_scales):
     if not (marlin_scales >= 0).all():
         logger.warning_once(
             "NVFP4 Marlin assumes the scales to be >=0, but has encountered "
@@ -56,7 +56,20 @@ def fp4_marlin_process_scales(marlin_scales):
     return marlin_scales
 
 
-def fp4_marlin_process_global_scale(global_scale):
+def mxfp4_marlin_process_scales(marlin_scales):
+    # 8 is the number of scales handled by one thread
+    marlin_scales = marlin_scales.view(marlin_scales.size(0) // 2, 2, -1, 8)
+    marlin_scales = marlin_scales.permute(0, 2, 1, 3).reshape(
+        marlin_scales.size(0) * 2, -1)
+
+    # fit the layout of fp8 dequantization
+    marlin_scales = marlin_scales.view(-1, 4)[:, [0, 2, 1, 3]].view(
+        marlin_scales.size(0), -1)
+
+    marlin_scales = marlin_scales.to(torch.float8_e8m0fnu)
+    return marlin_scales
+
+
+def nvfp4_marlin_process_global_scale(global_scale):
     assert global_scale.dtype in [torch.half, torch.bfloat16]
     fp4_exponent = 2
     if global_scale.dtype == torch.half:
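Editor's note: the index gymnastics in mxfp4_marlin_process_scales are easier to see on a toy tensor. A self-contained sketch in plain torch (the final e8m0 cast is left out so it runs anywhere; the sizes are illustrative only):

import torch

s = torch.arange(4 * 16).view(4, 16)  # stand-in for marlin-permuted scales

# pair adjacent rows and interleave their 8-wide chunks; note the reshape
# uses the viewed size(0), so the overall row count is unchanged
t = s.view(s.size(0) // 2, 2, -1, 8)               # (2, 2, 2, 8)
t = t.permute(0, 2, 1, 3).reshape(s.size(0), -1)   # back to (4, 16)

# new row 0 holds the first 8 scales of old rows 0 and 1, side by side
assert torch.equal(t[0], torch.cat([s[0, :8], s[1, :8]]))

# then swap the middle pair inside every run of 4 values ([0, 2, 1, 3]),
# matching the fp8 dequantization layout mentioned above
u = t.reshape(-1, 4)[:, [0, 2, 1, 3]].view(t.size(0), -1)
assert u[0, :4].tolist() == [0, 2, 1, 3]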
@@ -73,7 +86,7 @@ def apply_fp4_marlin_linear(
         input: torch.Tensor,
         weight: torch.Tensor,
         weight_scale: torch.Tensor,
-        weight_scale_2: torch.Tensor,
+        weight_scale_2: Optional[torch.Tensor],
         workspace: torch.Tensor,
         size_n: int,
         size_k: int,
@@ -94,6 +107,7 @@ def apply_fp4_marlin_linear(
     output = ops.gptq_marlin_gemm(a=reshaped_x,
                                   c=None,
                                   b_q_weight=weight,
+                                  b_bias=bias,
                                   b_scales=weight_scale,
                                   global_scale=weight_scale_2,
                                   b_zeros=None,
@@ -107,9 +121,6 @@ def apply_fp4_marlin_linear(
                                   use_atomic_add=use_atomic_add,
                                   use_fp32_reduce=use_fp32_reduce)
 
-    if bias is not None:
-        output.add_(bias)  # In-place add
-
     return output.reshape(out_shape)
@@ -120,6 +131,9 @@ def prepare_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
                 "be used leveraging the Marlin kernel. This may degrade "
                 "performance for compute-heavy workloads.")
 
+    is_nvfp4 = hasattr(layer, "weight_scale_2")
+    group_size = 16 if is_nvfp4 else 32
+
     part_size_n = layer.output_size_per_partition
     part_size_k = layer.input_size_per_partition
     param_dtype = layer.params_dtype
@@ -145,18 +159,35 @@ def prepare_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
     # WEIGHT SCALES
     # Permute scales
-    weight_scale = layer.weight_scale.T.to(param_dtype)
+    weight_scale = layer.weight_scale.T.contiguous()
+    if not is_nvfp4:
+        weight_scale = weight_scale.view(torch.float8_e8m0fnu)
+    weight_scale = weight_scale.to(param_dtype)
+
     weight_scale = marlin_permute_scales(s=weight_scale,
                                          size_k=part_size_k,
                                          size_n=part_size_n,
-                                         group_size=16)
-    weight_scale = fp4_marlin_process_scales(weight_scale)
-    layer.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)
+                                         group_size=group_size)
 
-    weight_scale_2 = layer.weight_scale_2.to(param_dtype)
-    weight_scale_2 = fp4_marlin_process_global_scale(weight_scale_2)
-    layer.weight_scale_2 = torch.nn.Parameter(weight_scale_2,
-                                              requires_grad=False)
+    if is_nvfp4:
+        weight_scale = nvfp4_marlin_process_scales(weight_scale)
+        layer.weight_scale = torch.nn.Parameter(weight_scale,
+                                                requires_grad=False)
+
+        weight_scale_2 = layer.weight_scale_2.to(param_dtype)
+        weight_scale_2 = nvfp4_marlin_process_global_scale(weight_scale_2)
+        layer.weight_scale_2 = torch.nn.Parameter(weight_scale_2,
+                                                  requires_grad=False)
+    else:
+        weight_scale = mxfp4_marlin_process_scales(weight_scale)
+        layer.weight_scale = torch.nn.Parameter(weight_scale,
+                                                requires_grad=False)
+
+    if hasattr(layer, "bias") and layer.bias is not None:
+        assert layer.bias.shape == (part_size_n, )
+        bias = marlin_permute_bias(layer.bias)
+        layer.bias = torch.nn.Parameter(bias, requires_grad=False)
 
     return
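Editor's note: the .view(torch.float8_e8m0fnu) above reinterprets each raw scale byte as an exponent-only value. A minimal decoding sketch, assuming a PyTorch recent enough to define the e8m0 dtype and its float conversion (an e8m0 byte b represents 2**(b - 127); there are no sign or mantissa bits):

import torch

raw = torch.tensor([127, 128, 126, 121], dtype=torch.uint8)
scales = raw.view(torch.float8_e8m0fnu).to(torch.float32)

# 2**0, 2**1, 2**-1, 2**-6
assert scales.tolist() == [1.0, 2.0, 0.5, 0.015625]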
@@ -168,6 +199,9 @@ def prepare_moe_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
                 "be used leveraging the Marlin kernel. This may degrade "
                 "performance for compute-heavy workloads.")
 
+    is_nvfp4 = hasattr(layer, "w13_weight_scale_2")
+    group_size = 16 if is_nvfp4 else 32
+
     e = layer.num_experts
     k = layer.hidden_size
     n = layer.intermediate_size_per_partition
@@ -208,8 +242,13 @@ def prepare_moe_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
     # WEIGHT SCALES
     # Permute scales
     for name in ["w13", "w2"]:
-        scales = getattr(layer, name + "_weight_scale").to(param_dtype)
-        global_scale = getattr(layer, name + "_weight_scale_2").to(param_dtype)
+        scales = getattr(layer, name + "_weight_scale")
+        if not is_nvfp4:
+            scales = scales.view(torch.float8_e8m0fnu)
+        scales = scales.to(param_dtype)
+        if is_nvfp4:
+            global_scale = getattr(layer,
+                                   name + "_weight_scale_2").to(param_dtype)
 
         tensor_list = []
         if "w13" in name:
@@ -218,23 +257,47 @@ def prepare_moe_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
             size_n, size_k = k, n
 
         for i in range(e):
-            marlin_scales = marlin_permute_scales(s=scales[i].T,
+            scale = scales[i].T
+            marlin_scales = marlin_permute_scales(s=scale,
                                                   size_k=size_k,
                                                   size_n=size_n,
-                                                  group_size=16)
-            marlin_scales = fp4_marlin_process_scales(marlin_scales)
+                                                  group_size=group_size)
+            if is_nvfp4:
+                marlin_scales = nvfp4_marlin_process_scales(marlin_scales)
+            else:
+                marlin_scales = mxfp4_marlin_process_scales(marlin_scales)
             tensor_list.append(marlin_scales)
 
         scales = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
         scales = torch.nn.Parameter(scales, requires_grad=False)
         setattr(layer, name + "_weight_scale", scales)
 
-        global_scale = fp4_marlin_process_global_scale(global_scale)
-        global_scale = torch.nn.Parameter(global_scale, requires_grad=False)
-        setattr(layer, name + "_weight_scale_2", global_scale)
+        if is_nvfp4:
+            global_scale = nvfp4_marlin_process_global_scale(global_scale)
+            global_scale = torch.nn.Parameter(global_scale,
+                                              requires_grad=False)
+            setattr(layer, name + "_weight_scale_2", global_scale)
+
+    # BIAS
+    # Permute bias
+    for name in ["w13_bias", "w2_bias"]:
+        if not hasattr(layer, name):
+            continue
+        bias = getattr(layer, name).to(param_dtype)
+
+        tensor_list = []
+        for i in range(e):
+            expert_bias = bias[i]
+            tensor_list.append(marlin_permute_bias(expert_bias))
+
+        bias = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
+        bias = torch.nn.Parameter(bias, requires_grad=False)
+        setattr(layer, name, bias)
 
 
-def rand_marlin_weight_fp4_like(weight, group_size):
+def rand_marlin_weight_nvfp4_like(weight, group_size):
     assert group_size > 0
     size_n, size_k = weight.shape
     device = weight.device
@@ -276,8 +339,58 @@ def rand_marlin_weight_fp4_like(weight, group_size):
                                           size_k=size_k,
                                           size_n=size_n,
                                           group_size=group_size)
-    marlin_scales = fp4_marlin_process_scales(marlin_scales)
+    marlin_scales = nvfp4_marlin_process_scales(marlin_scales)
 
-    global_scale = fp4_marlin_process_global_scale(global_scale)
+    global_scale = nvfp4_marlin_process_global_scale(global_scale)
 
     return weight_ref.T, marlin_qweight, marlin_scales, global_scale
 
 
+def rand_marlin_weight_mxfp4_like(weight, group_size):
+    assert group_size > 0
+    size_n, size_k = weight.shape
+    device = weight.device
+
+    scales = torch.randint(100,
+                           125, (size_n, size_k // group_size),
+                           dtype=torch.uint8,
+                           device=weight.device)
+    scales = scales.view(torch.float8_e8m0fnu)
+
+    fp4_weight = torch.randint(0,
+                               256, (size_n, size_k // 2),
+                               dtype=torch.uint8,
+                               device=weight.device)
+
+    fp4_weight_part_1 = ((fp4_weight & 0b10000000) |
+                         ((fp4_weight & 0b01110000) >> 2))
+    fp4_weight_part_1 = fp4_weight_part_1.view(torch.float8_e4m3fn)
+    fp4_weight_part_1 = fp4_weight_part_1.to(weight.dtype) * (2**6)
+
+    fp4_weight2 = fp4_weight << 4
+    fp4_weight_part_2 = ((fp4_weight2 & 0b10000000) |
+                         ((fp4_weight2 & 0b01110000) >> 2))
+    fp4_weight_part_2 = fp4_weight_part_2.view(torch.float8_e4m3fn)
+    fp4_weight_part_2 = fp4_weight_part_2.to(weight.dtype) * (2**6)
+
+    weight_ref = torch.cat(
+        [fp4_weight_part_2.unsqueeze(2),
+         fp4_weight_part_1.unsqueeze(2)], 2).view(size_n, size_k)
+    weight_ref = weight_ref * \
+        scales.repeat_interleave(group_size, 1).to(weight.dtype)
+
+    marlin_qweight = ops.gptq_marlin_repack(
+        b_q_weight=fp4_weight.view(torch.int32).T.contiguous(),
+        perm=torch.empty(0, dtype=torch.int, device=device),
+        size_k=size_k,
+        size_n=size_n,
+        num_bits=4,
+    )
+
+    marlin_scales = marlin_permute_scales(s=scales.T.to(weight.dtype),
+                                          size_k=size_k,
+                                          size_n=size_n,
+                                          group_size=group_size)
+    marlin_scales = mxfp4_marlin_process_scales(marlin_scales)
+
+    return weight_ref.T, marlin_qweight, marlin_scales.to(torch.float8_e8m0fnu)
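Editor's note on the bit trick in the helper above: an e2m1 nibble `s e e m` is re-homed into an e4m3 byte as `s 0 0 e e m 0 0`, which decodes to (1 + m/2) * 2**(e - 7); multiplying by 2**6 then recovers the true fp4 value (1 + m/2) * 2**(e - 1), and the subnormal e = 0 case lands on m/2 the same way. A standalone check of that identity (pure torch; only torch.float8_e4m3fn is assumed):

import torch

def fp4_to_float(nibble: int) -> float:
    # same remapping as above, applied to a single e2m1 nibble:
    # the sign bit moves to bit 7, the three e/m bits to bits 4..2
    byte = ((nibble & 0b1000) << 4) | ((nibble & 0b0111) << 2)
    as_e4m3 = torch.tensor([byte], dtype=torch.uint8).view(torch.float8_e4m3fn)
    return float(as_e4m3.to(torch.float32)) * 2**6

# e2m1 represents +/- {0, 0.5, 1, 1.5, 2, 3, 4, 6}
expected = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]
assert [fp4_to_float(n) for n in range(8)] == expected
assert [fp4_to_float(n | 0b1000) for n in range(8)] == [-v for v in expected]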


@@ -8,8 +8,8 @@ import torch
 import vllm._custom_ops as ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    USE_FP32_REDUCE_DEFAULT, marlin_make_workspace_new, marlin_permute_scales,
-    should_use_atomic_add_reduce)
+    USE_FP32_REDUCE_DEFAULT, marlin_make_workspace_new, marlin_permute_bias,
+    marlin_permute_scales, should_use_atomic_add_reduce)
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
@@ -58,6 +58,7 @@ def apply_fp8_marlin_linear(
     output = ops.gptq_marlin_gemm(a=reshaped_x,
                                   c=None,
                                   b_q_weight=weight,
+                                  b_bias=bias,
                                   b_scales=weight_scale,
                                   global_scale=None,
                                   b_zeros=None,
@@ -71,9 +72,6 @@ def apply_fp8_marlin_linear(
                                   use_atomic_add=use_atomic_add,
                                   use_fp32_reduce=use_fp32_reduce)
 
-    if bias is not None:
-        output.add_(bias)  # In-place add
-
     return output.reshape(out_shape)
@@ -160,6 +158,11 @@ def prepare_fp8_layer_for_marlin(layer: torch.nn.Module,
     marlin_scales = fp8_fused_exponent_bias_into_scales(marlin_scales)
     layer.weight_scale = torch.nn.Parameter(marlin_scales, requires_grad=False)
 
+    if hasattr(layer, "bias") and layer.bias is not None:
+        assert layer.bias.shape == (part_size_n, )
+        bias = marlin_permute_bias(layer.bias)
+        layer.bias = torch.nn.Parameter(bias, requires_grad=False)
+
 
 def prepare_moe_fp8_layer_for_marlin(layer: torch.nn.Module,
                                      size_k_first: bool = True) -> None:
@@ -274,6 +277,23 @@ def prepare_moe_fp8_layer_for_marlin(layer: torch.nn.Module,
         setattr(layer, name + "_weight_scale", scales)
 
+    # BIAS
+    # Permute bias
+    for name in ["w13_bias", "w2_bias"]:
+        if not hasattr(layer, name):
+            continue
+        bias = getattr(layer, name).to(layer.orig_dtype)
+
+        tensor_list = []
+        for i in range(e):
+            expert_bias = bias[i]
+            tensor_list.append(marlin_permute_bias(expert_bias))
+
+        bias = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
+        bias = torch.nn.Parameter(bias, requires_grad=False)
+        setattr(layer, name, bias)
+
 
 def pack_fp8_to_int32(fp8_tensor: torch.Tensor,
                       size_k_first: bool = True) -> torch.Tensor:


@@ -61,7 +61,7 @@ def _can_support_mxfp4(use_grouped_topk: bool = False,
                        e_score_correction_bias: Optional[torch.Tensor] = None,
                        apply_router_weight_on_input: bool = False,
                        scoring_func: str = "softmax",
-                       activation: str = "silu",
+                       activation: str = "swiglu_oai",
                        expert_load_view: Optional[torch.Tensor] = None,
                        logical_to_physical_map: Optional[torch.Tensor] = None,
                        logical_replica_count: Optional[torch.Tensor] = None):