[Feat] Support non-gated MoE with Marlin, NVFP4 CUTLASS, FP8, INT8, compressed-tensors (#32257)

Signed-off-by: Tomer Natan <tbarnatan@computelab-frontend-8.nvidia.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Tomer Natan <tbarnatan@computelab-frontend-8.nvidia.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Tomer Natan <tbarnatan@ipp1-1429.ipp1a1.colossus.nvidia.com>
2026-01-16 02:15:05 +02:00
parent aca5c51487
commit c277fbdf31
17 changed files with 226 additions and 127 deletions
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -1451,6 +1451,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # - "flashinfer-cudnn": use flashinfer cudnn GEMM backend
    # - "flashinfer-trtllm": use flashinfer trtllm GEMM backend
    # - "flashinfer-cutlass": use flashinfer cutlass GEMM backend
+    # - "marlin": use marlin GEMM backend (for GPUs without native FP4 support)
    # - <none>: automatically pick an available backend
    "VLLM_NVFP4_GEMM_BACKEND": env_with_choices(
        "VLLM_NVFP4_GEMM_BACKEND",
@@ -1460,6 +1461,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
            "flashinfer-trtllm",
            "flashinfer-cutlass",
            "cutlass",
+            "marlin",
        ],
    ),
    # Controls garbage collection during CUDA graph capture.