[Release] Fix format and cherry-pick (#33618)

Signed-off-by: zhewenli <zhewen@inferact.ai> Co-authored-by: zhewenli <zhewen@inferact.ai>
2026-02-02 16:19:05 -08:00
parent 57eae2f891
commit 31a64c63a8
2 changed files with 1 additions and 68 deletions
--- a/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py
+++ b/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py
@@ -1,67 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from dataclasses import dataclass
-
-import vllm.envs as envs
-from vllm.logger import init_logger
-from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import (
-    is_flashinfer_fp4_cutedsl_moe_available,
-    is_flashinfer_fp4_cutlass_moe_available,
-)
-from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
-    is_fp4_marlin_supported,
-)
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    cutlass_fp4_supported,
-)
-
-__all__ = ["detect_nvfp4_moe_support", "NvFp4Support"]
-
-_logger = init_logger(__name__)
-
-
-@dataclass(frozen=True)
-class NvFp4Support:
-    """Result container for NV-FP4 capability probing."""
-
-    cutlass_supported: bool
-    allow_flashinfer: bool
-    use_marlin: bool
-
-
-def detect_nvfp4_moe_support(class_name: str = "") -> NvFp4Support:
-    """Detect platform support for NV-FP4 fused-MoE path"""
-    cutlass_supported = cutlass_fp4_supported()
-
-    allow_flashinfer = cutlass_supported and (
-        is_flashinfer_fp4_cutlass_moe_available()
-        or is_flashinfer_fp4_cutedsl_moe_available()
-    )
-
-    if allow_flashinfer:
-        _logger.info_once(
-            "Using FlashInfer kernels for %s.", class_name or "NVFP4 path"
-        )
-    else:
-        if envs.VLLM_USE_FLASHINFER_MOE_FP4:
-            _logger.warning_once(
-                "FlashInfer kernels unavailable for %s on current platform.",
-                class_name or "NVFP4 path",
-            )
-
-    use_marlin = False
-    if not cutlass_supported:
-        if is_fp4_marlin_supported():
-            use_marlin = True
-            _logger.info_once("Falling back to Marlin FP4 MoE kernel.")
-        else:
-            raise ValueError(
-                "Current platform does not support NVFP4 quantization. "
-                "Please use Blackwell GPUs or enable FlashInfer."
-            )
-
-    return NvFp4Support(
-        cutlass_supported=cutlass_supported,
-        allow_flashinfer=allow_flashinfer,
-        use_marlin=use_marlin,
-    )
--- a/vllm/model_executor/models/step3p5.py
+++ b/vllm/model_executor/models/step3p5.py
@@ -9,6 +9,7 @@ import torch
 from torch import nn
 from torch.nn.parameter import Parameter

+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import (
@@ -21,7 +22,6 @@ from vllm.distributed import (
 )
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul, SwigluStepAndMul
-from vllm.attention.layer import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import GemmaRMSNorm