[Feature] Support sequence parallelism for static fp8 quantization (#19181)

Signed-off-by: cascade812 <cascade812@outlook.com>
2025-06-23 13:09:02 -07:00
parent d0132f025d
commit e6327c9b3e
7 changed files with 531 additions and 195 deletions
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -3802,11 +3802,11 @@ class PassConfig:
    its own stages (before, after, maybe in-between)."""
    dump_graph_dir: Path = Path(".")
    """Directory to dump the graphs."""
-    enable_fusion: bool = True
+    enable_fusion: bool = field(default_factory=lambda: not envs.VLLM_USE_V1)
    """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass."""
    enable_attn_fusion: bool = False
    """Whether to enable the custom attention+quant fusion pass."""
-    enable_noop: bool = True
+    enable_noop: bool = field(default_factory=lambda: not envs.VLLM_USE_V1)
    """Whether to enable the custom no-op elimination pass."""
    enable_sequence_parallelism: bool = False
    """Whether to enable sequence parallelism."""
@@ -4451,8 +4451,6 @@ class VllmConfig:
            # By default, V1 uses piecewise CUDA graphs. If full_cuda_graph
            # is set to True, full CUDA graphs will be used.
            self.compilation_config.cudagraph_num_of_warmups = 1
-            self.compilation_config.pass_config.enable_fusion = False
-            self.compilation_config.pass_config.enable_noop = False
            self.compilation_config.level = CompilationLevel.PIECEWISE
            self.compilation_config.set_splitting_ops_for_v1()