[Feature] Support sequence parallelism for static fp8 quantization (#19181)

Signed-off-by: cascade812 <cascade812@outlook.com>
This commit is contained in:
cascade
2025-06-23 13:09:02 -07:00
committed by GitHub
parent d0132f025d
commit e6327c9b3e
7 changed files with 531 additions and 195 deletions

View File

@@ -3802,11 +3802,11 @@ class PassConfig:
its own stages (before, after, maybe in-between)."""
dump_graph_dir: Path = Path(".")
"""Directory to dump the graphs."""
- enable_fusion: bool = True
+ enable_fusion: bool = field(default_factory=lambda: not envs.VLLM_USE_V1)
"""Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass."""
enable_attn_fusion: bool = False
"""Whether to enable the custom attention+quant fusion pass."""
- enable_noop: bool = True
+ enable_noop: bool = field(default_factory=lambda: not envs.VLLM_USE_V1)
"""Whether to enable the custom no-op elimination pass."""
enable_sequence_parallelism: bool = False
"""Whether to enable sequence parallelism."""
@@ -4451,8 +4451,6 @@ class VllmConfig:
# By default, V1 uses piecewise CUDA graphs. If full_cuda_graph
# is set to True, full CUDA graphs will be used.
self.compilation_config.cudagraph_num_of_warmups = 1
- self.compilation_config.pass_config.enable_fusion = False
- self.compilation_config.pass_config.enable_noop = False
self.compilation_config.level = CompilationLevel.PIECEWISE
self.compilation_config.set_splitting_ops_for_v1()