[Feature] Support sequence parallelism for static fp8 quantization (#19181)
Signed-off-by: cascade812 <cascade812@outlook.com>
This commit is contained in:
@@ -3802,11 +3802,11 @@ class PassConfig:
|
||||
its own stages (before, after, maybe in-between)."""
|
||||
dump_graph_dir: Path = Path(".")
|
||||
"""Directory to dump the graphs."""
|
||||
enable_fusion: bool = True
|
||||
enable_fusion: bool = field(default_factory=lambda: not envs.VLLM_USE_V1)
|
||||
"""Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass."""
|
||||
enable_attn_fusion: bool = False
|
||||
"""Whether to enable the custom attention+quant fusion pass."""
|
||||
enable_noop: bool = True
|
||||
enable_noop: bool = field(default_factory=lambda: not envs.VLLM_USE_V1)
|
||||
"""Whether to enable the custom no-op elimination pass."""
|
||||
enable_sequence_parallelism: bool = False
|
||||
"""Whether to enable sequence parallelism."""
|
||||
@@ -4451,8 +4451,6 @@ class VllmConfig:
|
||||
# By default, V1 uses piecewise CUDA graphs. If full_cuda_graph
|
||||
# is set to True, full CUDA graphs will be used.
|
||||
self.compilation_config.cudagraph_num_of_warmups = 1
|
||||
self.compilation_config.pass_config.enable_fusion = False
|
||||
self.compilation_config.pass_config.enable_noop = False
|
||||
self.compilation_config.level = CompilationLevel.PIECEWISE
|
||||
self.compilation_config.set_splitting_ops_for_v1()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user