[CI][torch.compile] Reduce e2e fusion test time (#33293)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: ProExpertProg <luka.govedic@gmail.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
This commit is contained in:
@@ -604,9 +604,11 @@ steps:
|
|||||||
- tests/compile
|
- tests/compile
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
||||||
# Limit to no custom ops to reduce running time
|
# # Limit to no custom ops to reduce running time
|
||||||
# Wrap with quotes to escape yaml and avoid starting -k string with a -
|
# # Wrap with quotes to escape yaml and avoid starting -k string with a -
|
||||||
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
# - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
||||||
|
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
|
||||||
|
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
|
||||||
|
|
||||||
- label: Cudagraph test
|
- label: Cudagraph test
|
||||||
timeout_in_minutes: 20
|
timeout_in_minutes: 20
|
||||||
@@ -1181,7 +1183,6 @@ steps:
|
|||||||
- tests/compile/test_fusion_attn.py
|
- tests/compile/test_fusion_attn.py
|
||||||
- tests/compile/test_silu_mul_quant_fusion.py
|
- tests/compile/test_silu_mul_quant_fusion.py
|
||||||
- tests/compile/distributed/test_fusion_all_reduce.py
|
- tests/compile/distributed/test_fusion_all_reduce.py
|
||||||
- tests/compile/distributed/test_fusions_e2e.py
|
|
||||||
- tests/compile/fullgraph/test_full_graph.py
|
- tests/compile/fullgraph/test_full_graph.py
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
@@ -1189,33 +1190,16 @@ steps:
|
|||||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||||
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
|
||||||
# Wrap with quotes to escape yaml
|
# # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||||
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
# # Wrap with quotes to escape yaml
|
||||||
|
# - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
||||||
|
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
|
||||||
|
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
|
||||||
|
|
||||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||||
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
||||||
|
|
||||||
- label: Blackwell Fusion E2E Tests # 30 min
|
|
||||||
timeout_in_minutes: 40
|
|
||||||
working_dir: "/vllm-workspace/"
|
|
||||||
gpu: b200
|
|
||||||
optional: true
|
|
||||||
num_gpus: 2
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/quantization/fp4/
|
|
||||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
|
||||||
- vllm/compilation/
|
|
||||||
# can affect pattern matching
|
|
||||||
- vllm/model_executor/layers/layernorm.py
|
|
||||||
- vllm/model_executor/layers/activation.py
|
|
||||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
|
||||||
- tests/compile/distributed/test_fusions_e2e.py
|
|
||||||
commands:
|
|
||||||
- nvidia-smi
|
|
||||||
# Run all e2e fusion tests
|
|
||||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
|
||||||
|
|
||||||
- label: Blackwell GPT-OSS Eval
|
- label: Blackwell GPT-OSS Eval
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
@@ -1566,7 +1550,10 @@ steps:
|
|||||||
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
||||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||||
#- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
|
#- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
|
||||||
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
# - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||||
|
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
|
||||||
|
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
|
||||||
|
|
||||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
- HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
|
- HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
|
||||||
|
|||||||
@@ -537,9 +537,11 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
# fp8 kv scales not supported on sm89, tested on Blackwell instead
|
# fp8 kv scales not supported on sm89, tested on Blackwell instead
|
||||||
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
||||||
# Limit to no custom ops to reduce running time
|
# # Limit to no custom ops to reduce running time
|
||||||
# Wrap with quotes to escape yaml and avoid starting -k string with a -
|
# # Wrap with quotes to escape yaml and avoid starting -k string with a -
|
||||||
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
# - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
||||||
|
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
|
||||||
|
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
|
||||||
|
|
||||||
- label: Cudagraph test
|
- label: Cudagraph test
|
||||||
timeout_in_minutes: 20
|
timeout_in_minutes: 20
|
||||||
@@ -1069,7 +1071,6 @@ steps:
|
|||||||
- tests/compile/test_fusion_attn.py
|
- tests/compile/test_fusion_attn.py
|
||||||
- tests/compile/test_silu_mul_quant_fusion.py
|
- tests/compile/test_silu_mul_quant_fusion.py
|
||||||
- tests/compile/distributed/test_fusion_all_reduce.py
|
- tests/compile/distributed/test_fusion_all_reduce.py
|
||||||
- tests/compile/distributed/test_fusions_e2e.py
|
|
||||||
- tests/compile/fullgraph/test_full_graph.py
|
- tests/compile/fullgraph/test_full_graph.py
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
@@ -1077,75 +1078,15 @@ steps:
|
|||||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||||
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
# # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||||
# Wrap with quotes to escape yaml
|
# # Wrap with quotes to escape yaml
|
||||||
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
# - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
||||||
|
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
|
||||||
|
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
|
||||||
|
|
||||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||||
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
||||||
|
|
||||||
- label: Blackwell Fusion E2E Tests # 30 min
|
|
||||||
timeout_in_minutes: 40
|
|
||||||
working_dir: "/vllm-workspace/"
|
|
||||||
gpu: b200
|
|
||||||
optional: true
|
|
||||||
num_gpus: 2
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/quantization/fp4/
|
|
||||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
|
||||||
- vllm/compilation/
|
|
||||||
# can affect pattern matching
|
|
||||||
- vllm/model_executor/layers/layernorm.py
|
|
||||||
- vllm/model_executor/layers/activation.py
|
|
||||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
|
||||||
- tests/compile/distributed/test_fusions_e2e.py
|
|
||||||
commands:
|
|
||||||
- nvidia-smi
|
|
||||||
# Run all e2e fusion tests
|
|
||||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
|
||||||
|
|
||||||
- label: Hopper Fusion E2E Tests (H100) # 10min
|
|
||||||
timeout_in_minutes: 70
|
|
||||||
working_dir: "/vllm-workspace/"
|
|
||||||
gpu: h100
|
|
||||||
optional: true
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/quantization/fp4/
|
|
||||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
|
||||||
- vllm/compilation/
|
|
||||||
# can affect pattern matching
|
|
||||||
- vllm/model_executor/layers/layernorm.py
|
|
||||||
- vllm/model_executor/layers/activation.py
|
|
||||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
|
||||||
- tests/compile/test_fusion_attn.py
|
|
||||||
commands:
|
|
||||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
|
||||||
# skip Llama-4 since it does not fit on this device
|
|
||||||
- pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'
|
|
||||||
|
|
||||||
- label: Hopper Fusion Distributed E2E Tests (2xH100) # 70min
|
|
||||||
timeout_in_minutes: 70
|
|
||||||
working_dir: "/vllm-workspace/"
|
|
||||||
gpu: h100
|
|
||||||
optional: true
|
|
||||||
num_gpus: 2
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/quantization/fp4/
|
|
||||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
|
||||||
- vllm/compilation/
|
|
||||||
# can affect pattern matching
|
|
||||||
- vllm/model_executor/layers/layernorm.py
|
|
||||||
- vllm/model_executor/layers/activation.py
|
|
||||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
|
||||||
- tests/compile/distributed/test_fusions_e2e.py
|
|
||||||
commands:
|
|
||||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
|
||||||
# Run all e2e fusion tests
|
|
||||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
|
|
||||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
|
||||||
|
|
||||||
- label: Blackwell GPT-OSS Eval
|
- label: Blackwell GPT-OSS Eval
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
|
|||||||
@@ -2,56 +2,196 @@ group: Compile
|
|||||||
depends_on:
|
depends_on:
|
||||||
- image-build
|
- image-build
|
||||||
steps:
|
steps:
|
||||||
- label: Fusion and Compile Tests (B200)
|
- label: Sequence Parallel Tests (2 GPUs)
|
||||||
|
timeout_in_minutes: 50
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
num_devices: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/model_executor/layers/
|
||||||
|
- vllm/compilation/
|
||||||
|
- vllm/v1/worker/
|
||||||
|
- vllm/v1/cudagraph_dispatcher.py
|
||||||
|
- tests/distributed/test_sequence_parallel.py
|
||||||
|
commands:
|
||||||
|
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||||
|
- pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||||
|
|
||||||
|
- label: Sequence Parallel Tests (2xH100)
|
||||||
|
timeout_in_minutes: 50
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: h100
|
||||||
|
optional: true
|
||||||
|
num_devices: 2
|
||||||
|
commands:
|
||||||
|
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||||
|
- pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||||
|
|
||||||
|
- label: Distributed Compile Unit Tests (2xH100)
|
||||||
timeout_in_minutes: 40
|
timeout_in_minutes: 40
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: h100
|
||||||
|
num_devices: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/compilation/
|
||||||
|
- vllm/model_executor/layers
|
||||||
|
- tests/compile/distributed/test_fusion_all_reduce.py
|
||||||
|
- tests/compile/distributed/test_sequence_parallelism.py
|
||||||
|
- tests/compile/distributed/test_async_tp.py
|
||||||
|
commands:
|
||||||
|
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||||
|
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||||
|
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
||||||
|
- pytest -v -s tests/compile/distributed/test_async_tp.py
|
||||||
|
|
||||||
|
- label: Fusion and Compile Unit Tests (B200)
|
||||||
|
timeout_in_minutes: 20
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
device: b200
|
device: b200
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/quantization/fp4/
|
- csrc/quantization/fp4/
|
||||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
- vllm/model_executor/layers/quantization/
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
|
||||||
- vllm/v1/worker/
|
|
||||||
- vllm/v1/cudagraph_dispatcher.py
|
|
||||||
- vllm/compilation/
|
|
||||||
# can affect pattern matching
|
|
||||||
- vllm/model_executor/layers/layernorm.py
|
- vllm/model_executor/layers/layernorm.py
|
||||||
- vllm/model_executor/layers/activation.py
|
- vllm/model_executor/layers/activation.py
|
||||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
- vllm/model_executor/layers/attention/attention.py
|
||||||
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
|
- vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
|
||||||
- tests/compile/test_fusion_attn.py
|
- tests/compile/test_fusion_attn.py
|
||||||
- tests/compile/test_silu_mul_quant_fusion.py
|
- tests/compile/test_silu_mul_quant_fusion.py
|
||||||
- tests/compile/distributed/test_fusion_all_reduce.py
|
- tests/compile/distributed/test_fusion_all_reduce.py
|
||||||
- tests/compile/distributed/test_fusions_e2e.py
|
|
||||||
- tests/compile/fullgraph/test_full_graph.py
|
- tests/compile/fullgraph/test_full_graph.py
|
||||||
commands:
|
commands:
|
||||||
|
# b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
- pytest -v -s tests/compile/test_fusion_attn.py
|
- pytest -v -s tests/compile/test_fusion_attn.py -k FLASHINFER
|
||||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||||
# this runner has 2 GPUs available even though num_devices=2 is not set
|
# this runner has 2 GPUs available even though num_devices=2 is not set
|
||||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||||
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
|
||||||
# Wrap with quotes to escape yaml
|
|
||||||
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
|
||||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||||
|
# TODO(luka) move to H100 once pass tests run on H100
|
||||||
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
||||||
|
|
||||||
- label: Fusion E2E (2 GPUs)(B200)
|
- label: Fusion E2E Quick (H100)
|
||||||
timeout_in_minutes: 40
|
timeout_in_minutes: 15
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
device: b200
|
device: h100
|
||||||
optional: true
|
num_devices: 1
|
||||||
num_devices: 2
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/quantization/fp4/
|
- csrc/quantization/
|
||||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
- vllm/model_executor/
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
- vllm/v1/attention/
|
||||||
- vllm/compilation/
|
- vllm/compilation/
|
||||||
# can affect pattern matching
|
- tests/compile/fusions_e2e/
|
||||||
- vllm/model_executor/layers/layernorm.py
|
|
||||||
- vllm/model_executor/layers/activation.py
|
|
||||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
|
||||||
- tests/compile/distributed/test_fusions_e2e.py
|
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
# Run all e2e fusion tests
|
# Run all models and attn backends but only Inductor partition and native custom ops
|
||||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||||
|
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
|
||||||
|
|
||||||
|
- label: Fusion E2E Config Sweep (H100)
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: h100
|
||||||
|
num_devices: 1
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/quantization/
|
||||||
|
- vllm/compilation/
|
||||||
|
# can affect pattern matching
|
||||||
|
- vllm/model_executor/layers/layernorm.py
|
||||||
|
- vllm/model_executor/layers/activation.py
|
||||||
|
- vllm/model_executor/layers/attention/attention.py
|
||||||
|
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||||
|
- tests/compile/fusions_e2e/
|
||||||
|
commands:
|
||||||
|
- nvidia-smi
|
||||||
|
# Run just llama3 (fp8) for all config combinations
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"
|
||||||
|
|
||||||
|
- label: Fusion E2E Config Sweep (B200)
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: b200
|
||||||
|
num_devices: 1
|
||||||
|
optional: true
|
||||||
|
commands:
|
||||||
|
- nvidia-smi
|
||||||
|
# Run all models and attn backends but only Inductor partition and native custom ops
|
||||||
|
# -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||||
|
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
||||||
|
# -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
|
||||||
|
# Run just llama3 (fp8 & fp4) for all config combinations
|
||||||
|
# -k "llama-3"
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "(inductor_partition and not +rms_norm and not +quant_fp8) or (inductor_partition and not +rms_norm and +quant_fp8 and qwen3) or llama-3"  # single -k joined with "or" - pytest only honors the last -k option it is given
|
||||||
|
|
||||||
|
- label: Fusion E2E TP2 Quick (H100)
|
||||||
|
timeout_in_minutes: 20
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: h100
|
||||||
|
num_devices: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/quantization/
|
||||||
|
- vllm/model_executor/
|
||||||
|
- vllm/v1/attention/
|
||||||
|
- vllm/compilation/
|
||||||
|
- tests/compile/fusions_e2e/
|
||||||
|
commands:
|
||||||
|
- nvidia-smi
|
||||||
|
# Run all models and attn backends but only Inductor partition and native custom ops
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||||
|
|
||||||
|
- label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
|
||||||
|
timeout_in_minutes: 40
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: h100
|
||||||
|
num_devices: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/quantization/
|
||||||
|
- vllm/compilation/
|
||||||
|
# can affect pattern matching
|
||||||
|
- vllm/model_executor/layers/layernorm.py
|
||||||
|
- vllm/model_executor/layers/activation.py
|
||||||
|
- vllm/model_executor/layers/attention/attention.py
|
||||||
|
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||||
|
- tests/compile/fusions_e2e/
|
||||||
|
commands:
|
||||||
|
- nvidia-smi
|
||||||
|
# Run just llama3 (fp4 & fp8 & bf16) for all config combinations
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"
|
||||||
|
|
||||||
|
- label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
|
||||||
|
timeout_in_minutes: 40
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: h100
|
||||||
|
num_devices: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/quantization/
|
||||||
|
- vllm/compilation/
|
||||||
|
# can affect pattern matching
|
||||||
|
- vllm/model_executor/layers/layernorm.py
|
||||||
|
- vllm/model_executor/layers/activation.py
|
||||||
|
- vllm/model_executor/layers/attention/attention.py
|
||||||
|
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||||
|
- tests/compile/fusions_e2e/
|
||||||
|
commands:
|
||||||
|
- nvidia-smi
|
||||||
|
# Run just llama3 (fp8 & bf16) for all config combinations
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"
|
||||||
|
|
||||||
|
- label: Fusion E2E TP2 (B200)
|
||||||
|
timeout_in_minutes: 20
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: b200
|
||||||
|
num_devices: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/quantization/
|
||||||
|
- vllm/model_executor/
|
||||||
|
- vllm/v1/attention/
|
||||||
|
- vllm/compilation/
|
||||||
|
- tests/compile/fusions_e2e/
|
||||||
|
commands:
|
||||||
|
- nvidia-smi
|
||||||
|
# Run all models and attn backends but only Inductor partition and native custom ops
|
||||||
|
# for ar-rms-quant-fp4, also sweep llama3
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(inductor_partition and not +rms_norm and not +quant_fp8) or Llama-3.1-8B-Instruct-FP4"  # single -k joined with "or" - pytest only honors the last -k option it is given
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ steps:
|
|||||||
- pytest -v -s distributed/test_shm_storage.py
|
- pytest -v -s distributed/test_shm_storage.py
|
||||||
|
|
||||||
- label: Distributed (2 GPUs)
|
- label: Distributed (2 GPUs)
|
||||||
timeout_in_minutes: 90
|
timeout_in_minutes: 60
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_devices: 2
|
num_devices: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@@ -47,7 +47,6 @@ steps:
|
|||||||
- pytest -v -s ./compile/test_wrapper.py
|
- pytest -v -s ./compile/test_wrapper.py
|
||||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
- pytest -v -s distributed/test_sequence_parallel.py
|
|
||||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||||
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
|
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
|
||||||
|
|
||||||
@@ -133,25 +132,13 @@ steps:
|
|||||||
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||||
- pytest -v -s -x lora/test_mixtral.py
|
- pytest -v -s -x lora/test_mixtral.py
|
||||||
|
|
||||||
- label: Sequence Parallel Tests (H100)
|
|
||||||
timeout_in_minutes: 60
|
|
||||||
working_dir: "/vllm-workspace/"
|
|
||||||
device: h100
|
|
||||||
optional: true
|
|
||||||
num_devices: 2
|
|
||||||
commands:
|
|
||||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
|
||||||
# Run sequence parallel tests
|
|
||||||
- pytest -v -s tests/distributed/test_sequence_parallel.py
|
|
||||||
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
|
||||||
|
|
||||||
- label: Distributed Tests (2 GPUs)(H100)
|
- label: Distributed Tests (2 GPUs)(H100)
|
||||||
|
timeout_in_minutes: 15
|
||||||
device: h100
|
device: h100
|
||||||
optional: true
|
optional: true
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
num_devices: 2
|
num_devices: 2
|
||||||
commands:
|
commands:
|
||||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
|
|
||||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||||
@@ -217,45 +204,3 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pytest -v -s distributed/test_pp_cudagraph.py
|
- pytest -v -s distributed/test_pp_cudagraph.py
|
||||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||||
|
|
||||||
- label: Hopper Fusion E2E Tests (H100)
|
|
||||||
timeout_in_minutes: 70
|
|
||||||
working_dir: "/vllm-workspace/"
|
|
||||||
device: h100
|
|
||||||
optional: true
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/quantization/fp4/
|
|
||||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
|
||||||
- vllm/compilation/
|
|
||||||
# can affect pattern matching
|
|
||||||
- vllm/model_executor/layers/layernorm.py
|
|
||||||
- vllm/model_executor/layers/activation.py
|
|
||||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
|
||||||
- tests/compile/test_fusion_attn.py
|
|
||||||
commands:
|
|
||||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
|
||||||
# skip Llama-4 since it does not fit on this device
|
|
||||||
- pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'
|
|
||||||
|
|
||||||
- label: Hopper Fusion Distributed E2E Tests (2xH100)
|
|
||||||
timeout_in_minutes: 70
|
|
||||||
working_dir: "/vllm-workspace/"
|
|
||||||
device: h100
|
|
||||||
optional: true
|
|
||||||
num_devices: 2
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/quantization/fp4/
|
|
||||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
|
||||||
- vllm/compilation/
|
|
||||||
# can affect pattern matching
|
|
||||||
- vllm/model_executor/layers/layernorm.py
|
|
||||||
- vllm/model_executor/layers/activation.py
|
|
||||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
|
||||||
- tests/compile/distributed/test_fusions_e2e.py
|
|
||||||
commands:
|
|
||||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
|
||||||
# Run all e2e fusion tests
|
|
||||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
|
|
||||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ steps:
|
|||||||
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Smoke Test
|
- label: PyTorch Fullgraph Smoke Test
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 35
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
@@ -30,16 +30,13 @@ steps:
|
|||||||
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
|
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
|
||||||
|
|
||||||
- label: PyTorch Fullgraph
|
- label: PyTorch Fullgraph
|
||||||
timeout_in_minutes: 40
|
timeout_in_minutes: 30
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
commands:
|
commands:
|
||||||
# fp8 kv scales not supported on sm89, tested on Blackwell instead
|
# fp8 kv scales not supported on sm89, tested on Blackwell instead
|
||||||
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
||||||
# Limit to no custom ops to reduce running time
|
|
||||||
# Wrap with quotes to escape yaml and avoid starting -k string with a -
|
|
||||||
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
|
||||||
|
|
||||||
- label: Pytorch Nightly Dependency Override Check # 2min
|
- label: Pytorch Nightly Dependency Override Check # 2min
|
||||||
# if this test fails, it means the nightly torch version is not compatible with some
|
# if this test fails, it means the nightly torch version is not compatible with some
|
||||||
|
|||||||
@@ -1,321 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
import regex as re
|
|
||||||
|
|
||||||
from tests.compile.fusion_test_utils import (
|
|
||||||
CUSTOM_OPS_FP8,
|
|
||||||
CUSTOM_OPS_QUANT_RMS_NORM,
|
|
||||||
CUSTOM_OPS_RMS_NORM,
|
|
||||||
MODELS,
|
|
||||||
MODELS_FP4,
|
|
||||||
MODELS_FP8,
|
|
||||||
MODELS_GROUP_FP8,
|
|
||||||
Matches,
|
|
||||||
custom_ops_product,
|
|
||||||
is_blackwell,
|
|
||||||
run_model,
|
|
||||||
)
|
|
||||||
from tests.v1.attention.utils import AttentionBackendEnum
|
|
||||||
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
|
|
||||||
from vllm.platforms import current_platform
|
|
||||||
from vllm.utils.flashinfer import has_flashinfer
|
|
||||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
|
||||||
|
|
||||||
from ...utils import flat_product, multi_gpu_test
|
|
||||||
|
|
||||||
|
|
||||||
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "model_name, model_kwargs, backend, matches, custom_ops",
    # Toggle RMSNorm and QuantFP8 for FP8 models
    list(
        flat_product(
            MODELS_FP8, custom_ops_product(CUSTOM_OPS_FP8, CUSTOM_OPS_RMS_NORM)
        )
    )
    # Toggle RMSNorm for FP4 models and unquant models
    + list(flat_product(MODELS_FP4 + MODELS, CUSTOM_OPS_RMS_NORM)),
)
@pytest.mark.parametrize("inductor_graph_partition", [True, False])
@pytest.mark.skipif(
    not current_platform.is_cuda()
    or not has_flashinfer()
    or not current_platform.has_device_capability(90),
    reason="allreduce+rmsnorm fusion requires flashinfer",
)
def test_tp2_attn_quant_allreduce_rmsnorm(
    model_name: str,
    model_kwargs: dict,
    backend: AttentionBackendEnum,
    matches: Matches,
    custom_ops: str,
    inductor_graph_partition: bool,
    caplog_mp_spawn,
    monkeypatch,
):
    """Check attn+quant and allreduce+rmsnorm fusion counts with TP=2.

    Runs the model with ``tensor_parallel_size=2`` and the ``fuse_attn_quant``
    and ``fuse_allreduce_rms`` passes enabled, then compares the counts found
    in the captured DEBUG logs with the expected values in ``matches``.
    """
    if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
        pytest.skip("Inductor graph partition requires torch>=2.9")

    if "fp4" in model_name.lower() and not is_blackwell():
        pytest.skip("NVFP4 quant requires Blackwell")

    if backend == AttentionBackendEnum.FLASHINFER and not is_blackwell():
        # FlashInfer attn fusion requires Blackwell
        matches = matches._replace(attention_fusion=0)

    custom_ops_list = custom_ops.split(",") if custom_ops else []

    if inductor_graph_partition:
        mode = CUDAGraphMode.FULL_AND_PIECEWISE
        splitting_ops: list[str] | None = None
    else:
        mode = CUDAGraphMode.FULL_DECODE_ONLY
        splitting_ops = []

    # Disable, compile cache to make sure custom passes run.
    # Otherwise, we can't verify fusion happened through the logs.
    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")

    # To capture subprocess logs, we need to know whether spawn or fork is used.
    # Force spawn as it is more general.
    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")

    # Build a new dict instead of writing into the one handed in by
    # parametrization, so the argument object is left untouched.
    model_kwargs = {**model_kwargs, "attention_config": {"backend": backend.name}}

    compilation_config = CompilationConfig(
        # Testing properties
        use_inductor_graph_partition=inductor_graph_partition,
        cudagraph_mode=mode,
        custom_ops=custom_ops_list,
        splitting_ops=splitting_ops,
        # Common
        mode=CompilationMode.VLLM_COMPILE,
        pass_config=PassConfig(
            fuse_attn_quant=True,
            eliminate_noops=True,
            fuse_allreduce_rms=True,
        ),
        # Inductor caches custom passes by default as well via uuid
        inductor_compile_config={"force_disable_caches": True},
    )

    with caplog_mp_spawn(logging.DEBUG) as log_holder:
        run_model(
            compilation_config, model_name, tensor_parallel_size=2, **model_kwargs
        )
    log_matches = re.findall(
        r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
        log_holder.text,
    )
    # 2 for each compile range
    # (global compile range can be split due to fuse_allreduce_rmsnorm)
    num_compile_ranges = len(compilation_config.get_compile_ranges())
    assert num_compile_ranges in [1, 2]

    assert len(log_matches) == 2 * num_compile_ranges, log_holder.text

    assert all(int(log_match) == matches.attention_fusion for log_match in log_matches)

    log_matches = re.findall(
        r"collective_fusion.py:\d+] Replaced (\d+) patterns",
        log_holder.text,
    )
    assert len(log_matches) == 2, log_holder.text

    assert int(log_matches[0]) == matches.allreduce_fusion
    assert int(log_matches[1]) == matches.allreduce_fusion

    # The pass is expected to log a skip for every compile range beyond the first.
    log_matches = re.findall(
        r"pass_manager.py:\d+] Skipping .*AllReduceFusionPass.* with compile range",
        log_holder.text,
    )
    assert len(log_matches) == 2 * (num_compile_ranges - 1), log_holder.text
|
|
||||||
|
|
||||||
|
|
||||||
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "model_name, model_kwargs, backend, matches, custom_ops",
    # Toggle RMSNorm and QuantFP8 for FP8 models
    list(
        flat_product(
            MODELS_FP8, custom_ops_product(CUSTOM_OPS_FP8, CUSTOM_OPS_RMS_NORM)
        )
    )
    # Toggle RMSNorm for FP4 models and unquant models
    + list(flat_product(MODELS_FP4 + MODELS, CUSTOM_OPS_RMS_NORM)),
)
@pytest.mark.parametrize("inductor_graph_partition", [True, False])
@pytest.mark.skipif(
    not current_platform.is_cuda(),
    reason="sequence parallel only tested on CUDA",
)
def test_tp2_attn_quant_async_tp(
    model_name: str,
    model_kwargs: dict,
    backend: AttentionBackendEnum,
    matches: Matches,
    custom_ops: str,
    inductor_graph_partition: bool,
    caplog_mp_spawn,
    monkeypatch,
):
    """Check attn+quant, sequence-parallel and async-TP fusion counts with TP=2.

    Runs the model with ``tensor_parallel_size=2`` and the ``fuse_attn_quant``,
    ``enable_sp`` and ``fuse_gemm_comms`` pass options enabled, then compares
    the counts found in the captured DEBUG logs with ``matches``.
    """
    if is_blackwell():
        # TODO: https://github.com/vllm-project/vllm/issues/27893
        pytest.skip("Blackwell is not supported for AsyncTP pass")

    if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
        pytest.skip("Inductor graph partition requires torch>=2.9")

    if "fp4" in model_name.lower() and not is_blackwell():
        pytest.skip("NVFP4 quant requires Blackwell")

    if backend == AttentionBackendEnum.FLASHINFER:
        if not has_flashinfer():
            pytest.skip("FlashInfer backend requires flashinfer installed")
        if not is_blackwell():
            # FlashInfer attn fusion requires Blackwell
            matches = matches._replace(attention_fusion=0)

    custom_ops_list = custom_ops.split(",") if custom_ops else []

    if inductor_graph_partition:
        mode = CUDAGraphMode.FULL_AND_PIECEWISE
        splitting_ops: list[str] | None = None
    else:
        mode = CUDAGraphMode.FULL_DECODE_ONLY
        splitting_ops = []

    # Disable, compile cache to make sure custom passes run.
    # Otherwise, we can't verify fusion happened through the logs.
    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")

    # To capture subprocess logs, we need to know whether spawn or fork is used.
    # Force spawn as it is more general.
    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")

    # Build a new dict instead of writing into the one handed in by
    # parametrization, so the argument object is left untouched.
    model_kwargs = {**model_kwargs, "attention_config": {"backend": backend.name}}

    compilation_config = CompilationConfig(
        # Testing properties
        use_inductor_graph_partition=inductor_graph_partition,
        cudagraph_mode=mode,
        custom_ops=custom_ops_list,
        splitting_ops=splitting_ops,
        # Common
        mode=CompilationMode.VLLM_COMPILE,
        pass_config=PassConfig(
            fuse_attn_quant=True,
            eliminate_noops=True,
            enable_sp=True,
            fuse_gemm_comms=True,
        ),
        # Inductor caches custom passes by default as well via uuid
        inductor_compile_config={"force_disable_caches": True},
    )

    with caplog_mp_spawn(logging.DEBUG) as log_holder:
        run_model(
            compilation_config, model_name, tensor_parallel_size=2, **model_kwargs
        )
    log_matches = re.findall(
        r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
        log_holder.text,
    )
    assert len(log_matches) == 2, log_holder.text

    assert int(log_matches[0]) == matches.attention_fusion
    assert int(log_matches[1]) == matches.attention_fusion

    log_matches = re.findall(
        r"sequence_parallelism.py:\d+] Replaced (\d+) patterns",
        log_holder.text,
    )
    assert len(log_matches) == 2, log_holder.text

    assert int(log_matches[0]) == matches.sequence_parallel
    assert int(log_matches[1]) == matches.sequence_parallel

    log_matches = re.findall(
        r"collective_fusion.py:\d+] Replaced (\d+) patterns",
        log_holder.text,
    )
    assert len(log_matches) == 2, log_holder.text

    assert int(log_matches[0]) == matches.async_tp
    assert int(log_matches[1]) == matches.async_tp
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
    "model_name, model_kwargs, backend, matches, custom_ops",
    # Test rms norm+group quant_fp8 fusion
    list[tuple[Any, ...]](flat_product(MODELS_GROUP_FP8, CUSTOM_OPS_QUANT_RMS_NORM)),
)
@pytest.mark.parametrize("inductor_graph_partition", [True, False])
# TODO: remove skip after we fix the fusion thoroughly
@pytest.mark.skipif(is_blackwell(), reason="Temporarily disabled on Blackwell")
def test_rms_group_quant(
    model_name: str,
    model_kwargs: dict[str, Any],
    backend: AttentionBackendEnum,
    matches: Matches,
    custom_ops: str,
    inductor_graph_partition: bool,
    caplog_mp_spawn,
    monkeypatch,
):
    """Check the RMSNorm + group-quant fusion count reported in the logs.

    Runs the model with ``fuse_norm_quant`` and ``fuse_act_quant`` enabled and
    expects exactly one "Replaced N patterns" line from ``fusion.py`` whose
    count equals ``matches.rms_quant_norm_fusion``.
    """
    if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
        pytest.skip("Inductor graph partition requires torch>=2.9")

    custom_ops_list = custom_ops.split(",") if custom_ops else []

    if inductor_graph_partition:
        mode = CUDAGraphMode.FULL_AND_PIECEWISE
        splitting_ops: list[str] | None = None
    else:
        mode = CUDAGraphMode.FULL_DECODE_ONLY
        splitting_ops = []

    # Disable, compile cache to make sure custom passes run.
    # Otherwise, we can't verify fusion happened through the logs.
    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")

    # To capture subprocess logs, we need to know whether spawn or fork is used.
    # Force spawn as it is more general.
    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")

    # TODO: remove this after fusion is fixed
    monkeypatch.setenv("VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES", "0")

    # Build a new dict instead of writing into the one handed in by
    # parametrization, so the argument object is left untouched.
    model_kwargs = {**model_kwargs, "attention_config": {"backend": backend.name}}

    compilation_config = CompilationConfig(
        # Testing properties
        custom_ops=custom_ops_list,
        use_inductor_graph_partition=inductor_graph_partition,
        cudagraph_mode=mode,
        splitting_ops=splitting_ops,
        # Common
        mode=CompilationMode.VLLM_COMPILE,
        pass_config=PassConfig(
            fuse_norm_quant=True, fuse_act_quant=True, eliminate_noops=True
        ),
        # Inductor caches custom passes by default as well via uuid
        inductor_compile_config={"force_disable_caches": True},
    )

    with caplog_mp_spawn(logging.DEBUG) as log_holder:
        run_model(compilation_config, model_name, **model_kwargs)

    log_matches = re.findall(
        r"\[fusion.py:\d+] Replaced (\d+) patterns",
        log_holder.text,
    )
    assert len(log_matches) == 1, log_holder.text
    assert int(log_matches[0]) == matches.rms_quant_norm_fusion
|
|
||||||
@@ -1,208 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
"""Shared utilities for fusion tests (e.g. test_fusion_attn.py)."""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import itertools
|
|
||||||
from collections.abc import Iterable
|
|
||||||
from typing import Any, NamedTuple
|
|
||||||
|
|
||||||
from tests.v1.attention.utils import AttentionBackendEnum
|
|
||||||
from vllm import LLM, SamplingParams
|
|
||||||
from vllm.config import CompilationConfig, CUDAGraphMode
|
|
||||||
from vllm.platforms import current_platform
|
|
||||||
|
|
||||||
def is_blackwell():
    """Are we running on Blackwell, a lot of tests depend on it"""
    return current_platform.is_device_capability_family(100)
|
|
||||||
|
|
||||||
|
|
||||||
def has_cuda_graph_wrapper_metadata() -> bool:
    """Return True if ``torch._inductor.utils`` exposes ``CUDAGraphWrapperMetadata``."""
    from importlib import import_module

    inductor_utils = import_module("torch._inductor.utils")
    # hasattr() yields False exactly when the attribute lookup raises AttributeError.
    return hasattr(inductor_utils, "CUDAGraphWrapperMetadata")
|
|
||||||
|
|
||||||
|
|
||||||
class Matches(NamedTuple):
    """Expected fusion counts for one model/backend case; every field defaults to 0."""

    attention_fusion: int = 0
    allreduce_fusion: int = 0
    sequence_parallel: int = 0
    async_tp: int = 0
    rms_quant_norm_fusion: int = 0
|
|
||||||
|
|
||||||
|
|
||||||
class ModelBackendTestCase(NamedTuple):
    """One E2E case: a model, its extra kwargs, an attention backend and expected counts."""

    # Model identifier passed on to the model runner
    model_name: str
    # Extra keyword arguments for the model
    model_kwargs: dict[str, Any]
    # Attention backend to run with
    backend: AttentionBackendEnum
    # Expected fusion counts
    matches: Matches
|
|
||||||
|
|
||||||
|
|
||||||
# E2E model test cases; the lists stay empty on platforms other than CUDA/ROCm
MODELS_FP8: list[ModelBackendTestCase] = []
MODELS_FP4: list[ModelBackendTestCase] = []
MODELS: list[ModelBackendTestCase] = []  # tp-only (unquantized)
MODELS_GROUP_FP8: list[ModelBackendTestCase] = []

if current_platform.is_cuda():
    MODELS_FP8 = [
        ModelBackendTestCase(
            # Use smaller model for L40s in CI
            model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
            model_kwargs={"max_model_len": 1024, "kv_cache_dtype": "fp8"},
            backend=AttentionBackendEnum.TRITON_ATTN,
            matches=Matches(
                attention_fusion=32,
                allreduce_fusion=65,
                sequence_parallel=65,
                async_tp=128,
            ),
        ),
        ModelBackendTestCase(
            model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
            model_kwargs={"max_model_len": 1024, "kv_cache_dtype": "fp8"},
            # TODO FlashInfer attn broken on Hopper with kvcache=fp8:
            # https://github.com/vllm-project/vllm/issues/28568
            backend=AttentionBackendEnum.FLASHINFER
            if is_blackwell()
            else AttentionBackendEnum.TRITON_ATTN,
            matches=Matches(
                attention_fusion=48,
                allreduce_fusion=96,
                sequence_parallel=96,
                async_tp=95,  # mlp is moe, no fusion there
            ),
        ),
    ]

    MODELS_FP4 = [
        ModelBackendTestCase(
            model_name="nvidia/Llama-3.1-8B-Instruct-FP4",
            model_kwargs={"max_model_len": 1024, "kv_cache_dtype": "fp8"},
            backend=AttentionBackendEnum.FLASHINFER,
            matches=Matches(
                attention_fusion=32,
                allreduce_fusion=65,
                sequence_parallel=65,
                async_tp=128,
            ),
        ),
    ]

    # TP only (unquantized models)
    MODELS = [
        ModelBackendTestCase(
            model_name="meta-llama/Llama-3.1-8B-Instruct",
            model_kwargs={"max_model_len": 1024},
            backend=AttentionBackendEnum.TRITON_ATTN,
            matches=Matches(
                attention_fusion=0,
                allreduce_fusion=65,
                sequence_parallel=65,
                async_tp=128,
            ),
        ),
        ModelBackendTestCase(
            model_name="Qwen/Qwen3-30B-A3B",
            model_kwargs={"max_model_len": 1024},
            backend=AttentionBackendEnum.TRITON_ATTN,
            matches=Matches(
                attention_fusion=0,
                allreduce_fusion=97,
                sequence_parallel=97,
                async_tp=96,  # MLP is MoE, half the fusions of dense
            ),
        ),
    ]

    MODELS_GROUP_FP8 = [
        ModelBackendTestCase(
            model_name="Qwen/Qwen3-30B-A3B-FP8",
            model_kwargs={"max_model_len": 1024, "kv_cache_dtype": "fp8"},
            backend=AttentionBackendEnum.TRITON_ATTN,
            matches=Matches(
                rms_quant_norm_fusion=48,
            ),
        ),
    ]

elif current_platform.is_rocm():
    # Same model and expectations for each ROCm attention backend; every case
    # gets its own kwargs dict and Matches instance.
    MODELS_FP8 = [
        ModelBackendTestCase(
            model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
            model_kwargs={"max_model_len": 1024},
            backend=rocm_backend,
            matches=Matches(attention_fusion=32),
        )
        for rocm_backend in (
            AttentionBackendEnum.TRITON_ATTN,
            AttentionBackendEnum.ROCM_ATTN,
            AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN,
        )
    ]
|
|
||||||
|
|
||||||
|
|
||||||
# Custom ops toggle lists for parametrization
# ("-name" / "+name" entries; a comma joins several ops into one case)
CUSTOM_OPS_FP8 = [
    "-quant_fp8",
    "+quant_fp8",
]
CUSTOM_OPS_RMS_NORM = [
    "-rms_norm",
    "+rms_norm",
]
CUSTOM_OPS_QUANT_RMS_NORM = [
    "+quant_fp8,+rms_norm",
]
|
|
||||||
|
|
||||||
|
|
||||||
def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]:
    """Yield every cross-list combination of custom ops as a comma-joined string."""
    yield from map(",".join, itertools.product(*custom_ops_lists))
|
|
||||||
|
|
||||||
|
|
||||||
def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs):
    """Run a model with the given compilation config for E2E fusion tests.

    ``compile_config`` is either a ready ``CompilationConfig`` or a value used
    as its ``mode``. After generation, the engine's compile-range split points
    are copied back onto the config object used here.
    """
    if isinstance(compile_config, CompilationConfig):
        compilation_config = compile_config
    else:
        compilation_config = CompilationConfig(mode=compile_config)

    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0)

    # Defaults come first so anything in model_kwargs overrides them
    llm_kwargs = {
        "disable_custom_all_reduce": True,
        "tensor_parallel_size": 1,
        **model_kwargs,
    }

    # No cudagraphs by default
    if compilation_config.cudagraph_mode is None:
        compilation_config.cudagraph_mode = CUDAGraphMode.NONE

    llm = LLM(model=model, compilation_config=compilation_config, **llm_kwargs)

    # Print the outputs.
    for output in llm.generate(prompts, sampling_params):
        prompt, generated_text = output.prompt, output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

    # Get the compile ranges split points after vllm config post init
    # in order to compute compile ranges correctly
    engine_compilation_config = llm.llm_engine.vllm_config.compilation_config
    compilation_config.compile_ranges_split_points = (
        engine_compilation_config.compile_ranges_split_points
    )
|
|
||||||
0
tests/compile/fusions_e2e/__init__.py
Normal file
0
tests/compile/fusions_e2e/__init__.py
Normal file
102
tests/compile/fusions_e2e/common.py
Normal file
102
tests/compile/fusions_e2e/common.py
Normal file
@@ -0,0 +1,102 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
import itertools
|
||||||
|
from collections.abc import Callable, Iterable
|
||||||
|
from typing import Any, NamedTuple
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import regex as re
|
||||||
|
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
from vllm.v1.attention.backends.registry import AttentionBackendEnum
|
||||||
|
|
||||||
|
|
||||||
|
class Matches(NamedTuple):
    """Expected fusion counts, one field per fusion kind; every field defaults to 0."""

    # simple pointwise
    rms_quant_fusion: int = 0
    act_quant_fusion: int = 0
    norm_rope_fusion: int = 0
    attn_quant_fusion: int = 0

    # distributed
    ar_rms_fusion: int = 0
    sequence_parallel: int = 0
    async_tp: int = 0
|
||||||
|
|
||||||
|
|
||||||
|
class ModelFusionInfo(NamedTuple):
    """A model under test plus how to derive its expected fusion counts."""

    model_name: str

    # Given number of hidden layers, produces the matches object
    matches: Callable[[int], Matches]

    # NOTE: this default dict is one object shared by every instance that
    # does not pass model_kwargs, so it must not be mutated in place.
    model_kwargs: dict[str, Any] = {}

    # Maps a layer count to the overrides dict; by default it sets num_hidden_layers
    hf_overrides: Callable[[int], dict] = lambda n: {"num_hidden_layers": n}
|
||||||
|
|
||||||
|
|
||||||
|
class AttentionBackendCase(NamedTuple):
    """An attention backend together with the extra model kwargs it needs."""

    backend: AttentionBackendEnum

    # Additional args required for attn+quant fusion
    # NOTE: the default dict is shared between instances; do not mutate it.
    model_kwargs: dict[str, Any] = {}
|
||||||
|
|
||||||
|
|
||||||
|
def is_blackwell():
    """Are we running on Blackwell, a lot of tests depend on it"""
    return current_platform.is_device_capability_family(100)
|
||||||
|
|
||||||
|
|
||||||
|
def custom_ops_combos(*custom_ops: str) -> Iterable[str]:
    """Yield all off/on combinations of the given custom ops.

    Each op contributes "-op" (first) and "+op" (second); one combination is
    the comma-joined choice for every op, in argument order.
    """
    toggles = [(f"-{name}", f"+{name}") for name in custom_ops]
    yield from map(",".join, itertools.product(*toggles))
|
||||||
|
|
||||||
|
|
||||||
|
# Quick inline validation (runs once, when this module is imported)
assert [*custom_ops_combos("silu_and_mul")] == ["-silu_and_mul", "+silu_and_mul"]
assert [*custom_ops_combos("quant_fp8", "rms_norm")] == [
    "-quant_fp8,-rms_norm",
    "-quant_fp8,+rms_norm",
    "+quant_fp8,-rms_norm",
    "+quant_fp8,+rms_norm",
]
|
||||||
|
|
||||||
|
|
||||||
|
def has_cuda_graph_wrapper_metadata() -> bool:
    """Return True if ``torch._inductor.utils`` exposes ``CUDAGraphWrapperMetadata``."""
    from importlib import import_module

    inductor_utils = import_module("torch._inductor.utils")
    # hasattr() yields False exactly when the attribute lookup raises AttributeError.
    return hasattr(inductor_utils, "CUDAGraphWrapperMetadata")
|
||||||
|
|
||||||
|
|
||||||
|
# Parametrization values for the graph-partition flag: the True case is
# skipped when has_cuda_graph_wrapper_metadata() reports False.
INDUCTOR_GRAPH_PARTITION = [
    pytest.param(
        True,
        id="inductor_partition",
        marks=pytest.mark.skipif(
            not has_cuda_graph_wrapper_metadata(),
            reason="torch version does not support Inductor partition",
        ),
    ),
    pytest.param(False, id="dynamo_partition"),
]
|
||||||
|
|
||||||
|
# Maps each Matches field name to the regex that extracts its fusion count
# from the captured logs. The "." in file names is escaped so it only matches
# a literal dot rather than any character.
FUSION_LOG_PATTERNS: dict[str, re.Pattern] = {
    # Anchored on "[" so it does not also match activation_quant_fusion.py
    "rms_quant_fusion": re.compile(
        r"\[(?:compilation/)?fusion\.py:\d+] Replaced (\d+) patterns"
    ),
    "act_quant_fusion": re.compile(
        r"activation_quant_fusion\.py:\d+] Replaced (\d+) patterns"
    ),
    "norm_rope_fusion": re.compile(
        r"qk_norm_rope_fusion\.py:\d+] Fused QK Norm\+RoPE on (\d+) sites"
    ),
    "attn_quant_fusion": re.compile(
        r"fusion_attn\.py:\d+] Fused quant onto (\d+) attention nodes"
    ),
    # NOTE: ar_rms_fusion and async_tp use the same pattern, so the two
    # cannot be told apart from the log text alone.
    "ar_rms_fusion": re.compile(r"collective_fusion\.py:\d+] Replaced (\d+) patterns"),
    "sequence_parallel": re.compile(
        r"sequence_parallelism\.py:\d+] Replaced (\d+) patterns"
    ),
    "async_tp": re.compile(r"collective_fusion\.py:\d+] Replaced (\d+) patterns"),
}
|
||||||
158
tests/compile/fusions_e2e/conftest.py
Normal file
158
tests/compile/fusions_e2e/conftest.py
Normal file
@@ -0,0 +1,158 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import regex as re
|
||||||
|
|
||||||
|
from vllm import LLM, SamplingParams
|
||||||
|
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
|
||||||
|
|
||||||
|
from .common import FUSION_LOG_PATTERNS, AttentionBackendCase, Matches
|
||||||
|
|
||||||
|
|
||||||
|
def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs):
    """Run a model with the given compilation config for E2E fusion tests.

    ``compile_config`` is either a ready ``CompilationConfig`` or a value used
    as its ``mode``. After generation, the engine's compile-range split points
    are copied back onto the config object used here.
    """
    if isinstance(compile_config, CompilationConfig):
        compilation_config = compile_config
    else:
        compilation_config = CompilationConfig(mode=compile_config)

    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0)

    # Defaults come first so anything in model_kwargs overrides them
    llm_kwargs = {
        "disable_custom_all_reduce": True,
        "tensor_parallel_size": 1,
        **model_kwargs,
    }

    # No cudagraphs by default
    if compilation_config.cudagraph_mode is None:
        compilation_config.cudagraph_mode = CUDAGraphMode.NONE

    llm = LLM(model=model, compilation_config=compilation_config, **llm_kwargs)

    # Print the outputs.
    for output in llm.generate(prompts, sampling_params):
        prompt, generated_text = output.prompt, output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

    # Get the compile ranges split points after vllm config post init
    # in order to compute compile ranges correctly
    engine_compilation_config = llm.llm_engine.vllm_config.compilation_config
    compilation_config.compile_ranges_split_points = (
        engine_compilation_config.compile_ranges_split_points
    )
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def run_e2e_fusion_test(monkeypatch, caplog_mp_spawn):
    """Fixture returning a callable that runs a model and checks fusion counts.

    The returned ``run`` sets the environment variables for the run, builds a
    ``CompilationConfig`` from the ``compilation_config`` dict, runs the model
    while capturing DEBUG logs, and asserts that the counts found in those
    logs agree with ``matches`` for every name in ``matches_check``.
    """

    def run(
        model_name: str,
        matches: Matches,
        model_kwargs: dict,
        attn_backend: AttentionBackendCase,
        compilation_config: dict,
        matches_check: list[str],
        use_deepgemm: bool = False,
        tp_size: int = 1,
    ):
        monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1" if use_deepgemm else "0")

        # Disable, compile cache to make sure custom passes run.
        # Otherwise, we can't verify fusion happened through the logs.
        monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")

        # To capture subprocess logs, we need to know whether spawn or fork is used.
        # Force spawn as it is more general.
        monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")

        # Backend kwargs first, so the explicit model_kwargs win on conflicts
        model_kwargs = {**attn_backend.model_kwargs, **model_kwargs}
        model_kwargs["attention_config"] = {"backend": attn_backend.backend.name}
        model_kwargs["tensor_parallel_size"] = tp_size

        # Work on a shallow copy so the caller's dict is never modified
        compilation_config = dict(compilation_config)

        # Always compile the full graph instead of piecewise
        if not compilation_config["use_inductor_graph_partition"]:
            compilation_config["splitting_ops"] = []

        full_compilation_config = CompilationConfig(
            cudagraph_mode=CUDAGraphMode.NONE,
            mode=CompilationMode.VLLM_COMPILE,
            inductor_compile_config={"force_disable_caches": True},
            **compilation_config,
        )

        with caplog_mp_spawn(logging.DEBUG) as log_holder:
            run_model(full_compilation_config, model_name, **model_kwargs)

        num_compile_ranges = len(full_compilation_config.get_compile_ranges())
        assert num_compile_ranges in [1, 2]

        print(f"Compile ranges: {full_compilation_config.get_compile_ranges()}")
        print("Fusion results:")

        # Iterate through all so printing happens before asserting
        log_matches_dict = {}
        for match_name, pattern in FUSION_LOG_PATTERNS.items():
            log_matches_dict[match_name] = pattern.findall(log_holder.text)
            print(f"- {match_name}={','.join(log_matches_dict[match_name])}")

        # Now check the matches
        for match_name in matches_check:
            # ar_rms_fusion is expected in a single compile range only;
            # every other fusion is expected once per range.
            num_ranges_activated = (
                1 if match_name == "ar_rms_fusion" else num_compile_ranges
            )
            n_expected = tp_size * num_ranges_activated

            log_matches = [int(ms) for ms in log_matches_dict[match_name]]
            assert len(log_matches) == n_expected, (
                f"Could not find {n_expected} {match_name} "
                f"(found {len(log_matches)}) in:\n {log_holder.text}"
            )

            expected_matches = getattr(matches, match_name)

            if match_name == "rms_quant_fusion" and "ar_rms_fusion" in matches_check:
                # AR+rms+quant takes precedence over rms+quant if activated.
                # That means we get full matching where ar+rms+quant was not activated,
                # and less where it was
                assert sum(m == expected_matches for m in log_matches) == tp_size * (
                    num_ranges_activated - 1
                ), "Expecting full rms+quant fusion where ar+rms+quant not activated"

                assert all(
                    expected_matches - matches.ar_rms_fusion <= m <= expected_matches
                    for m in log_matches
                ), (
                    f"Expecting at least {expected_matches - matches.ar_rms_fusion} "
                    f"where ar+rms+quant was activated"
                )
            else:
                expected_matches_list = [expected_matches] * n_expected
                assert sorted(log_matches) == expected_matches_list, (
                    f"{match_name} expected: {expected_matches_list}, "
                    f"found: {sorted(log_matches)}"
                )

            if match_name == "ar_rms_fusion":
                # Ranges where the pass did not run should each log a skip
                log_matches = re.findall(
                    r"pass_manager.py:\d+] Skipping "
                    r".*AllReduceFusionPass.* with compile range",
                    log_holder.text,
                )

                n_expected = tp_size * (num_compile_ranges - num_ranges_activated)
                assert len(log_matches) == n_expected, (
                    f'Could not find {n_expected} "Skipping AllReduceFusionPass" '
                    f"(found {len(log_matches)}) in:\n {log_holder.text}"
                )

    return run
|
||||||
112
tests/compile/fusions_e2e/models.py
Normal file
112
tests/compile/fusions_e2e/models.py
Normal file
@@ -0,0 +1,112 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from vllm.utils.flashinfer import has_flashinfer
|
||||||
|
from vllm.v1.attention.backends.registry import AttentionBackendEnum
|
||||||
|
|
||||||
|
from .common import AttentionBackendCase, Matches, ModelFusionInfo, is_blackwell
|
||||||
|
|
||||||
|
# Attn backends
|
||||||
|
# FlashInfer attention case: runs with an fp8 KV cache and is skipped unless
# both is_blackwell() and has_flashinfer() report True.
FLASHINFER_ATTN = pytest.param(
    AttentionBackendCase(
        backend=AttentionBackendEnum.FLASHINFER,
        model_kwargs={"kv_cache_dtype": "fp8"},
    ),
    id="FLASHINFER",
    marks=pytest.mark.skipif(
        not (is_blackwell() and has_flashinfer()),
        reason="FI backend requires Blackwell and FlashInfer",
    ),
)
|
||||||
|
|
||||||
|
# Triton attention case: no extra model kwargs and no skip marks are attached.
TRITON_ATTN = pytest.param(
    AttentionBackendCase(backend=AttentionBackendEnum.TRITON_ATTN), id="TRITON_ATTN"
)
|
||||||
|
|
||||||
|
# Models
|
||||||
|
# Llama-3.1-8B-Instruct entry. `matches` maps the number of hidden layers used
# by a test to the expected per-pass match counts. Only the all-reduce+RMS,
# sequence-parallel and async-TP counts are set here; the remaining Matches
# fields keep whatever defaults Matches declares (not visible in this file).
llama3_8b = ModelFusionInfo(
    model_name="meta-llama/Llama-3.1-8B-Instruct",
    matches=lambda n_layers: Matches(
        # NOTE(review): the "+ 1" presumably accounts for the final norm
        # after the last layer - confirm against the fusion pass logs.
        ar_rms_fusion=n_layers * 2 + 1,
        sequence_parallel=n_layers * 2 + 1,
        async_tp=n_layers * 4,
    ),
)
|
||||||
|
|
||||||
|
# FP8 Llama-3.1-8B-Instruct entry. In addition to the communication-fusion
# counts, it sets expected counts for the quant fusions (rms+quant, act+quant,
# attn+quant) as a function of the number of hidden layers.
llama3_8b_fp8 = ModelFusionInfo(
    model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
    matches=lambda n_layers: Matches(
        rms_quant_fusion=n_layers * 2,
        act_quant_fusion=n_layers,
        attn_quant_fusion=n_layers,
        ar_rms_fusion=n_layers * 2 + 1,
        sequence_parallel=n_layers * 2 + 1,
        async_tp=n_layers * 4,
    ),
)
|
||||||
|
|
||||||
|
# FP4 Llama-3.1-8B-Instruct entry. Same layout as the FP8 entry except that no
# rms+quant matches are expected.
llama3_8b_fp4 = ModelFusionInfo(
    model_name="nvidia/Llama-3.1-8B-Instruct-FP4",
    matches=lambda n_layers: Matches(
        # NOTE(review): presumably rms+quant fusion is not matched for FP4
        # quantization - confirm with the fusion pass implementation.
        rms_quant_fusion=0,
        act_quant_fusion=n_layers,
        attn_quant_fusion=n_layers,
        ar_rms_fusion=n_layers * 2 + 1,
        sequence_parallel=n_layers * 2 + 1,
        async_tp=n_layers * 4,
    ),
)
|
||||||
|
|
||||||
|
# MoEs cannot do act+quant fusion because those ops are hidden from torch.compile.
|
||||||
|
# MoEs also only expose 1 rms+quant fusion because the quant for up_proj is hidden.
|
||||||
|
# TODO(luka): https://github.com/vllm-project/vllm/issues/31985
|
||||||
|
# Also, for MoEs, gemm+collective fusion only happens for dense GEMMs (o_proj/qkv proj)
|
||||||
|
|
||||||
|
# FP8 Llama-4-Scout entry. The layer-count override is nested under
# "text_config" for this model, so a custom hf_overrides callable is supplied.
# No act_quant_fusion count is set for this entry.
llama4_scout_fp8 = ModelFusionInfo(
    model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
    hf_overrides=lambda n_layers: {"text_config": {"num_hidden_layers": n_layers}},
    matches=lambda n_layers: Matches(
        rms_quant_fusion=n_layers,
        attn_quant_fusion=n_layers,
        ar_rms_fusion=n_layers * 2,
        sequence_parallel=n_layers * 2,
        async_tp=n_layers * 2 - 1,
    ),
)
|
||||||
|
|
||||||
|
# NVFP4 Llama-4-Scout entry. Uses the same nested "text_config" layer-count
# override as the FP8 Scout entry and expects no rms+quant matches.
llama4_scout_fp4 = ModelFusionInfo(
    model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-NVFP4",
    hf_overrides=lambda n_layers: {"text_config": {"num_hidden_layers": n_layers}},
    matches=lambda n_layers: Matches(
        rms_quant_fusion=0,
        attn_quant_fusion=n_layers,
        ar_rms_fusion=n_layers * 2,
        sequence_parallel=n_layers * 2,
        async_tp=n_layers * 2 - 1,
    ),
)
|
||||||
|
|
||||||
|
# Qwen3-30B-A3B entry. Besides the communication-fusion counts it sets an
# expected qk-norm+rope fusion count of one per hidden layer.
qwen3_a3b = ModelFusionInfo(
    model_name="Qwen/Qwen3-30B-A3B",
    matches=lambda n_layers: Matches(
        norm_rope_fusion=n_layers,
        ar_rms_fusion=n_layers * 2 + 1,
        sequence_parallel=n_layers * 2 + 1,
        async_tp=n_layers * 2,
    ),
)
|
||||||
|
|
||||||
|
# FP8 Qwen3-30B-A3B entry. The expected norm+rope count depends on
# is_blackwell(), which is evaluated each time `matches` is called.
qwen3_a3b_fp8 = ModelFusionInfo(
    model_name="Qwen/Qwen3-30B-A3B-FP8",
    matches=lambda n_layers: Matches(
        rms_quant_fusion=n_layers,
        # TODO broken on Blackwell:
        # https://github.com/vllm-project/vllm/issues/33295
        norm_rope_fusion=0 if is_blackwell() else n_layers,
        attn_quant_fusion=0,  # attn + group quant not supported
        ar_rms_fusion=n_layers * 2 + 1,
        sequence_parallel=n_layers * 2 + 1,
        async_tp=n_layers * 2,
    ),
)
|
||||||
146
tests/compile/fusions_e2e/test_tp1_quant.py
Normal file
146
tests/compile/fusions_e2e/test_tp1_quant.py
Normal file
@@ -0,0 +1,146 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
from collections.abc import Callable
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from vllm.config import PassConfig
|
||||||
|
|
||||||
|
from .common import (
|
||||||
|
INDUCTOR_GRAPH_PARTITION,
|
||||||
|
AttentionBackendCase,
|
||||||
|
Matches,
|
||||||
|
custom_ops_combos,
|
||||||
|
is_blackwell,
|
||||||
|
)
|
||||||
|
from .models import (
|
||||||
|
FLASHINFER_ATTN,
|
||||||
|
TRITON_ATTN,
|
||||||
|
llama3_8b_fp4,
|
||||||
|
llama3_8b_fp8,
|
||||||
|
llama4_scout_fp4,
|
||||||
|
llama4_scout_fp8,
|
||||||
|
qwen3_a3b_fp8,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "model_name, matches_fn, model_kwargs, hf_overrides, use_deepgemm",
    [
        (*llama3_8b_fp8, False),
        (*llama4_scout_fp8, False),
        (*qwen3_a3b_fp8, False),
        (*qwen3_a3b_fp8, True),
    ],
)
@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN])
@pytest.mark.parametrize("n_layers", [6])
@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm"))
@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
def test_tp1_fp8_fusions(
    model_name: str,
    matches_fn: Callable[[int], Matches],
    model_kwargs: dict,
    hf_overrides: Callable[[int], dict],
    attn_backend: AttentionBackendCase,
    n_layers: int,
    custom_ops: str,
    inductor_graph_partition: bool,
    use_deepgemm: bool,
    run_e2e_fusion_test,
    monkeypatch,
):
    """Check quant-related fusion counts for FP8 models (no tp_size passed).

    Enables norm+quant, act+quant, attn+quant and qk-norm+rope fusion and asks
    the ``run_e2e_fusion_test`` fixture to verify the corresponding counts
    produced by ``matches_fn(n_layers)``.

    Args:
        model_name: Model identifier forwarded to the fixture.
        matches_fn: Maps the number of hidden layers to the expected matches.
        model_kwargs: Base model kwargs of the model entry (not modified).
        hf_overrides: Maps the number of hidden layers to HF config overrides.
        attn_backend: Attention backend case forwarded to the fixture.
        n_layers: Number of hidden layers the truncated model runs with.
        custom_ops: Comma-separated custom-op toggles.
        inductor_graph_partition: Value for ``use_inductor_graph_partition``.
        use_deepgemm: Forwarded to the fixture; currently always skipped.
    """
    if use_deepgemm:
        # TODO(luka/eliza) DeepGEMM uses different quants, matching not supported
        # - on Blackwell, uses a special quant fp8, currently not supported
        # - on Hopper, tma-aligned scales inhibit matching (fix WIP)
        pytest.skip("DeepGEMM & quant matching not currently supported")

    matches = matches_fn(n_layers)

    if "qwen" in model_name.lower() and "-quant_fp8" in custom_ops:
        # This is why config forces +quant_fp8 by default
        pytest.skip("native QuantFP8 matching not supported for group quant")

    # Reduce size of model and skip weight loading time.
    # The incoming dict belongs to a module-level model entry that is reused by
    # other parametrized cases, so build a per-test copy instead of mutating it.
    model_kwargs = {
        **model_kwargs,
        "hf_overrides": hf_overrides(n_layers),
        "load_format": "dummy",
        "max_model_len": 1024,
    }

    compilation_config = dict(
        use_inductor_graph_partition=inductor_graph_partition,
        custom_ops=custom_ops.split(","),
        pass_config=PassConfig(
            fuse_norm_quant=True,
            fuse_act_quant=True,
            fuse_attn_quant=True,
            enable_qk_norm_rope_fusion=True,
        ),
    )

    matches_check = [
        "rms_quant_fusion",
        "act_quant_fusion",
        "norm_rope_fusion",
        "attn_quant_fusion",
    ]

    run_e2e_fusion_test(
        model_name,
        matches,
        model_kwargs,
        attn_backend,
        compilation_config,
        matches_check,
        use_deepgemm=use_deepgemm,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "model_name, matches_fn, model_kwargs, hf_overrides",
    [llama3_8b_fp4, llama4_scout_fp4],
)
@pytest.mark.parametrize("attn_backend", [FLASHINFER_ATTN])
@pytest.mark.parametrize("n_layers", [6])
@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm"))
@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
@pytest.mark.skipif(not is_blackwell(), reason="Blackwell required for fp4")
def test_tp1_fp4_fusions(
    model_name: str,
    matches_fn: Callable[[int], Matches],
    model_kwargs: dict,
    hf_overrides: Callable[[int], dict],
    attn_backend: AttentionBackendCase,
    n_layers: int,
    custom_ops: str,
    inductor_graph_partition: bool,
    run_e2e_fusion_test,
):
    """Check quant-related fusion counts for FP4 models (no tp_size passed).

    Enables the same pass flags as the FP8 variant but only asks the
    ``run_e2e_fusion_test`` fixture to verify act+quant, attn+quant and
    norm+rope counts (rms+quant is not checked here).

    Args:
        model_name: Model identifier forwarded to the fixture.
        matches_fn: Maps the number of hidden layers to the expected matches.
        model_kwargs: Base model kwargs of the model entry (not modified).
        hf_overrides: Maps the number of hidden layers to HF config overrides.
        attn_backend: Attention backend case forwarded to the fixture.
        n_layers: Number of hidden layers the truncated model runs with.
        custom_ops: Comma-separated custom-op toggles.
        inductor_graph_partition: Value for ``use_inductor_graph_partition``.
    """
    matches = matches_fn(n_layers)

    # Reduce size of model and skip weight loading time.
    # The incoming dict belongs to a module-level model entry that is reused by
    # other parametrized cases, so build a per-test copy instead of mutating it.
    model_kwargs = {
        **model_kwargs,
        "hf_overrides": hf_overrides(n_layers),
        "load_format": "dummy",
        "max_model_len": 1024,
    }

    compilation_config = dict(
        use_inductor_graph_partition=inductor_graph_partition,
        custom_ops=custom_ops.split(","),
        pass_config=PassConfig(
            fuse_norm_quant=True,
            fuse_act_quant=True,
            fuse_attn_quant=True,
            enable_qk_norm_rope_fusion=True,
        ),
    )

    matches_check = ["act_quant_fusion", "attn_quant_fusion", "norm_rope_fusion"]

    run_e2e_fusion_test(
        model_name,
        matches,
        model_kwargs,
        attn_backend,
        compilation_config,
        matches_check,
    )
|
||||||
199
tests/compile/fusions_e2e/test_tp2_ar_rms.py
Normal file
199
tests/compile/fusions_e2e/test_tp2_ar_rms.py
Normal file
@@ -0,0 +1,199 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
from collections.abc import Callable
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from vllm.config import PassConfig
|
||||||
|
|
||||||
|
from ...utils import multi_gpu_test
|
||||||
|
from .common import (
|
||||||
|
INDUCTOR_GRAPH_PARTITION,
|
||||||
|
AttentionBackendCase,
|
||||||
|
Matches,
|
||||||
|
custom_ops_combos,
|
||||||
|
is_blackwell,
|
||||||
|
)
|
||||||
|
from .models import (
|
||||||
|
FLASHINFER_ATTN,
|
||||||
|
TRITON_ATTN,
|
||||||
|
llama3_8b,
|
||||||
|
llama3_8b_fp4,
|
||||||
|
llama3_8b_fp8,
|
||||||
|
llama4_scout_fp4,
|
||||||
|
llama4_scout_fp8,
|
||||||
|
qwen3_a3b,
|
||||||
|
qwen3_a3b_fp8,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "model_name, matches_fn, model_kwargs, hf_overrides",
    # qwen3-fp8 should still fuse AR+rms even though group quant is not yet supported
    [llama3_8b_fp8, llama4_scout_fp8, qwen3_a3b_fp8],
)
@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN])
@pytest.mark.parametrize("n_layers", [4])
@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm"))
@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
def test_tp2_ar_rms_fp8_fusions(
    model_name: str,
    matches_fn: Callable[[int], Matches],
    model_kwargs: dict,
    hf_overrides: Callable[[int], dict],
    attn_backend: AttentionBackendCase,
    n_layers: int,
    custom_ops: str,
    inductor_graph_partition: bool,
    run_e2e_fusion_test,
    monkeypatch,
):
    """Check all-reduce+RMS fusion together with quant fusions on FP8 models.

    Runs with ``tp_size=2`` and enables the quant fusions, qk-norm+rope fusion
    and all-reduce+RMS fusion, then asks the ``run_e2e_fusion_test`` fixture to
    verify the corresponding counts from ``matches_fn(n_layers)``.

    Args:
        model_name: Model identifier forwarded to the fixture.
        matches_fn: Maps the number of hidden layers to the expected matches.
        model_kwargs: Base model kwargs of the model entry (not modified).
        hf_overrides: Maps the number of hidden layers to HF config overrides.
        attn_backend: Attention backend case forwarded to the fixture.
        n_layers: Number of hidden layers the truncated model runs with.
        custom_ops: Comma-separated custom-op toggles.
        inductor_graph_partition: Value for ``use_inductor_graph_partition``.
    """
    matches = matches_fn(n_layers)

    if "qwen" in model_name.lower() and "-quant_fp8" in custom_ops:
        # This is why config forces +quant_fp8 by default
        pytest.skip("native QuantFP8 matching not supported for group quant")

    # Reduce size of model and skip weight loading time.
    # The incoming dict belongs to a module-level model entry that is reused by
    # other parametrized cases, so build a per-test copy instead of mutating it.
    model_kwargs = {
        **model_kwargs,
        "hf_overrides": hf_overrides(n_layers),
        "load_format": "dummy",
        "max_model_len": 1024,
    }

    compilation_config = dict(
        use_inductor_graph_partition=inductor_graph_partition,
        custom_ops=custom_ops.split(","),
        pass_config=PassConfig(
            fuse_norm_quant=True,
            fuse_act_quant=True,
            fuse_attn_quant=True,
            enable_qk_norm_rope_fusion=True,
            fuse_allreduce_rms=True,
        ),
    )

    matches_check = [
        "rms_quant_fusion",
        "act_quant_fusion",
        "norm_rope_fusion",
        "attn_quant_fusion",
        "ar_rms_fusion",
    ]

    run_e2e_fusion_test(
        model_name,
        matches,
        model_kwargs,
        attn_backend,
        compilation_config,
        matches_check,
        tp_size=2,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "model_name, matches_fn, model_kwargs, hf_overrides",
    [llama3_8b_fp4, llama4_scout_fp4],
)
@pytest.mark.parametrize("attn_backend", [FLASHINFER_ATTN])
@pytest.mark.parametrize("n_layers", [4])
@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm"))
@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
@pytest.mark.skipif(not is_blackwell(), reason="Blackwell required for fp4")
def test_tp2_ar_rms_fp4_fusions(
    model_name: str,
    matches_fn: Callable[[int], Matches],
    model_kwargs: dict,
    hf_overrides: Callable[[int], dict],
    attn_backend: AttentionBackendCase,
    n_layers: int,
    custom_ops: str,
    inductor_graph_partition: bool,
    run_e2e_fusion_test,
    monkeypatch,
):
    """Check all-reduce+RMS fusion together with quant fusions on FP4 models.

    Runs with ``tp_size=2`` and enables act+quant, attn+quant and
    all-reduce+RMS fusion, then asks the ``run_e2e_fusion_test`` fixture to
    verify the corresponding counts from ``matches_fn(n_layers)``.

    Args:
        model_name: Model identifier forwarded to the fixture.
        matches_fn: Maps the number of hidden layers to the expected matches.
        model_kwargs: Base model kwargs of the model entry (not modified).
        hf_overrides: Maps the number of hidden layers to HF config overrides.
        attn_backend: Attention backend case forwarded to the fixture.
        n_layers: Number of hidden layers the truncated model runs with.
        custom_ops: Comma-separated custom-op toggles.
        inductor_graph_partition: Value for ``use_inductor_graph_partition``.
    """
    matches = matches_fn(n_layers)

    # Reduce size of model and skip weight loading time.
    # The incoming dict belongs to a module-level model entry that is reused by
    # other parametrized cases, so build a per-test copy instead of mutating it.
    model_kwargs = {
        **model_kwargs,
        "hf_overrides": hf_overrides(n_layers),
        "load_format": "dummy",
        "max_model_len": 1024,
    }

    compilation_config = dict(
        use_inductor_graph_partition=inductor_graph_partition,
        custom_ops=custom_ops.split(","),
        pass_config=PassConfig(
            fuse_act_quant=True,
            fuse_attn_quant=True,
            fuse_allreduce_rms=True,
        ),
    )

    matches_check = [
        "act_quant_fusion",
        "attn_quant_fusion",
        "ar_rms_fusion",
    ]

    run_e2e_fusion_test(
        model_name,
        matches,
        model_kwargs,
        attn_backend,
        compilation_config,
        matches_check,
        tp_size=2,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "model_name, matches_fn, model_kwargs, hf_overrides",
    [llama3_8b, qwen3_a3b],
)
@pytest.mark.parametrize("attn_backend", [TRITON_ATTN])
@pytest.mark.parametrize("n_layers", [4])
@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm"))
@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
def test_tp2_ar_rms_fusions(
    model_name: str,
    matches_fn: Callable[[int], Matches],
    model_kwargs: dict,
    hf_overrides: Callable[[int], dict],
    attn_backend: AttentionBackendCase,
    n_layers: int,
    custom_ops: str,
    inductor_graph_partition: bool,
    run_e2e_fusion_test,
):
    """Check all-reduce+RMS and qk-norm+rope fusion counts with ``tp_size=2``.

    No quant fusion flags are enabled; only the norm+rope and all-reduce+RMS
    counts from ``matches_fn(n_layers)`` are verified by the
    ``run_e2e_fusion_test`` fixture.

    Args:
        model_name: Model identifier forwarded to the fixture.
        matches_fn: Maps the number of hidden layers to the expected matches.
        model_kwargs: Base model kwargs of the model entry (not modified).
        hf_overrides: Maps the number of hidden layers to HF config overrides.
        attn_backend: Attention backend case forwarded to the fixture.
        n_layers: Number of hidden layers the truncated model runs with.
        custom_ops: Comma-separated custom-op toggles.
        inductor_graph_partition: Value for ``use_inductor_graph_partition``.
    """
    matches = matches_fn(n_layers)

    # Reduce size of model and skip weight loading time.
    # The incoming dict belongs to a module-level model entry that is reused by
    # other parametrized cases, so build a per-test copy instead of mutating it.
    model_kwargs = {
        **model_kwargs,
        "hf_overrides": hf_overrides(n_layers),
        "load_format": "dummy",
        "max_model_len": 1024,
    }

    compilation_config = dict(
        use_inductor_graph_partition=inductor_graph_partition,
        custom_ops=custom_ops.split(","),
        pass_config=PassConfig(
            enable_qk_norm_rope_fusion=True,
            fuse_allreduce_rms=True,
        ),
    )

    matches_check = [
        "norm_rope_fusion",
        "ar_rms_fusion",
    ]

    run_e2e_fusion_test(
        model_name,
        matches,
        model_kwargs,
        attn_backend,
        compilation_config,
        matches_check,
        tp_size=2,
    )
|
||||||
143
tests/compile/fusions_e2e/test_tp2_async_tp.py
Normal file
143
tests/compile/fusions_e2e/test_tp2_async_tp.py
Normal file
@@ -0,0 +1,143 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
from collections.abc import Callable
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from vllm.config import PassConfig
|
||||||
|
|
||||||
|
from ...utils import multi_gpu_test
|
||||||
|
from .common import (
|
||||||
|
INDUCTOR_GRAPH_PARTITION,
|
||||||
|
AttentionBackendCase,
|
||||||
|
Matches,
|
||||||
|
custom_ops_combos,
|
||||||
|
is_blackwell,
|
||||||
|
)
|
||||||
|
from .models import (
|
||||||
|
FLASHINFER_ATTN,
|
||||||
|
TRITON_ATTN,
|
||||||
|
llama3_8b,
|
||||||
|
llama3_8b_fp8,
|
||||||
|
llama4_scout_fp8,
|
||||||
|
qwen3_a3b,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "model_name, matches_fn, model_kwargs, hf_overrides",
    [llama3_8b_fp8, llama4_scout_fp8],
)
@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN])
@pytest.mark.parametrize("n_layers", [4])
@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm"))
@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
def test_tp2_async_tp_fp8_fusions(
    model_name: str,
    matches_fn: Callable[[int], Matches],
    model_kwargs: dict,
    hf_overrides: Callable[[int], dict],
    attn_backend: AttentionBackendCase,
    n_layers: int,
    custom_ops: str,
    inductor_graph_partition: bool,
    run_e2e_fusion_test,
    monkeypatch,
):
    """Check sequence-parallel and async-TP fusion with quant fusions on FP8.

    Runs with ``tp_size=2`` and enables the quant fusions, qk-norm+rope fusion,
    sequence parallelism and GEMM+collective fusion, then asks the
    ``run_e2e_fusion_test`` fixture to verify the corresponding counts from
    ``matches_fn(n_layers)``.

    Args:
        model_name: Model identifier forwarded to the fixture.
        matches_fn: Maps the number of hidden layers to the expected matches.
        model_kwargs: Base model kwargs of the model entry (not modified).
        hf_overrides: Maps the number of hidden layers to HF config overrides.
        attn_backend: Attention backend case forwarded to the fixture.
        n_layers: Number of hidden layers the truncated model runs with.
        custom_ops: Comma-separated custom-op toggles.
        inductor_graph_partition: Value for ``use_inductor_graph_partition``.
    """
    matches = matches_fn(n_layers)

    if is_blackwell():
        # Disable FlashInfer scaled_mm FP8 as it's not supported in async tp patterns
        monkeypatch.setenv("VLLM_DISABLED_KERNELS", "FlashInferFP8ScaledMMLinearKernel")

    # Reduce size of model and skip weight loading time.
    # The incoming dict belongs to a module-level model entry that is reused by
    # other parametrized cases, so build a per-test copy instead of mutating it.
    model_kwargs = {
        **model_kwargs,
        "hf_overrides": hf_overrides(n_layers),
        "load_format": "dummy",
        "max_model_len": 1024,
    }

    compilation_config = dict(
        use_inductor_graph_partition=inductor_graph_partition,
        custom_ops=custom_ops.split(","),
        pass_config=PassConfig(
            fuse_norm_quant=True,
            fuse_act_quant=True,
            fuse_attn_quant=True,
            enable_qk_norm_rope_fusion=True,
            enable_sp=True,
            fuse_gemm_comms=True,
        ),
    )

    matches_check = [
        "rms_quant_fusion",
        "act_quant_fusion",
        "norm_rope_fusion",
        "attn_quant_fusion",
        "sequence_parallel",
        "async_tp",
    ]

    run_e2e_fusion_test(
        model_name,
        matches,
        model_kwargs,
        attn_backend,
        compilation_config,
        matches_check,
        tp_size=2,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "model_name, matches_fn, model_kwargs, hf_overrides",
    [llama3_8b, qwen3_a3b],
)
@pytest.mark.parametrize("attn_backend", [TRITON_ATTN])
@pytest.mark.parametrize("n_layers", [4])
@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm"))
@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
def test_tp2_async_tp_fusions(
    model_name: str,
    matches_fn: Callable[[int], Matches],
    model_kwargs: dict,
    hf_overrides: Callable[[int], dict],
    attn_backend: AttentionBackendCase,
    n_layers: int,
    custom_ops: str,
    inductor_graph_partition: bool,
    run_e2e_fusion_test,
):
    """Check sequence-parallel and async-TP fusion counts with ``tp_size=2``.

    No quant fusion flags are enabled; only the norm+rope, sequence-parallel
    and async-TP counts from ``matches_fn(n_layers)`` are verified by the
    ``run_e2e_fusion_test`` fixture.

    Args:
        model_name: Model identifier forwarded to the fixture.
        matches_fn: Maps the number of hidden layers to the expected matches.
        model_kwargs: Base model kwargs of the model entry (not modified).
        hf_overrides: Maps the number of hidden layers to HF config overrides.
        attn_backend: Attention backend case forwarded to the fixture.
        n_layers: Number of hidden layers the truncated model runs with.
        custom_ops: Comma-separated custom-op toggles.
        inductor_graph_partition: Value for ``use_inductor_graph_partition``.
    """
    matches = matches_fn(n_layers)

    # Reduce size of model and skip weight loading time.
    # The incoming dict belongs to a module-level model entry that is reused by
    # other parametrized cases, so build a per-test copy instead of mutating it.
    model_kwargs = {
        **model_kwargs,
        "hf_overrides": hf_overrides(n_layers),
        "load_format": "dummy",
        "max_model_len": 1024,
    }

    compilation_config = dict(
        use_inductor_graph_partition=inductor_graph_partition,
        custom_ops=custom_ops.split(","),
        pass_config=PassConfig(
            enable_qk_norm_rope_fusion=True,
            enable_sp=True,
            fuse_gemm_comms=True,
        ),
    )

    matches_check = [
        "norm_rope_fusion",
        "sequence_parallel",
        "async_tp",
    ]

    run_e2e_fusion_test(
        model_name,
        matches,
        model_kwargs,
        attn_backend,
        compilation_config,
        matches_check,
        tp_size=2,
    )
|
||||||
@@ -1,23 +1,11 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import copy
|
import copy
|
||||||
import logging
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import regex as re
|
|
||||||
import torch._dynamo
|
import torch._dynamo
|
||||||
|
|
||||||
from tests.compile.backend import LazyInitPass, TestBackend
|
from tests.compile.backend import LazyInitPass, TestBackend
|
||||||
from tests.compile.fusion_test_utils import (
|
|
||||||
CUSTOM_OPS_FP8,
|
|
||||||
MODELS_FP4,
|
|
||||||
MODELS_FP8,
|
|
||||||
Matches,
|
|
||||||
has_cuda_graph_wrapper_metadata,
|
|
||||||
is_blackwell,
|
|
||||||
run_model,
|
|
||||||
)
|
|
||||||
from tests.utils import flat_product
|
from tests.utils import flat_product
|
||||||
from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
|
from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
|
||||||
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
|
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
|
||||||
@@ -31,7 +19,6 @@ from vllm.config import (
|
|||||||
CacheConfig,
|
CacheConfig,
|
||||||
CompilationConfig,
|
CompilationConfig,
|
||||||
CompilationMode,
|
CompilationMode,
|
||||||
CUDAGraphMode,
|
|
||||||
ModelConfig,
|
ModelConfig,
|
||||||
PassConfig,
|
PassConfig,
|
||||||
SchedulerConfig,
|
SchedulerConfig,
|
||||||
@@ -47,7 +34,6 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
|||||||
)
|
)
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils.flashinfer import has_flashinfer
|
from vllm.utils.flashinfer import has_flashinfer
|
||||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
|
||||||
from vllm.v1.attention.backend import AttentionMetadata
|
from vllm.v1.attention.backend import AttentionMetadata
|
||||||
from vllm.v1.attention.backends.registry import AttentionBackendEnum
|
from vllm.v1.attention.backends.registry import AttentionBackendEnum
|
||||||
from vllm.v1.kv_cache_interface import AttentionSpec
|
from vllm.v1.kv_cache_interface import AttentionSpec
|
||||||
@@ -501,88 +487,3 @@ def test_attention_quant_pattern(
|
|||||||
|
|
||||||
# Check that results are close
|
# Check that results are close
|
||||||
torch.testing.assert_close(result_unfused, result_fused, atol=1e-2, rtol=1e-2)
|
torch.testing.assert_close(result_unfused, result_fused, atol=1e-2, rtol=1e-2)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
    "model_name, model_kwargs, backend, matches, custom_ops",
    # Test attention+quant_fp8 fusion with custom and torch impls of QuantFP8
    list(flat_product(MODELS_FP8, CUSTOM_OPS_FP8))
    # quant_fp4 only has the custom impl
    + list(flat_product(MODELS_FP4, [""])),
)
@pytest.mark.parametrize(
    "inductor_graph_partition",
    [
        pytest.param(
            True,
            marks=pytest.mark.skipif(
                not has_cuda_graph_wrapper_metadata(),
                # Trailing space matters: adjacent literals are concatenated.
                reason="This test requires "
                "torch._inductor.utils.CUDAGraphWrapperMetadata to run",
            ),
        ),
        False,
    ],
)
def test_attn_quant(
    model_name: str,
    model_kwargs: dict[str, Any],
    backend: AttentionBackendEnum,
    matches: Matches,
    custom_ops: str,
    inductor_graph_partition: bool,
    caplog_mp_spawn,
    monkeypatch,
):
    """Run a model end to end and check the attention+quant fusion count.

    The model is run with ``fuse_attn_quant`` enabled while debug logs are
    captured; exactly one "Fused quant onto N attention nodes" log line must
    be found and N must equal ``matches.attention_fusion``.

    Args:
        model_name: Model identifier passed to ``run_model``.
        model_kwargs: Base kwargs for ``run_model`` (not modified).
        backend: Attention backend whose name is set in ``attention_config``.
        matches: Expected match counts for this model.
        custom_ops: Comma-separated custom-op toggles; may be empty.
        inductor_graph_partition: Value for ``use_inductor_graph_partition``.
    """
    if not current_platform.has_device_capability(90):
        pytest.skip("test_attn_quant requires H100 (SM90) or B200 (SM100) GPU")
    if backend == AttentionBackendEnum.FLASHINFER and (
        not is_blackwell() or not has_flashinfer()
    ):
        pytest.skip("FlashInfer attn fusion requires Blackwell and flashinfer")
    if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
        pytest.skip("Inductor graph partition requires torch>=2.9")

    custom_ops_list = custom_ops.split(",") if custom_ops else []

    if inductor_graph_partition:
        mode = CUDAGraphMode.FULL_AND_PIECEWISE
        splitting_ops: list[str] | None = None
    else:
        # FIXME: Llama-4-Scout-17B-16E-Instruct-FP8 + FlashInfer + Blackwell end at
        # CUDAGraphMode.NONE here because it derives an attention backend that
        # does not support full cudagraphs
        mode = CUDAGraphMode.FULL_DECODE_ONLY
        splitting_ops = []

    # Disable the compile cache to make sure custom passes run.
    # Otherwise, we can't verify fusion happened through the logs.
    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")

    # To capture subprocess logs, we need to know whether spawn or fork is used.
    # Force spawn as it is more general.
    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")

    # The incoming dict is shared between parametrized cases, so add the
    # attention config to a per-test copy instead of mutating it in place.
    model_kwargs = {**model_kwargs, "attention_config": {"backend": backend.name}}

    compilation_config = CompilationConfig(
        # Testing properties
        custom_ops=custom_ops_list,
        use_inductor_graph_partition=inductor_graph_partition,
        cudagraph_mode=mode,
        splitting_ops=splitting_ops,
        # Common
        mode=CompilationMode.VLLM_COMPILE,
        pass_config=PassConfig(fuse_attn_quant=True, eliminate_noops=True),
        # Inductor caches custom passes by default as well via uuid
        inductor_compile_config={"force_disable_caches": True},
    )

    with caplog_mp_spawn(logging.DEBUG) as log_holder:
        run_model(compilation_config, model_name, **model_kwargs)

    log_matches = re.findall(
        r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
        log_holder.text,
    )
    assert len(log_matches) == 1, log_holder.text
    assert int(log_matches[0]) == matches.attention_fusion
|
|
||||||
|
|||||||
@@ -1002,7 +1002,7 @@ def test_vllm_config_explicit_overrides():
|
|||||||
assert config.compilation_config.pass_config.fuse_attn_quant is True
|
assert config.compilation_config.pass_config.fuse_attn_quant is True
|
||||||
|
|
||||||
# Explicit cudagraph mode override on quantized model at O2
|
# Explicit cudagraph mode override on quantized model at O2
|
||||||
pass_config = PassConfig(fuse_gemm_comms=True)
|
pass_config = PassConfig(enable_qk_norm_rope_fusion=True)
|
||||||
compilation_config = CompilationConfig(
|
compilation_config = CompilationConfig(
|
||||||
cudagraph_mode=CUDAGraphMode.NONE, pass_config=pass_config
|
cudagraph_mode=CUDAGraphMode.NONE, pass_config=pass_config
|
||||||
)
|
)
|
||||||
@@ -1012,7 +1012,7 @@ def test_vllm_config_explicit_overrides():
|
|||||||
compilation_config=compilation_config,
|
compilation_config=compilation_config,
|
||||||
)
|
)
|
||||||
assert config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE
|
assert config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE
|
||||||
assert config.compilation_config.pass_config.fuse_gemm_comms is True
|
assert config.compilation_config.pass_config.enable_qk_norm_rope_fusion is True
|
||||||
# Mode should still use default for O2
|
# Mode should still use default for O2
|
||||||
assert config.compilation_config.mode == CompilationMode.VLLM_COMPILE
|
assert config.compilation_config.mode == CompilationMode.VLLM_COMPILE
|
||||||
|
|
||||||
|
|||||||
@@ -766,7 +766,12 @@ class VllmConfig:
|
|||||||
if self.compilation_config.pass_config.fuse_gemm_comms:
|
if self.compilation_config.pass_config.fuse_gemm_comms:
|
||||||
self.compilation_config.pass_config.enable_sp = True
|
self.compilation_config.pass_config.enable_sp = True
|
||||||
if self.compilation_config.pass_config.enable_sp:
|
if self.compilation_config.pass_config.enable_sp:
|
||||||
if "-rms_norm" in self.compilation_config.custom_ops:
|
if self.parallel_config.tensor_parallel_size == 1:
|
||||||
|
logger.warning("Sequence Parallelism requires TP>1, disabling")
|
||||||
|
self.compilation_config.pass_config.enable_sp = False
|
||||||
|
self.compilation_config.pass_config.fuse_gemm_comms = False
|
||||||
|
|
||||||
|
elif "-rms_norm" in self.compilation_config.custom_ops:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"RMS norm force disabled, sequence parallelism might break"
|
"RMS norm force disabled, sequence parallelism might break"
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user