[torch.compile] Add support for non-contiguous fused RMSNorm + group quant (#36551)

Signed-off-by: Luka Govedič <lgovedic@redhat.com> Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com> Co-authored-by: ProExpertProg <11367180+ProExpertProg@users.noreply.github.com>
2026-03-11 13:56:55 -04:00
parent a1a3523a56
commit 9556af87d5
9 changed files with 219 additions and 87 deletions
--- a/.buildkite/test_areas/compile.yaml
+++ b/.buildkite/test_areas/compile.yaml
@@ -101,8 +101,8 @@ steps:
    - nvidia-smi
    # Run all models and attn backends but only Inductor partition and native custom ops
    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
-    # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
-    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
+    # Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and (qwen3 or deepseek)"

 - label: Fusion E2E Config Sweep (H100)
  timeout_in_minutes: 30
@@ -132,9 +132,9 @@ steps:
  commands:
    - nvidia-smi
    # Run all models but only FLASHINFER, Inductor partition and native custom ops
-    # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
+    # Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
    # Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
-    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3) or llama-3)"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek)) or llama-3)"

 - label: Fusion E2E TP2 Quick (H100)
  timeout_in_minutes: 20
@@ -150,8 +150,8 @@ steps:
  commands:
    - nvidia-smi
    # Run all models and attn backends but only Inductor partition and native custom ops
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"

 - label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
  timeout_in_minutes: 40
@@ -205,7 +205,7 @@ steps:
  commands:
    - nvidia-smi
    # Run all models but only FLASHINFER, Inductor partition and native custom ops
-    # include qwen with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
+    # include qwen/deepseek with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
    # for ar-rms-quant-fp4, also sweep llama3
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)) or Llama-3.1-8B-Instruct-FP4"
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))) or Llama-3.1-8B-Instruct-FP4"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"