[vLLM IR] 1/N Implement IR skeleton and rms_norm op (#33825)

Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: Xinyu Chen <xinyu1.chen@intel.com>
Signed-off-by: chzhang <chaojun.zhang@intel.com>
Signed-off-by: Luka Govedic <luka.govedic@gmail.com>
Co-authored-by: Xinyu Chen <xinyu1.chen@intel.com>
Co-authored-by: Chaojun Zhang <chaojun.zhang@intel.com>
Co-authored-by: Luka Govedič <ProExpertProg@h100-01.nemg-001.lab.rdu2.dc.redhat.com>
This commit is contained in:
Luka Govedič
2026-03-31 22:15:05 -04:00
committed by GitHub
parent 0fab52f0aa
commit 40bb175027
49 changed files with 2177 additions and 265 deletions

View File

@@ -99,6 +99,8 @@ def test_tp1_fp8_fusions(
model_kwargs["hf_overrides"] = hf_overrides(n_layers)
model_kwargs["load_format"] = "dummy"
model_kwargs["max_model_len"] = 1024
model_kwargs["kernel_config"] = {"enable_flashinfer_autotune": False}
compilation_config = dict(
use_inductor_graph_partition=inductor_graph_partition,
custom_ops=custom_ops.split(","),
@@ -166,6 +168,7 @@ def test_tp1_fp4_fusions(
model_kwargs["hf_overrides"] = hf_overrides(n_layers)
model_kwargs["load_format"] = "dummy"
model_kwargs["max_model_len"] = 1024
model_kwargs["kernel_config"] = {"enable_flashinfer_autotune": False}
compilation_config = dict(
use_inductor_graph_partition=inductor_graph_partition,