[vLLM IR] 1/N Implement IR skeleton and rms_norm op (#33825)
Signed-off-by: Luka Govedič <lgovedic@redhat.com> Signed-off-by: Xinyu Chen <xinyu1.chen@intel.com> Signed-off-by: chzhang <chaojun.zhang@intel.com> Signed-off-by: Luka Govedic <luka.govedic@gmail.com> Co-authored-by: Xinyu Chen <xinyu1.chen@intel.com> Co-authored-by: Chaojun Zhang <chaojun.zhang@intel.com> Co-authored-by: Luka Govedič <ProExpertProg@h100-01.nemg-001.lab.rdu2.dc.redhat.com>
This commit is contained in:
@@ -58,6 +58,7 @@ def test_tp2_async_tp_fp8_fusions(
|
||||
model_kwargs["hf_overrides"] = hf_overrides(n_layers)
|
||||
model_kwargs["load_format"] = "dummy"
|
||||
model_kwargs["max_model_len"] = 1024
|
||||
model_kwargs["kernel_config"] = {"enable_flashinfer_autotune": False}
|
||||
|
||||
compilation_config = dict(
|
||||
use_inductor_graph_partition=inductor_graph_partition,
|
||||
@@ -121,6 +122,7 @@ def test_tp2_async_tp_fusions(
|
||||
model_kwargs["hf_overrides"] = hf_overrides(n_layers)
|
||||
model_kwargs["load_format"] = "dummy"
|
||||
model_kwargs["max_model_len"] = 1024
|
||||
model_kwargs["kernel_config"] = {"enable_flashinfer_autotune": False}
|
||||
|
||||
compilation_config = dict(
|
||||
use_inductor_graph_partition=inductor_graph_partition,
|
||||
|
||||
Reference in New Issue
Block a user