[FrontEnd] UNREVERT CompilationConfig overhaul (#20283): deprecate use_inductor in favor of backend, simplify custom_ops (#26502)

Signed-off-by: morrison-turnansky <mturnans@redhat.com>
Signed-off-by: Morrison Turnansky <mturnans@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
This commit is contained in:
Morrison Turnansky
2025-10-13 18:47:16 -04:00
committed by GitHub
parent 7200a21cd1
commit e3fdb627d9
8 changed files with 153 additions and 86 deletions

View File

@@ -258,13 +258,13 @@ def tractable_computation(
@torch.inference_mode
def run_model(
llama_config, use_compile: bool, use_inductor: bool, split_attn: bool = False
llama_config, use_compile: bool, backend: str, split_attn: bool = False
) -> torch.Tensor:
if use_compile:
compilation_config = CompilationConfig(
level=CompilationLevel.PIECEWISE,
use_cudagraph=True,
use_inductor=use_inductor,
backend=backend,
cudagraph_capture_sizes=[1, 2],
)
if split_attn:
@@ -338,8 +338,8 @@ def run_model(
return output.cpu()
@pytest.mark.parametrize("use_inductor", [True, False])
def test_toy_llama(use_inductor: bool):
@pytest.mark.parametrize("backend", ["inductor", "eager"])
def test_toy_llama(backend: str):
# compare output with and without piecewise compilation
llama_config = LlamaConfig(
@@ -358,10 +358,10 @@ def test_toy_llama(use_inductor: bool):
num_backend_compilations=0,
num_cudagraph_captured=0,
):
outputs.append(run_model(llama_config, use_inductor=False, use_compile=False))
run_model(tractable_config, use_inductor=False, use_compile=False)
outputs.append(run_model(llama_config, backend="eager", use_compile=False))
run_model(tractable_config, backend="eager", use_compile=False)
if use_inductor:
if backend == "inductor":
kwargs = {"num_inductor_compiles": 1, "num_eager_compiles": 0}
else:
kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0}
@@ -377,10 +377,8 @@ def test_toy_llama(use_inductor: bool):
num_cudagraph_captured=2,
**kwargs,
):
outputs.append(
run_model(llama_config, use_inductor=use_inductor, use_compile=True)
)
run_model(tractable_config, use_inductor=use_inductor, use_compile=True)
outputs.append(run_model(llama_config, backend=backend, use_compile=True))
run_model(tractable_config, backend=backend, use_compile=True)
with compilation_counter.expect(
num_graphs_seen=1, # one graph for the model
@@ -395,16 +393,9 @@ def test_toy_llama(use_inductor: bool):
), # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
):
outputs.append(
run_model(
llama_config,
use_inductor=use_inductor,
use_compile=True,
split_attn=True,
)
run_model(llama_config, backend=backend, use_compile=True, split_attn=True)
)
run_model(
tractable_config, use_inductor=use_inductor, use_compile=True, split_attn=True
)
run_model(tractable_config, backend=backend, use_compile=True, split_attn=True)
for i in range(1, len(outputs)):
assert torch.allclose(outputs[0], outputs[i])

View File

@@ -77,14 +77,15 @@ class TestSetting:
method="encode",
),
# vision language model
TestSetting(
model="microsoft/Phi-3.5-vision-instruct",
model_args=["--trust-remote-code", "--max-model-len", "2048"],
pp_size=2,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate_with_image",
),
# This setting is disabled (commented out below); see https://github.com/vllm-project/vllm/issues/26716.
# TestSetting(
# model="microsoft/Phi-3.5-vision-instruct",
# model_args=["--trust-remote-code", "--max-model-len", "2048"],
# pp_size=2,
# tp_size=1,
# attn_backend="FLASH_ATTN",
# method="generate_with_image",
# ),
],
)
def test_compile_correctness(
@@ -109,41 +110,46 @@ def test_compile_correctness(
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
final_args = [
"--enforce-eager",
*model_args,
"-pp",
str(pp_size),
"-tp",
str(tp_size),
"-O.cudagraph_mode=none",
]
all_args: list[list[str]] = []
all_envs: list[dict[str, str] | None] = []
for level in [
CompilationLevel.NO_COMPILATION,
for comp_level in [
CompilationLevel.DYNAMO_AS_IS,
CompilationLevel.DYNAMO_ONCE,
CompilationLevel.PIECEWISE,
]:
all_args.append(final_args + [f"-O{level}"])
all_envs.append({})
for level in [CompilationLevel.NO_COMPILATION, comp_level]:
all_args.append(
final_args + [f"-O.level={level}", "-O.backend=inductor"]
)
# inductor will change the output, so we only compare if the output
# is close, not exactly the same.
compare_all_settings(
model,
all_args,
all_envs,
method=method if method != "generate" else "generate_close",
)
all_envs.clear()
all_args.clear()
# inductor will change the output, so we only compare if the output
# is close, not exactly the same.
compare_all_settings(
model,
all_args,
all_envs,
method=method if method != "generate" else "generate_close",
)
all_envs.clear()
all_args.clear()
for level in [
CompilationLevel.NO_COMPILATION,
CompilationLevel.DYNAMO_AS_IS,
CompilationLevel.DYNAMO_ONCE,
CompilationLevel.PIECEWISE,
]:
all_args.append(final_args + [f"-O{level}"])
all_args.append(final_args + [f"-O.level={level}", "-O.backend=eager"])
all_envs.append({})
all_envs.append({})
compare_all_settings(model, all_args * 3, all_envs, method=method)