[6/N] torch.compile rollout to users (#10437)

Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-19 10:09:03 -08:00
parent fd9f124971
commit 803f37eaaa
15 changed files with 129 additions and 141 deletions
--- a/tests/compile/utils.py
+++ b/tests/compile/utils.py
@@ -4,7 +4,7 @@ import torch

 from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
-from vllm.config import CompilationLevel
+from vllm.config import CompilationConfig, CompilationLevel
 from vllm.platforms import current_platform

 TEST_MODELS = [
@@ -65,7 +65,6 @@ def check_full_graph_support(model,
                             optimization_level,
                             tp_size=1):
    # make sure these models can be captured in full graph mode
-    os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(optimization_level)
    os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"

    # The base meta llama uses too much memory.
@@ -86,6 +85,7 @@ def check_full_graph_support(model,
              enforce_eager=True,
              tensor_parallel_size=tp_size,
              disable_custom_all_reduce=True,
+              compilation_config=CompilationConfig(level=optimization_level),
              **model_kwargs)

    outputs = llm.generate(prompts, sampling_params)