[torch.compile] integration with compilation control (#9058)
1 changed file: vllm/envs.py · 16 changed lines (+3, −13)
@@ -65,6 +65,7 @@ if TYPE_CHECKING:
     VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
     VLLM_SKIP_P2P_CHECK: bool = False
     VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1: bool = False
+    VLLM_TORCH_COMPILE_LEVEL: int = 0
 
 
 def get_default_cache_root():
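At runtime, vllm/envs.py does not read these names from the `if TYPE_CHECKING:` block; the declarations exist only so type checkers and IDEs see the attribute and its type. The actual values come from the `environment_variables` dict of lambdas changed in the next hunk, resolved lazily through a module-level `__getattr__`. Below is a minimal, self-contained sketch of that pattern, simplified from the real module; only the VLLM_TORCH_COMPILE_LEVEL entry is taken from this diff.

import os
from typing import Any, Callable, Dict

# Each entry maps a variable name to a zero-argument callable, so the
# environment is re-read on every attribute access rather than once at
# import time.
environment_variables: Dict[str, Callable[[], Any]] = {
    "VLLM_TORCH_COMPILE_LEVEL":
    lambda: int(os.environ.get("VLLM_TORCH_COMPILE_LEVEL", "0")),
}


def __getattr__(name: str) -> Any:
    # Module-level __getattr__ (PEP 562): evaluates the lambda lazily,
    # so `import envs; envs.VLLM_TORCH_COMPILE_LEVEL` reflects the
    # current process environment at the moment of access.
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

Because each access re-evaluates the lambda, tests can change os.environ and see the new value immediately, which is presumably why parsing is deferred rather than done at import time.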
@@ -198,23 +199,12 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in
              ("true", "1")),
 
-    # Internal flag to enable Dynamo graph capture
-    "VLLM_TEST_DYNAMO_GRAPH_CAPTURE":
-    lambda: int(os.environ.get("VLLM_TEST_DYNAMO_GRAPH_CAPTURE", "0")),
     "VLLM_DYNAMO_USE_CUSTOM_DISPATCHER":
     lambda:
     (os.environ.get("VLLM_DYNAMO_USE_CUSTOM_DISPATCHER", "True").lower() in
      ("true", "1")),
-
-    # Internal flag to control whether we use custom op,
-    # or use the native pytorch implementation
-    "VLLM_TEST_COMPILE_NO_CUSTOM_OPS":
-    lambda: int(os.environ.get("VLLM_TEST_COMPILE_NO_CUSTOM_OPS", "0")),
-
-    # Internal flag to enable Dynamo fullgraph capture
-    "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE":
-    lambda: bool(
-        os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"),
+    "VLLM_TORCH_COMPILE_LEVEL":
+    lambda: int(os.environ.get("VLLM_TORCH_COMPILE_LEVEL", "0")),
 
     # local rank of the process in the distributed setting, used to determine
     # the GPU device id
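The net effect of this hunk is that three ad-hoc VLLM_TEST_* flags collapse into one integer knob, VLLM_TORCH_COMPILE_LEVEL, defaulting to 0 (no compilation). What follows is a hedged sketch of how callers can branch on it; the CompilationLevel constants are modeled on the levels this PR introduces elsewhere (vllm/compilation/levels.py), but their exact names and values are assumptions here, since that file is not part of this envs.py diff.

import os


class CompilationLevel:
    # Assumed ladder of levels; values are illustrative, with higher
    # numbers enabling more aggressive compilation.
    NO_COMPILATION = 0
    DYNAMO_AS_IS = 1
    DYNAMO_ONCE = 2
    INDUCTOR = 3


def should_use_torch_compile() -> bool:
    # Mirrors the envs.py accessor: read the level from the environment,
    # defaulting to 0 (no compilation), then gate on the chosen threshold.
    level = int(os.environ.get("VLLM_TORCH_COMPILE_LEVEL", "0"))
    return level >= CompilationLevel.DYNAMO_AS_IS

Usage is then an ordinary environment override at launch (e.g. exporting VLLM_TORCH_COMPILE_LEVEL=1 before starting the server), with the default of 0 keeping eager-mode behavior per the diff above.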