diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 8e2e0c4ab..7d9fd0d2f 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -28,7 +28,7 @@ from vllm.config.compilation import DynamicShapesType
 from vllm.logger import init_logger
 from vllm.sequence import IntermediateTensors
 from vllm.utils.import_utils import resolve_obj_by_qualname
-from vllm.utils.torch_utils import is_torch_equal_or_newer, supports_dynamo
+from vllm.utils.torch_utils import is_torch_equal_or_newer
 
 from .monitor import start_monitoring_torch_compile
 
@@ -312,7 +312,6 @@ def _support_torch_compile(
         self.do_not_compile = (
             self.compilation_config.mode
             in [CompilationMode.NONE, CompilationMode.STOCK_TORCH_COMPILE]
-            or not supports_dynamo()
             or _should_ignore_torch_compile(self.__class__)
             or not enable_compile
         )
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index c0f330408..d8c6ceba3 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -53,7 +53,6 @@ from vllm.utils.network_utils import get_distributed_init_method
 from vllm.utils.system_utils import suppress_stdout
 from vllm.utils.torch_utils import (
     direct_register_custom_op,
-    supports_custom_op,
 )
 
 
@@ -246,33 +245,32 @@ def patched_fused_scaled_matmul_reduce_scatter(
     )
 
 
-if supports_custom_op():
-    direct_register_custom_op(
-        op_name="all_reduce",
-        op_func=all_reduce,
-        fake_impl=all_reduce_fake,
-    )
+direct_register_custom_op(
+    op_name="all_reduce",
+    op_func=all_reduce,
+    fake_impl=all_reduce_fake,
+)
 
-    direct_register_custom_op(
-        op_name="reduce_scatter",
-        op_func=reduce_scatter,
-        fake_impl=reduce_scatter_fake,
-    )
+direct_register_custom_op(
+    op_name="reduce_scatter",
+    op_func=reduce_scatter,
+    fake_impl=reduce_scatter_fake,
+)
 
-    direct_register_custom_op(
-        op_name="all_gather",
-        op_func=all_gather,
-        fake_impl=all_gather_fake,
-    )
+direct_register_custom_op(
+    op_name="all_gather",
+    op_func=all_gather,
+    fake_impl=all_gather_fake,
+)
 
-    # TODO: Remove this once the pytorch fix
-    # (https://github.com/pytorch/pytorch/pull/165086) gets released,
-    # in either 2.9.1 or 2.10
-    direct_register_custom_op(
-        op_name="patched_fused_scaled_matmul_reduce_scatter",
-        op_func=patched_fused_scaled_matmul_reduce_scatter,
-        fake_impl=patched_fused_scaled_matmul_reduce_scatter_fake,
-    )
+# TODO: Remove this once the pytorch fix
+# (https://github.com/pytorch/pytorch/pull/165086) gets released,
+# in either 2.9.1 or 2.10
+direct_register_custom_op(
+    op_name="patched_fused_scaled_matmul_reduce_scatter",
+    op_func=patched_fused_scaled_matmul_reduce_scatter,
+    fake_impl=patched_fused_scaled_matmul_reduce_scatter_fake,
+)
 
 
 class GroupCoordinator:
diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py
index f0c7e9366..3f13572f6 100644
--- a/vllm/utils/torch_utils.py
+++ b/vllm/utils/torch_utils.py
@@ -704,13 +704,6 @@ def is_torch_equal(target: str) -> bool:
     return Version(importlib.metadata.version("torch")) == Version(target)
 
 
-# Using dynamo with vLLM doesn't really work well with PyTorch versions < 2.4.0.
-# In particular, the FakeScalarType is not supported for earlier versions of
-# PyTorch which breaks dynamo for any ops registered using ScalarType.
-def supports_dynamo() -> bool:
-    return is_torch_equal_or_newer("2.4.0")
-
-
 # Supports xccl with PyTorch versions >= 2.8.0.dev for XPU platform
 def supports_xccl() -> bool:
     return (
@@ -718,12 +711,6 @@ def supports_xccl() -> bool:
     )
 
 
-# Some backends use pytorch version < 2.4.0 which doesn't
-# support `torch.library.custom_op`.
-def supports_custom_op() -> bool:
-    return hasattr(torch.library, "custom_op")
-
-
 # create a library to hold the custom op
 vllm_lib = Library("vllm", "FRAGMENT")  # noqa
 
@@ -752,18 +739,6 @@ def direct_register_custom_op(
     library object. If you want to bind the operator to a different
     library, make sure the library object is alive when the operator is used.
     """
-    if not supports_custom_op():
-        from vllm.platforms import current_platform
-
-        assert not current_platform.is_cuda_alike(), (
-            "cuda platform needs torch>=2.4 to support custom op, "
-            "chances are you are using an old version of pytorch "
-            "or a custom build of pytorch. It is recommended to "
-            "use vLLM in a fresh new environment and let it install "
-            "the required dependencies."
-        )
-        return
-
     if mutates_args is None:
         mutates_args = []
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index de2a1e371..c20dbbbd4 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -96,7 +96,6 @@ from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.utils.torch_utils import (
     get_dtype_size,
     kv_cache_dtype_str_to_dtype,
-    supports_dynamo,
 )
 from vllm.v1.attention.backend import (
     AttentionBackend,
@@ -3944,7 +3943,6 @@ class GPUModelRunner(
         if (
             self.vllm_config.compilation_config.mode
             == CompilationMode.STOCK_TORCH_COMPILE
-            and supports_dynamo()
         ):
             backend = self.vllm_config.compilation_config.init_backend(self.vllm_config)
             compilation_counter.stock_torch_compile_count += 1