diff --git a/vllm/compilation/passes/fusion/sequence_parallelism.py b/vllm/compilation/passes/fusion/sequence_parallelism.py
index 63de85932..b7ae3dc62 100644
--- a/vllm/compilation/passes/fusion/sequence_parallelism.py
+++ b/vllm/compilation/passes/fusion/sequence_parallelism.py
@@ -18,7 +18,6 @@ from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     kFp8StaticTensorSym,
 )
-from vllm.platforms import current_platform
 
 from ..inductor_pass import enable_fake_mode
 from ..utility.noop_elimination import NoOpEliminationPass
@@ -215,9 +214,6 @@ class MiddleAllReduceRMSNormPattern(_SequenceParallelPatternHelper):
         )
 
 
-FP8_DTYPE = current_platform.fp8_dtype()
-
-
 class FirstAllReduceRMSNormStaticFP8Pattern(_SequenceParallelPatternHelper):
     def __init__(
         self,
diff --git a/vllm/config/model.py b/vllm/config/model.py
index 012b2b1c9..4e3568fa1 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -461,8 +461,6 @@ class ModelConfig:
 
         self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer)
 
-        from vllm.platforms import current_platform
-
         if self.override_attention_dtype is not None and not current_platform.is_rocm():
             warnings.warn(
                 "override-attention-dtype is set but not using ROCm platform",
@@ -940,8 +938,6 @@ class ModelConfig:
                     f"Unknown quantization method: {self.quantization}. Must "
                    f"be one of {supported_quantization}."
                )
 
-            from vllm.platforms import current_platform
-
             current_platform.verify_quantization(self.quantization)
             if self.quantization in me_quant.DEPRECATED_QUANTIZATION_METHODS:
@@ -1811,8 +1807,6 @@ def _resolve_auto_dtype(
     *,
     is_pooling_model: bool,
 ):
-    from vllm.platforms import current_platform
-
     supported_dtypes = [
         dtype
         for dtype in current_platform.supported_dtypes
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 9e6b6df08..40b797a1a 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -385,8 +385,6 @@ class GroupCoordinator:
                 self.cpu_group, 1 << 22, 6
             )
 
-        from vllm.platforms import current_platform
-
         self.use_custom_op_call = (
             current_platform.is_cuda_alike() or current_platform.is_tpu()
         )
diff --git a/vllm/v1/attention/backends/fa_utils.py b/vllm/v1/attention/backends/fa_utils.py
index 9658a7e3c..4039316c3 100644
--- a/vllm/v1/attention/backends/fa_utils.py
+++ b/vllm/v1/attention/backends/fa_utils.py
@@ -55,9 +55,6 @@ elif current_platform.is_rocm():
 def get_flash_attn_version(
     requires_alibi: bool = False, head_size: int | None = None
 ) -> int | None:
-    # import here to avoid circular dependencies
-    from vllm.platforms import current_platform
-
     if current_platform.is_xpu():
         return 2
     if current_platform.is_rocm():
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 233251d07..4362bacb7 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -374,8 +374,6 @@ class FlashInferBackend(AttentionBackend):
 
     @classmethod
     def get_required_kv_cache_layout(cls) -> KVCacheLayoutType | None:
-        from vllm.platforms import current_platform
-
         capability = current_platform.get_device_capability()
         if capability is not None and capability.major == 10:
             return "HND"
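
Note (not part of the patch): every hunk above applies the same pattern, dropping a deferred `from vllm.platforms import current_platform` from a function or method body so the call sites rely on a module-level import instead. Below is a minimal sketch of that pattern, assuming the affected module already has (or gains elsewhere in this change) the top-level import; the function names are illustrative, not taken from the patch.

from vllm.platforms import current_platform  # module-level import the call sites now rely on


def kv_cache_layout_with_deferred_import() -> str | None:
    # Before: the import was deferred into the function body (historically to
    # avoid circular imports during module initialization).
    from vllm.platforms import current_platform

    capability = current_platform.get_device_capability()
    return "HND" if capability is not None and capability.major == 10 else None


def kv_cache_layout_with_module_import() -> str | None:
    # After: the body uses the module-level import directly.
    capability = current_platform.get_device_capability()
    return "HND" if capability is not None and capability.major == 10 else None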