diff --git a/benchmarks/kernels/cpu/benchmark_cpu_attn.py b/benchmarks/kernels/cpu/benchmark_cpu_attn.py index d03b70a9f..63d034278 100644 --- a/benchmarks/kernels/cpu/benchmark_cpu_attn.py +++ b/benchmarks/kernels/cpu/benchmark_cpu_attn.py @@ -27,7 +27,7 @@ def get_attn_isa( else: if current_platform.get_cpu_architecture() == CpuArchEnum.ARM: return "neon" - elif torch._C._cpu._is_amx_tile_supported(): + elif torch.cpu._is_amx_tile_supported(): return "amx" else: return "vec" diff --git a/benchmarks/kernels/cpu/benchmark_cpu_fused_moe.py b/benchmarks/kernels/cpu/benchmark_cpu_fused_moe.py index df6a9c60a..aff443083 100644 --- a/benchmarks/kernels/cpu/benchmark_cpu_fused_moe.py +++ b/benchmarks/kernels/cpu/benchmark_cpu_fused_moe.py @@ -24,7 +24,7 @@ except (ImportError, AttributeError) as e: sys.exit(1) # ISA selection following test_cpu_fused_moe.py pattern -ISA_CHOICES = ["amx", "vec"] if torch._C._cpu._is_amx_tile_supported() else ["vec"] +ISA_CHOICES = ["amx", "vec"] if torch.cpu._is_amx_tile_supported() else ["vec"] @torch.inference_mode() diff --git a/tests/kernels/attention/test_cpu_attn.py b/tests/kernels/attention/test_cpu_attn.py index 9636dfb95..7e3d77134 100644 --- a/tests/kernels/attention/test_cpu_attn.py +++ b/tests/kernels/attention/test_cpu_attn.py @@ -48,7 +48,7 @@ def get_attn_isa( else: if current_platform.get_cpu_architecture() == CpuArchEnum.ARM: return "neon" - elif torch._C._cpu._is_amx_tile_supported(): + elif torch.cpu._is_amx_tile_supported(): return "amx" else: return "vec" @@ -400,9 +400,7 @@ def test_varlen_with_paged_kv_normal_vec( @pytest.mark.parametrize("use_alibi", [False]) @pytest.mark.parametrize("use_sink", [False]) @pytest.mark.parametrize("isa", ["amx"]) -@pytest.mark.skipif( - not torch._C._cpu._is_amx_tile_supported(), reason="no AMX support." -) +@pytest.mark.skipif(not torch.cpu._is_amx_tile_supported(), reason="no AMX support.") def test_varlen_with_paged_kv_normal_amx( seq_lens: list[tuple[int, int]], num_heads: tuple[int, int], diff --git a/tests/kernels/moe/test_cpu_fused_moe.py b/tests/kernels/moe/test_cpu_fused_moe.py index 839eceeeb..467ba3c5f 100644 --- a/tests/kernels/moe/test_cpu_fused_moe.py +++ b/tests/kernels/moe/test_cpu_fused_moe.py @@ -22,7 +22,7 @@ INTERMEDIATE_DIM = [128, 2880] BATCH_SIZE = [1, 64, 256] ACT = [MoEActivation.SILU, MoEActivation.SWIGLUOAI] USE_BIAS = [True, False] -ISA = ["amx", "vec"] if torch._C._cpu._is_amx_tile_supported() else ["vec"] +ISA = ["amx", "vec"] if torch.cpu._is_amx_tile_supported() else ["vec"] DTYPE = [torch.bfloat16] diff --git a/vllm/model_executor/kernels/linear/mixed_precision/cpu.py b/vllm/model_executor/kernels/linear/mixed_precision/cpu.py index d5ca625f0..afd41b72f 100644 --- a/vllm/model_executor/kernels/linear/mixed_precision/cpu.py +++ b/vllm/model_executor/kernels/linear/mixed_precision/cpu.py @@ -119,7 +119,7 @@ class CPUWNA16LinearKernel(MPLinearKernel): def _get_isa_hint(dtype: torch.dtype) -> str: - supports_amx = torch._C._cpu._is_amx_tile_supported() + supports_amx = torch.cpu._is_amx_tile_supported() if supports_amx and dtype in (torch.bfloat16,): return "amx" else: diff --git a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py index f220a2fdd..72e9db514 100644 --- a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py @@ -280,7 +280,7 @@ class CPUFusedMOE: if not (w13_output_size % 32 == 0 and w2_output_size % 32 == 0): return False, "none" - supports_amx = torch._C._cpu._is_amx_tile_supported() + supports_amx = torch.cpu._is_amx_tile_supported() if ( supports_amx diff --git a/vllm/model_executor/layers/quantization/cpu_wna16.py b/vllm/model_executor/layers/quantization/cpu_wna16.py index 21e59a6f1..ea7afef27 100644 --- a/vllm/model_executor/layers/quantization/cpu_wna16.py +++ b/vllm/model_executor/layers/quantization/cpu_wna16.py @@ -292,7 +292,7 @@ class CPUAWQLinearMethod(LinearMethodBase): def _get_isa_hint(dtype: torch.dtype) -> str: - supports_amx = torch._C._cpu._is_amx_tile_supported() + supports_amx = torch.cpu._is_amx_tile_supported() if supports_amx and dtype in (torch.bfloat16,): return "amx" else: diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index 5a526f127..757d1ecc5 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -212,7 +212,7 @@ direct_register_custom_op( def check_cpu_sgl_kernel(n: int, k: int, dtype: torch.dtype) -> bool: return ( - torch._C._cpu._is_amx_tile_supported() + torch.cpu._is_amx_tile_supported() and (dtype in (torch.bfloat16, torch.int8)) and k % 32 == 0 and n % 16 == 0 diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py index 689109aac..5fa3844c8 100644 --- a/vllm/v1/attention/backends/cpu_attn.py +++ b/vllm/v1/attention/backends/cpu_attn.py @@ -482,7 +482,7 @@ def _get_attn_isa( ) -> str: if head_size is not None and head_size % 32 != 0 and head_size % 16 == 0: return "vec16" - supports_amx = torch._C._cpu._is_amx_tile_supported() + supports_amx = torch.cpu._is_amx_tile_supported() supports_arm = current_platform.get_cpu_architecture() == CpuArchEnum.ARM supports_vxe = current_platform.get_cpu_architecture() == CpuArchEnum.S390X if supports_amx and dtype in (torch.bfloat16,) and block_size % 32 == 0: