diff --git a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py
index ffcfa4572..867f55fa9 100644
--- a/benchmarks/attention_benchmarks/mla_runner.py
+++ b/benchmarks/attention_benchmarks/mla_runner.py
@@ -701,7 +701,7 @@ def _run_single_benchmark(
     # Warmup
     for _ in range(config.warmup_iters):
         forward_fn()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Benchmark
     times = []
@@ -714,7 +714,7 @@ def _run_single_benchmark(
             forward_fn()
         end.record()
 
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         elapsed_ms = start.elapsed_time(end)
         times.append(elapsed_ms / 1000.0 / config.num_layers)
 
diff --git a/benchmarks/attention_benchmarks/runner.py b/benchmarks/attention_benchmarks/runner.py
index 6457a599a..9744b857d 100644
--- a/benchmarks/attention_benchmarks/runner.py
+++ b/benchmarks/attention_benchmarks/runner.py
@@ -391,7 +391,7 @@ def _run_single_benchmark(
                 attn_metadata,
                 output=out,
             )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Benchmark
     times = []
@@ -412,7 +412,7 @@ def _run_single_benchmark(
             )
         end.record()
 
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         elapsed_ms = start.elapsed_time(end)
         times.append(elapsed_ms / 1000.0 / config.num_layers)  # seconds per layer
 
diff --git a/benchmarks/benchmark_topk_topp.py b/benchmarks/benchmark_topk_topp.py
index aa020e012..f1d59cbde 100644
--- a/benchmarks/benchmark_topk_topp.py
+++ b/benchmarks/benchmark_topk_topp.py
@@ -94,7 +94,7 @@ def create_logits(
 
 def measure_memory() -> tuple[int, int]:
     """Return (allocated, reserved) memory in bytes."""
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     return torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated()
 
 
@@ -123,7 +123,7 @@ def benchmark_function(
     for _ in range(warmup_iters):
         logits_copy = logits.clone()
         func(logits_copy, k, p)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Reset memory stats before benchmark
     reset_memory_stats()
@@ -140,7 +140,7 @@ def benchmark_function(
         func(logits_copy, k, p)
         end_events[i].record()
 
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Calculate timing
     times = [
diff --git a/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
index 04921dafb..8aaf82197 100644
--- a/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
@@ -168,7 +168,7 @@ def bench_impl(
     # warmup
     for kwargs in kwargs_list:
         impl_type.get_impl()(**kwargs)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Merge into a single kwargs and qualify arguments as ArgPool
     kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
index bd116e36a..58ccfcc45 100644
--- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
@@ -171,7 +171,7 @@ def bench_run(
                 activation=MoEActivation.SILU,
                 global_num_experts=num_experts,
             )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
     triton_stream = torch.cuda.Stream()
@@ -187,14 +187,14 @@ def bench_run(
                 topk_ids,
                 quant_config=quant_config,
             )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
         """Benchmark CUDA graph using events like benchmark_moe.py"""
         # Warmup
         for _ in range(num_warmup):
             graph.replay()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
         # Timing
         start_event = torch.Event(enable_timing=True)
@@ -202,7 +202,7 @@ def bench_run(
 
         latencies = []
         for _ in range(num_iters):
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
             start_event.record()
             graph.replay()
             end_event.record()
diff --git a/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py b/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py
index cfb1489da..2d4afd38c 100644
--- a/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py
@@ -307,7 +307,7 @@ def bench_run(
     def replay_graph(graph, num_repeats):
         for _ in range(num_repeats):
             graph.replay()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
     cutlass_stream = torch.cuda.Stream()
     cutlass_graph = torch.cuda.CUDAGraph()
@@ -330,7 +330,7 @@ def bench_run(
             e=num_experts,
             device=device,
         )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     triton_stream = torch.cuda.Stream()
     triton_graph = torch.cuda.CUDAGraph()
@@ -345,7 +345,7 @@ def bench_run(
             w2_fp8scale,
             a_fp8_scale,
         )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     min_run_time = 5
     num_warmup = 5
diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py
index d1005461a..9b5ccac4e 100644
--- a/benchmarks/kernels/benchmark_device_communicators.py
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@@ -342,7 +342,7 @@ class CommunicatorBenchmark:
             if not should_use_fn(tensor):
                 return None
 
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
             stream = torch.cuda.Stream()
             with torch.cuda.stream(stream):
                 graph_input = tensor.clone()
@@ -360,17 +360,17 @@ class CommunicatorBenchmark:
                         for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
                             allreduce_fn(graph_input)
 
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
             for _ in range(num_warmup):
                 graph.replay()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
 
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
             start_time = time.perf_counter()
 
             for _ in range(num_trials):
                 graph.replay()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
 
             end_time = time.perf_counter()
 
diff --git a/benchmarks/kernels/benchmark_fused_collective.py b/benchmarks/kernels/benchmark_fused_collective.py
index e18f6a758..2547f553f 100644
--- a/benchmarks/kernels/benchmark_fused_collective.py
+++ b/benchmarks/kernels/benchmark_fused_collective.py
@@ -385,7 +385,7 @@ def benchmark_operation(
     # Warmup before graph capture
     for _ in range(warmup):
         operation_func(*args, **kwargs)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Create CUDA graph
     graph = torch.cuda.CUDAGraph()
@@ -398,19 +398,19 @@ def benchmark_operation(
             operation_func(*args, **kwargs)
 
     # Graph warmup
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     for _ in range(warmup):
         graph.replay()
 
     # Benchmark with CUDA graph
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     start_time = time.perf_counter()
 
     for _ in range(trials // num_op_per_cudagraph):
         # operation_func(*args, **kwargs)
         graph.replay()
 
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     end_time = time.perf_counter()
 
     avg_time_ms = ((end_time - start_time) / trials) * 1000
diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
index 60ec94b87..039eb2f29 100644
--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@@ -224,7 +224,7 @@ def bench_run(
     def replay_graph(graph, num_repeats):
         for _ in range(num_repeats):
             graph.replay()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
     cutlass_stream = torch.cuda.Stream()
     cutlass_graph = torch.cuda.CUDAGraph()
@@ -239,7 +239,7 @@ def bench_run(
             topk_weights,
             topk_ids,
         )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     triton_stream = torch.cuda.Stream()
     triton_graph = torch.cuda.CUDAGraph()
@@ -254,7 +254,7 @@ def bench_run(
             w2_scale,
             a_scale,
         )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     min_run_time = 5
     num_warmup = 5
diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py
index cc1c1cf09..a662e3ac4 100644
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@@ -34,14 +34,14 @@ def main(
     residual = torch.randn_like(x) * scale if add_residual else None
 
     def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         if profile:
             torch.cuda.cudart().cudaProfilerStart()
         start_time = time.perf_counter()
 
         for _ in range(num_iters):
             layer(x, residual)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
         end_time = time.perf_counter()
         if profile:
diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py
index 8ca3cf78f..ab930c59d 100644
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -1035,7 +1035,7 @@ def bench_optype(
     # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
     for kwargs in kwargs_list:
         op_type.bench_fn()(**kwargs)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Merge into a single kwargs and qualify arguments as ArgPool
     kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
diff --git a/benchmarks/kernels/benchmark_mla_k_concat.py b/benchmarks/kernels/benchmark_mla_k_concat.py
index fb3b6c8f1..7debf3634 100644
--- a/benchmarks/kernels/benchmark_mla_k_concat.py
+++ b/benchmarks/kernels/benchmark_mla_k_concat.py
@@ -47,13 +47,13 @@ def benchmark_method(
     # Warmup
     for _ in range(num_warmup):
         _ = method(k_nope, k_pe)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Benchmark
     start = time.perf_counter()
     for _ in range(num_iters):
         _ = method(k_nope, k_pe)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     end = time.perf_counter()
 
     return (end - start) / num_iters * 1000  # Convert to ms
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 3bd3e3f67..9ef825417 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -304,19 +304,19 @@ def benchmark_config(
 
     # JIT compilation & warmup
     run()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Capture 10 invocations with CUDA graph
     graph = torch.cuda.CUDAGraph()
     with torch.cuda.graph(graph):
         for _ in range(10):
             run()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Warmup
     for _ in range(5):
         graph.replay()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     start_event = torch.Event(enable_timing=True)
     end_event = torch.Event(enable_timing=True)
@@ -324,7 +324,7 @@ def benchmark_config(
     latencies: list[float] = []
     for i in range(num_iters):
         prepare(i)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
         start_event.record()
         graph.replay()
diff --git a/benchmarks/kernels/benchmark_moe_defaults.py b/benchmarks/kernels/benchmark_moe_defaults.py
index 9527878bc..f6ad59366 100644
--- a/benchmarks/kernels/benchmark_moe_defaults.py
+++ b/benchmarks/kernels/benchmark_moe_defaults.py
@@ -131,7 +131,7 @@ def benchmark_config(
                 topk_ids,
                 quant_config=quant_config,
             )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Benchmark
     start = torch.cuda.Event(enable_timing=True)
@@ -149,7 +149,7 @@ def benchmark_config(
                 quant_config=quant_config,
             )
     end.record()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     return start.elapsed_time(end) / num_iters * 1000  # ms -> us
 
 
diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
index d9a1d3303..990be5932 100644
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@@ -69,19 +69,19 @@ def benchmark_permute(
 
     # JIT compilation & warmup
     run()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Capture 10 invocations with CUDA graph
     graph = torch.cuda.CUDAGraph()
     with torch.cuda.graph(graph):
         for _ in range(10):
             run()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Warmup
     for _ in range(5):
         graph.replay()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     start_event = torch.Event(enable_timing=True)
     end_event = torch.Event(enable_timing=True)
@@ -89,7 +89,7 @@ def benchmark_permute(
     latencies: list[float] = []
     for i in range(num_iters):
         prepare(i)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
         start_event.record()
         graph.replay()
@@ -159,26 +159,26 @@ def benchmark_unpermute(
     # JIT compilation & warmup
     input = prepare()
     run(input)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Capture 10 invocations with CUDA graph
     graph = torch.cuda.CUDAGraph()
     with torch.cuda.graph(graph):
         for _ in range(10):
             run(input)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Warmup
     for _ in range(5):
         graph.replay()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     start_event = torch.Event(enable_timing=True)
     end_event = torch.Event(enable_timing=True)
 
     latencies: list[float] = []
     for i in range(num_iters):
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         start_event.record()
         graph.replay()
         end_event.record()
diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py
index 2c086870c..6548c74f8 100644
--- a/benchmarks/kernels/benchmark_mrope.py
+++ b/benchmarks/kernels/benchmark_mrope.py
@@ -135,14 +135,14 @@ def benchmark_mrope(
             key.clone(),
         )
 
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Time reference implementation
     torch_times = []
     for _ in range(benchmark_iter):
         query_clone = query.clone()
         key_clone = key.clone()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         start_time = time.time()
 
         mrope_helper_class.forward_native(
@@ -151,7 +151,7 @@ def benchmark_mrope(
             key_clone,
         )
 
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         torch_times.append(time.time() - start_time)
 
     # Time triton kernel implementation
@@ -159,14 +159,14 @@ def benchmark_mrope(
     for _ in range(benchmark_iter):
         query_clone = query.clone()
         key_clone = key.clone()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         start_time = time.time()
         mrope_helper_class.forward_cuda(
             positions,
             query_clone,
             key_clone,
         )
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         triton_times.append(time.time() - start_time)
 
     # Calculate statistics
diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
index be871d3d1..b6a0b7ad8 100644
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -103,7 +103,7 @@ def main(
         max_logits = torch.empty_like(exp_sums)
 
     def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         if profile:
             torch.cuda.cudart().cudaProfilerStart()
         start_time = time.perf_counter()
@@ -173,7 +173,7 @@ def main(
                     )
             else:
                 raise ValueError(f"Invalid version: {version}")
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
         end_time = time.perf_counter()
         if profile:
diff --git a/benchmarks/kernels/benchmark_per_token_group_quant.py b/benchmarks/kernels/benchmark_per_token_group_quant.py
index eba4d5102..f2195a6d7 100644
--- a/benchmarks/kernels/benchmark_per_token_group_quant.py
+++ b/benchmarks/kernels/benchmark_per_token_group_quant.py
@@ -28,7 +28,7 @@ def _time_cuda(
     # warmup
     for _ in range(warmup_iters):
         fn()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     start = torch.Event(enable_timing=True)
     end = torch.Event(enable_timing=True)
@@ -37,7 +37,7 @@ def _time_cuda(
     for _ in range(bench_iters):
         fn()
     end.record()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     return start.elapsed_time(end) / bench_iters  # ms/iter
 
diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py
index 9a21cfe94..d01c7ac37 100644
--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@@ -29,7 +29,7 @@ def main(
     scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None
 
     def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         if profile:
             torch.cuda.cudart().cudaProfilerStart()
         start_time = time.perf_counter()
@@ -39,7 +39,7 @@ def main(
                 ops.scaled_int8_quant(x, scale)
             else:
                 ops.scaled_fp8_quant(x, scale)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
         end_time = time.perf_counter()
         if profile:
diff --git a/benchmarks/kernels/benchmark_reshape_and_cache.py b/benchmarks/kernels/benchmark_reshape_and_cache.py
index b4c949e4f..97af4ac97 100644
--- a/benchmarks/kernels/benchmark_reshape_and_cache.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache.py
@@ -84,16 +84,16 @@ def run_benchmark(
         g = torch.cuda.CUDAGraph()
         with torch.cuda.graph(g):
             function_under_test()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         function_under_test = lambda: g.replay()
 
     def run_cuda_benchmark(n_iters: int) -> float:
         nonlocal key, value, key_cache, value_cache, slot_mapping
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         start = time.perf_counter()
         for _ in range(n_iters):
             function_under_test()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
         end = time.perf_counter()
         return (end - start) / n_iters
 
diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
index 2a250620b..55c203725 100644
--- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
@@ -109,16 +109,16 @@ def run_benchmark(
         g = torch.cuda.CUDAGraph()
         with torch.cuda.graph(g):
             function_under_test()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         function_under_test = lambda: g.replay()
 
     def run_cuda_benchmark(n_iters: int) -> float:
         nonlocal key, value, key_cache, value_cache, slot_mapping
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         start = time.perf_counter()
         for _ in range(n_iters):
             function_under_test()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
         end = time.perf_counter()
         return (end - start) / n_iters
 
diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
index da32bc30c..13b97b769 100644
--- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
@@ -251,7 +251,7 @@ def benchmark(
         kernel(
             y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G
         )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     start_event = torch.Event(enable_timing=True)
     end_event = torch.Event(enable_timing=True)
@@ -259,7 +259,7 @@ def benchmark(
     # Benchmark
     latencies: list[float] = []
     for _ in range(runs):
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
         start_event.record()
         for i in range(iterations_per_run):
diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
index 1d0d6fbb9..89970e2b0 100644
--- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
@@ -126,7 +126,7 @@ def benchmark_decode(
     )
 
     def time_fn(fn, warmup=10, trials=20):
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         start = torch.Event(enable_timing=True)
         end = torch.Event(enable_timing=True)
         times = []
@@ -136,7 +136,7 @@ def benchmark_decode(
             start.record()
             fn()
             end.record()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
             times.append(start.elapsed_time(end))  # ms
         return sum(times) / len(times), torch.std(torch.tensor(times))
 
diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
index 84bde723a..6b9d6b7f8 100644
--- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
@@ -138,7 +138,7 @@ def benchmark_prefill(
     )
 
     def time_fn(fn, warmup=10, trials=20):
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         start = torch.Event(enable_timing=True)
         end = torch.Event(enable_timing=True)
         times = []
@@ -148,7 +148,7 @@ def benchmark_prefill(
             start.record()
             fn()
             end.record()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
             times.append(start.elapsed_time(end))  # ms
         return sum(times) / len(times), torch.std(torch.tensor(times))
 
diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
index 3a85c5c74..ceae12e98 100644
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@@ -177,18 +177,18 @@ def benchmark_config(
     def run():
         w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype)
 
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     # JIT complication & warmup
     for _ in range(5):
         run()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     start_event = torch.Event(enable_timing=True)
     end_event = torch.Event(enable_timing=True)
 
     latencies: list[float] = []
     for i in range(num_iters):
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         start_event.record()
         run()
         end_event.record()
diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
index 5a85526a1..4384d3e56 100644
--- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
+++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
@@ -35,7 +35,7 @@ def benchmark_shape(
     B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
 
     # Reference result in BF16
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     C_ref = A @ B.t()
 
     # Pre-quantize B for all implementations
@@ -121,14 +121,14 @@ def benchmark_shape(
         # Warmup
         for _ in range(warmup):
             func()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
 
         # Timing loop
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         start = time.time()
         for _ in range(repeat):
             func()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         end = time.time()
 
         # Calculate timing and TFLOPS
diff --git a/docs/design/model_runner_v2.md b/docs/design/model_runner_v2.md
index 487368420..fb40d51ee 100644
--- a/docs/design/model_runner_v2.md
+++ b/docs/design/model_runner_v2.md
@@ -50,7 +50,7 @@ V1 was not originally designed with async scheduling in mind, and support requir
 
 ## 3. Removing Async Barrier
 
-A key requirement for async execution is that CPU operations remain non-blocking. Both explicit sync (for example, `torch.cuda.synchronize`) and implicit sync (for example, unpinned `.to("cuda")`) must be avoided.
+A key requirement for async execution is that CPU operations remain non-blocking. Both explicit sync (for example, `torch.accelerator.synchronize`) and implicit sync (for example, unpinned `.to("cuda")`) must be avoided.
 
 However, async execution can introduce race conditions when CPU and GPU concurrently touch the same memory.
 
diff --git a/docs/usage/troubleshooting.md b/docs/usage/troubleshooting.md
index b482e131d..bced53936 100644
--- a/docs/usage/troubleshooting.md
+++ b/docs/usage/troubleshooting.md
@@ -95,7 +95,7 @@ If GPU/CPU communication cannot be established, you can use the following Python
     torch.cuda.set_device(local_rank)
     data = torch.FloatTensor([1,] * 128).to("cuda")
     dist.all_reduce(data, op=dist.ReduceOp.SUM)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     value = data.mean().item()
     world_size = dist.get_world_size()
     assert value == world_size, f"Expected {world_size}, got {value}"
diff --git a/examples/offline_inference/rlhf_colocate.py b/examples/offline_inference/rlhf_colocate.py
index 47dc86fa2..ea4b3a6b9 100644
--- a/examples/offline_inference/rlhf_colocate.py
+++ b/examples/offline_inference/rlhf_colocate.py
@@ -88,7 +88,7 @@ class RayTrainingActor:
         # Zero out all the parameters.
         for name, p in self.model.named_parameters():
             p.data.zero_()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         # The argument for `get_device_uuid` is the index of the GPU in the
         # list of visible devices.
         from vllm.platforms import current_platform
@@ -151,7 +151,7 @@ class RayTrainingActor:
                     p.data.view(-1).view(dtype=torch.uint8), non_blocking=True
                 )
                 offset += get_size(p)
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
             s.send_pyobj(named_tensors)
             s.recv()
         s.send_pyobj(None)
diff --git a/examples/offline_inference/rlhf_utils.py b/examples/offline_inference/rlhf_utils.py
index a515917f0..e9fc393bb 100644
--- a/examples/offline_inference/rlhf_utils.py
+++ b/examples/offline_inference/rlhf_utils.py
@@ -120,7 +120,7 @@ class ColocateWorkerExtension:
                 process_weights_after_loading(
                     self.model_runner.model, self.model_config, self.device
                 )
-                torch.cuda.synchronize()
+                torch.accelerator.synchronize()
                 socket.send(b"")
                 break
             if isinstance(payload, tuple):
@@ -144,7 +144,7 @@ class ColocateWorkerExtension:
                 weights.append((item["name"], tensor))
             self.model_runner.model.load_weights(weights=weights)
             del weights
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
             socket.send(b"")
 
         socket.close()
diff --git a/tests/compile/test_dynamic_shapes_compilation.py b/tests/compile/test_dynamic_shapes_compilation.py
index 3dcc3c3df..b63a4607c 100644
--- a/tests/compile/test_dynamic_shapes_compilation.py
+++ b/tests/compile/test_dynamic_shapes_compilation.py
@@ -100,7 +100,7 @@ def test_dynamic_shapes_compilation(
     del model
     gc.collect()
     torch.accelerator.empty_cache()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     print("GPU memory cleared")
 
 
diff --git a/tests/distributed/test_ca_buffer_sharing.py b/tests/distributed/test_ca_buffer_sharing.py
index 1ddce64f8..acf2e8985 100644
--- a/tests/distributed/test_ca_buffer_sharing.py
+++ b/tests/distributed/test_ca_buffer_sharing.py
@@ -32,7 +32,7 @@ pointers = CustomAllreduce.create_shared_buffer(buffer_size_in_bytes)
 print(f"Rank {rank} has pointers {pointers}")
 
 dist.barrier()
-torch.cuda.synchronize()
+torch.accelerator.synchronize()
 
 if rank == 0:
     # the first rank tries to write to all buffers
@@ -41,7 +41,7 @@ if rank == 0:
         lib.cudaMemset(pointer, byte_value, buffer_size_in_bytes)
 
 dist.barrier()
-torch.cuda.synchronize()
+torch.accelerator.synchronize()
 
 host_data = (ctypes.c_char * buffer_size_in_bytes)()
 
@@ -59,6 +59,6 @@ for p in pointers:
 print(f"Rank {rank} verified all buffers")
 
 dist.barrier()
-torch.cuda.synchronize()
+torch.accelerator.synchronize()
 
 CustomAllreduce.free_shared_buffer(pointers)
diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py
index 68abc2b98..5008c4de0 100644
--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@@ -48,7 +48,7 @@ def graph_allreduce(
         data = torch.zeros(1)
         data = data.to(device=device)
         torch.distributed.all_reduce(data, group=group)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         del data
 
         # we use the first group to communicate once
@@ -68,7 +68,7 @@ def graph_allreduce(
                     inp2 = torch.randint(
                         1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device()
                     )
-                    torch.cuda.synchronize()
+                    torch.accelerator.synchronize()
                     graph = torch.cuda.CUDAGraph()
                     with torch.cuda.graph(graph, stream=graph_capture_context.stream):
                         for i in range(num_communication):
diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py
index d20710335..3b5b45aa0 100644
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -68,7 +68,7 @@ def worker_fn():
     )
     tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank)
     tensor = pynccl_comm.all_reduce(tensor)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     assert torch.all(tensor == pynccl_comm.world_size).cpu().item()
 
 
@@ -93,11 +93,11 @@ def multiple_allreduce_worker_fn():
     if torch.distributed.get_rank() in [0, 1]:
         tensor = pynccl_comm.all_reduce(tensor)
         tensor = pynccl_comm.all_reduce(tensor)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         assert torch.all(tensor == 4).cpu().item()
     else:
         tensor = pynccl_comm.all_reduce(tensor)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         assert torch.all(tensor == 2).cpu().item()
 
 
@@ -121,11 +121,11 @@ def multiple_allreduce_with_vllm_worker_fn():
         if torch.distributed.get_rank() in [0, 1]:
             tensor = tensor_model_parallel_all_reduce(tensor)
             tensor = tensor_model_parallel_all_reduce(tensor)
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
             assert torch.all(tensor == 4).cpu().item()
         else:
             tensor = tensor_model_parallel_all_reduce(tensor)
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
             assert torch.all(tensor == 2).cpu().item()
 
 
@@ -147,12 +147,12 @@ def worker_fn_with_cudagraph():
         )
         # run something in the default stream to initialize torch engine
         a = torch.ones((4, 4), device=f"cuda:{pynccl_comm.rank}")
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         with torch.cuda.graph(graph):
             a_out = pynccl_comm.all_reduce(a)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         graph.replay()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         assert torch.all(a_out == pynccl_comm.world_size).cpu().item()
 
 
@@ -180,7 +180,7 @@ def all_gather_worker_fn():
     ).to(device)
 
     pynccl_comm.all_gather(result, tensor)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
 
 
@@ -215,7 +215,7 @@ def all_gatherv_worker_fn():
     ).to(device)
 
     pynccl_comm.all_gatherv(result, tensor, sizes=sizes)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
 
 
@@ -255,7 +255,7 @@ def reduce_scatter_worker_fn():
     ).to(device)
 
     pynccl_comm.reduce_scatter(result, tensor)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
 
 
@@ -293,7 +293,7 @@ def reduce_scatterv_worker_fn():
     expected = sum(tensor[start:end] for tensor in all_tensors).to(device)
 
     pynccl_comm.reduce_scatterv(result, tensor, sizes=sizes)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
 
 
@@ -325,7 +325,7 @@ def send_recv_worker_fn():
         pynccl_comm.send(tensor, dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
     else:
         pynccl_comm.recv(tensor, src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     assert torch.all(tensor == 1).cpu().item()
 
 
@@ -355,7 +355,7 @@ def multiple_send_recv_worker_fn():
         pynccl_comm.send(tensor, dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
     else:
         pynccl_comm.recv(tensor, src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     if torch.distributed.get_rank() in [0, 2]:
         assert torch.all(tensor == 1).cpu().item()
     else:
@@ -396,7 +396,7 @@ def broadcast_worker_fn():
         pynccl_comm.broadcast(recv_tensors[i], src=i)
         # the broadcast op might be launched in a different stream
         # need to synchronize to make sure the tensor is ready
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         assert torch.all(recv_tensors[i] == i).cpu().item()
 
 
diff --git a/tests/distributed/test_quick_all_reduce.py b/tests/distributed/test_quick_all_reduce.py
index 53d906bbc..5af3101a9 100644
--- a/tests/distributed/test_quick_all_reduce.py
+++ b/tests/distributed/test_quick_all_reduce.py
@@ -52,7 +52,7 @@ def graph_quickreduce(
         data = torch.zeros(1)
         data = data.to(device=device)
         torch.distributed.all_reduce(data, group=group)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         del data
 
         # we use the first group to communicate once
@@ -71,7 +71,7 @@ def graph_quickreduce(
                     inp2 = torch.randint(
                         -23, 1, (sz,), dtype=dtype, device=torch.cuda.current_device()
                     )
-                    torch.cuda.synchronize()
+                    torch.accelerator.synchronize()
                     graph = torch.cuda.CUDAGraph()
                     with torch.cuda.graph(graph, stream=graph_capture_context.stream):
                         for _ in range(num_communication):
diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py
index 526b6749d..c2fea7c1d 100644
--- a/tests/distributed/test_utils.py
+++ b/tests/distributed/test_utils.py
@@ -79,11 +79,11 @@ def gpu_worker(rank, WORLD_SIZE, port1, port2):
     data = torch.tensor([rank]).cuda()
     pynccl1.all_reduce(data)
     pg1.barrier()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     if rank <= 2:
         pynccl2.all_reduce(data)
         pg2.barrier()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
     item = data[0].item()
     print(f"rank: {rank}, item: {item}")
     if rank == 3:
diff --git a/tests/distributed/test_weight_transfer.py b/tests/distributed/test_weight_transfer.py
index b370721b3..def1e1dfd 100644
--- a/tests/distributed/test_weight_transfer.py
+++ b/tests/distributed/test_weight_transfer.py
@@ -251,7 +251,7 @@ def trainer_broadcast_tensor(
     dtype = getattr(torch, tensor_dtype)
     tensor_to_send = torch.ones(tensor_shape, dtype=dtype, device="cuda:0")
     comm.broadcast(tensor_to_send, src=0, stream=torch.cuda.current_stream())
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     return True
 
@@ -309,7 +309,7 @@ def inference_receive_tensor(
         shapes=[tensor_shape],
     )
     engine.receive_weights(update_info, noop_load_weights)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Verify we received the tensor
     success = False
@@ -630,7 +630,7 @@ class TrainerActor:
         ipc_handle = reduce_tensor(self.tensor)
         gpu_uuid = get_physical_gpu_id(0)
 
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
         self.ipc_handle_dict = {
             "ipc_handle": ipc_handle,
@@ -704,7 +704,7 @@ def inference_receive_ipc_tensor(
 
     update_info = engine.parse_update_info(update_dict)
     engine.receive_weights(update_info, noop_load_weights)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Verify we received the tensor
     success = False
diff --git a/tests/kernels/attention/test_merge_attn_states.py b/tests/kernels/attention/test_merge_attn_states.py
index a9f525cdc..6fccb8ccf 100644
--- a/tests/kernels/attention/test_merge_attn_states.py
+++ b/tests/kernels/attention/test_merge_attn_states.py
@@ -165,7 +165,7 @@ def test_merge_attn_states(
             suffix_lse_torch,
             output_lse_torch,
         )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     for _ in range(repeat_times):
         start.record()
@@ -178,7 +178,7 @@ def test_merge_attn_states(
             output_lse_torch,
         )
         end.record()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         total_time_torch_kernel += start.elapsed_time(end)
 
     avg_time_torch_kernel = total_time_torch_kernel / repeat_times
@@ -200,7 +200,7 @@ def test_merge_attn_states(
             suffix_lse,
             output_lse_ref_triton,
         )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     for _ in range(repeat_times):
         start.record()
@@ -213,7 +213,7 @@ def test_merge_attn_states(
             output_lse_ref_triton,
         )
         end.record()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         total_time_triton_kernel += start.elapsed_time(end)
 
     avg_time_triton_kernel = total_time_triton_kernel / repeat_times
@@ -232,7 +232,7 @@ def test_merge_attn_states(
             suffix_lse,
             output_lse_cuda,
         )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     for _ in range(repeat_times):
         start.record()
@@ -245,7 +245,7 @@ def test_merge_attn_states(
             output_lse_cuda,
         )
         end.record()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         total_time_cuda_kernel += start.elapsed_time(end)
 
     avg_time_cuda_kernel = total_time_cuda_kernel / repeat_times
diff --git a/tests/kernels/attention/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py
index 2dc4a3cd2..7aeeaf8b4 100644
--- a/tests/kernels/attention/test_prefix_prefill.py
+++ b/tests/kernels/attention/test_prefix_prefill.py
@@ -239,7 +239,7 @@ def test_contexted_kv_attention(
         v_scale,
         sliding_window=sliding_window,
     )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     start_time = time.time()
     op(
         query,
@@ -258,7 +258,7 @@ def test_contexted_kv_attention(
         v_scale,
         sliding_window=sliding_window,
     )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     end_time = time.time()
     print(f"triton Time: {(end_time - start_time) * 1000:.2f} ms")
 
@@ -298,7 +298,7 @@ def test_contexted_kv_attention(
         dropout_p=0.0,
         scale=scale,
     )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     start_time = time.time()
     output_ref = F.scaled_dot_product_attention(
         query_sdpa,
@@ -308,7 +308,7 @@ def test_contexted_kv_attention(
         dropout_p=0.0,
         scale=scale,
     )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     end_time = time.time()
     print(f"PyTorch SDPA Time: {(end_time - start_time) * 1000:.2f} ms")
 
@@ -482,7 +482,7 @@ def test_contexted_kv_attention_alibi(
         v_scale,
         alibi_slopes=alibi_slopes,
     )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     start_time = time.time()
     op(
         query,
@@ -501,7 +501,7 @@ def test_contexted_kv_attention_alibi(
         v_scale,
         alibi_slopes=alibi_slopes,
     )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     end_time = time.time()
     print(f"triton Time: {(end_time - start_time) * 1000:.2f} ms")
     scale = float(1.0 / (head_size**0.5))
@@ -517,7 +517,7 @@ def test_contexted_kv_attention_alibi(
 
     output_ref = torch.empty_like(output)
 
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     start_time = time.time()
 
     query_start = 0
@@ -572,7 +572,7 @@ def test_contexted_kv_attention_alibi(
         query_start = query_end
         key_start = key_end
 
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     end_time = time.time()
     print(f"PyTorch SDPA Time: {(end_time - start_time) * 1000:.2f} ms")
     atol = 1e-3 if "fp8" in kv_cache_dtype else 1e-6
diff --git a/tests/kernels/core/test_layernorm.py b/tests/kernels/core/test_layernorm.py
index 416395e59..2dca0da07 100644
--- a/tests/kernels/core/test_layernorm.py
+++ b/tests/kernels/core/test_layernorm.py
@@ -127,7 +127,7 @@ def test_fused_rms_norm_quant(
             out_quant, x_unfused.contiguous(), quant_scale_t
         )
 
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         torch.testing.assert_close(residual_fused, residual, atol=1e-2, rtol=1e-2)
         opcheck(
             torch.ops._C.fused_add_rms_norm_static_fp8_quant,
diff --git a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py
index 2554c4fce..9f0f9f2ea 100644
--- a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py
+++ b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py
@@ -34,7 +34,7 @@ def do_profile(
         record_shapes=True,
     ) as tprof:
         fn(**fn_kwargs)
-        torch.cuda.synchronize(torch.cuda.current_device())
+        torch.accelerator.synchronize(torch.cuda.current_device())
 
     # TODO (varun): Add a descriptive trace file name
     tprof.export_chrome_trace(
diff --git a/tests/kernels/moe/test_block_fp8.py b/tests/kernels/moe/test_block_fp8.py
index a74e739c5..7011786f2 100644
--- a/tests/kernels/moe/test_block_fp8.py
+++ b/tests/kernels/moe/test_block_fp8.py
@@ -318,8 +318,8 @@ def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed, monkeypatch)
                 out = deep_gemm_moe_fp8_fn(
                     a, w1, w2, w1_s, w2_s, topk_weights, topk_ids
                 )
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
             graph.replay()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
 
     torch.testing.assert_close(out, ref_out, atol=0.035, rtol=0.035)
diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py
index 1ec2c614c..c1cf8b2d3 100644
--- a/tests/kernels/moe/test_cutlass_moe.py
+++ b/tests/kernels/moe/test_cutlass_moe.py
@@ -399,9 +399,9 @@ def test_cutlass_moe_8_bit_cuda_graph(
                 mt, topk_weights, topk_ids, per_act_token, per_out_ch
             )
 
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         graph.replay()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
         torch.testing.assert_close(triton_output, cutlass_output, atol=9e-2, rtol=1e-2)
 
diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py
index f8e2a8b52..43bdd03cf 100644
--- a/tests/kernels/moe/test_moe.py
+++ b/tests/kernels/moe/test_moe.py
@@ -272,9 +272,9 @@ def run_moe_test(
                 global_num_experts=global_num_experts,
                 expert_map=expert_map,
             )
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         graph.replay()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
     torch.testing.assert_close(test_output, baseline_output, atol=atol, rtol=rtol)
 
@@ -768,7 +768,7 @@ def test_mixtral_moe(
                 F.pad(vllm_moe.experts.w2_weight, (0, 128), "constant", 0)[..., 0:-128],
                 requires_grad=False,
             )
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
             torch.accelerator.empty_cache()
 
         # FIXME (zyongye) fix this after we move self.kernel
diff --git a/tests/kernels/quantization/test_allspark_gemm.py b/tests/kernels/quantization/test_allspark_gemm.py
index 7f6adbd52..b6272557c 100644
--- a/tests/kernels/quantization/test_allspark_gemm.py
+++ b/tests/kernels/quantization/test_allspark_gemm.py
@@ -122,7 +122,7 @@ def test_gptq_allspark_gemm_ampere(mnk_factors, group_size, has_zp, dtype):
     )
 
     output_ref = torch.matmul(input, w_ref)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     max_diff = compute_max_diff(output, output_ref)
 
     assert max_diff < 0.04
diff --git a/tests/kernels/quantization/test_cutlass_w4a8_moe.py b/tests/kernels/quantization/test_cutlass_w4a8_moe.py
index de0e347d8..5e6c170db 100644
--- a/tests/kernels/quantization/test_cutlass_w4a8_moe.py
+++ b/tests/kernels/quantization/test_cutlass_w4a8_moe.py
@@ -269,7 +269,7 @@ def test_cutlass_w4a8_moe_mm_end_to_end(shape, random_zero):
         setup.c_strides,
         setup.group_scale_strides,
     )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     out_ref = compute_moe_reference_output(setup)
     torch.testing.assert_close(setup.out, out_ref, rtol=1e-2, atol=1e-2)
diff --git a/tests/kernels/quantization/test_marlin_gemm.py b/tests/kernels/quantization/test_marlin_gemm.py
index 3453753ec..f918212f7 100644
--- a/tests/kernels/quantization/test_marlin_gemm.py
+++ b/tests/kernels/quantization/test_marlin_gemm.py
@@ -260,7 +260,7 @@ def test_gptq_marlin_repack(
     marlin_q_w_2 = ops.gptq_marlin_repack(
         q_w_gptq, sort_indices, size_k, size_n, quant_type.size_bits, is_a_8bit
     )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     torch.testing.assert_close(marlin_q_w_1, marlin_q_w_2)
 
@@ -308,7 +308,7 @@ def test_awq_marlin_repack(k_chunk, n_chunk, quant_type, is_a_8bit, nk_factors):
     marlin_q_w_2 = ops.awq_marlin_repack(
         q_w_awq, size_k, size_n, quant_type.size_bits, is_a_8bit
     )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     torch.testing.assert_close(marlin_q_w_1, marlin_q_w_2)
 
@@ -564,7 +564,7 @@ def test_marlin_gemm_subset_input():
     )
     output_ref = torch.matmul(a_input, w_ref)
 
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     max_diff = compute_max_diff(output, output_ref)
 
@@ -613,7 +613,7 @@ def test_marlin_gemm_with_bias(size_m):
     )
     output_ref = torch.matmul(a_input, w_ref) + b_bias.view(1, -1)
 
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     max_diff = compute_max_diff(output, output_ref)
 
diff --git a/tests/kernels/test_cache_kernels.py b/tests/kernels/test_cache_kernels.py
index b5d66b4ed..4cc8e3b14 100644
--- a/tests/kernels/test_cache_kernels.py
+++ b/tests/kernels/test_cache_kernels.py
@@ -57,7 +57,7 @@ def test_gather_cache_oob():
         seq_starts,
     )
 
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     assert True
 
 
diff --git a/tests/kernels/test_top_k_per_row.py b/tests/kernels/test_top_k_per_row.py
index 9b96e6dfc..f4bfc1666 100644
--- a/tests/kernels/test_top_k_per_row.py
+++ b/tests/kernels/test_top_k_per_row.py
@@ -219,7 +219,7 @@ def _run_top_k_per_row_decode_test(
         top_k,
     )
 
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Run reference implementation
     torch_indices = torch.empty((num_rows, top_k), dtype=torch.int32, device="cuda")
diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py
index 1425bb044..b43ac453a 100644
--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@@ -195,4 +195,4 @@ def test_models(
         # unit tests. On ROCm, when using AITER
         # the memory might not be deallocated completely
         # before running the next test case
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index e5a047a7c..9d31a3f87 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -196,7 +196,7 @@ def test_compressed_tensors_w8a8_logprobs(
     )
 
     if current_platform.is_rocm():
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
 
 def test_compressed_tensors_no_enforce_eager(vllm_runner):
diff --git a/tools/pre_commit/check_torch_cuda.py b/tools/pre_commit/check_torch_cuda.py
index f2e3cbf26..356650863 100644
--- a/tools/pre_commit/check_torch_cuda.py
+++ b/tools/pre_commit/check_torch_cuda.py
@@ -9,6 +9,7 @@ import regex as re
 # --------------------------------------------------------------------------- #
 _TORCH_CUDA_PATTERNS = [
     r"\btorch\.cuda\.empty_cache\b",
+    r"\btorch\.cuda\.synchronize\b",
 ]
 
 ALLOWED_FILES = {"vllm/platforms/", "vllm/device_allocator/"}
diff --git a/vllm/distributed/elastic_ep/elastic_execute.py b/vllm/distributed/elastic_ep/elastic_execute.py
index f32ea39fb..516d2c256 100644
--- a/vllm/distributed/elastic_ep/elastic_execute.py
+++ b/vllm/distributed/elastic_ep/elastic_execute.py
@@ -217,7 +217,7 @@ class ElasticEPScalingExecutor:
                 dp_group=standby_dp_group,
                 expert_weights=model.expert_weights,
             )
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
     def broadcast_expert_mapping(self) -> None:
         standby_dp_group = get_standby_dp_group()
@@ -407,7 +407,7 @@ class ElasticEPScalingExecutor:
             reset_compile_wrapper(self.worker.model_runner.get_model())
 
         gc.collect()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         torch.accelerator.empty_cache()
         unlock_workspace()
         self.worker.compile_or_warm_up_model()
@@ -446,7 +446,7 @@ class ElasticEPScalingExecutor:
 
             eplb_state.rearrange(rank_mapping=rank_mapping)
         # NOTE(yongji): check whether we need to synchronize here
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         # reset expert_rearrangement_step to ensure all ranks are synchronized
         eplb_state.expert_rearrangement_step = 0
         eplb_state.num_valid_physical_experts = (
@@ -491,7 +491,7 @@ class ElasticEPScalingExecutor:
             dp_group=dp_group,
             expert_weights=model.expert_weights,
         )
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
     def receive_expert_mapping(self) -> tuple[torch.Tensor, int, int]:
         dp_group = get_dp_group()
diff --git a/vllm/distributed/eplb/rebalance_execute.py b/vllm/distributed/eplb/rebalance_execute.py
index 777f9c553..7823ce4a3 100644
--- a/vllm/distributed/eplb/rebalance_execute.py
+++ b/vllm/distributed/eplb/rebalance_execute.py
@@ -622,7 +622,7 @@ def rearrange_expert_weights_inplace(
 
     # NOTE(bowen): We need this synchronize to run, but I don't know why.
     # If you figure out the reason, please let me know -- thank you!
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     old_global_expert_indices_cpu = old_global_expert_indices.cpu().numpy()
     new_global_expert_indices_cpu = new_global_expert_indices.cpu().numpy()
diff --git a/vllm/model_executor/kernels/linear/mixed_precision/cutlass.py b/vllm/model_executor/kernels/linear/mixed_precision/cutlass.py
index 553f3cb04..184a7f71d 100644
--- a/vllm/model_executor/kernels/linear/mixed_precision/cutlass.py
+++ b/vllm/model_executor/kernels/linear/mixed_precision/cutlass.py
@@ -77,7 +77,7 @@ class CutlassW4A8LinearKernel(MPLinearKernel):
         def transform_w_q(x):
             assert isinstance(x, BasevLLMParameter)
             convert_packed_uint4b8_to_signed_int4_inplace(x.data)
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
             permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
             x.data = ops.cutlass_encode_and_reorder_int4b(x.data.t().contiguous().t())
             return x
diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index db158e4fe..8cb65c4d2 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -457,7 +457,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             else:
                 self._dummy_pooler_run(hidden_states)
 
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         del hidden_states, sample_hidden_states
         gc.collect()
 
@@ -525,7 +525,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         # to trigger JIT compilation.
         if all("FLASHINFER" in b.get_name() for b in self.attn_backends.values()):
             self._dummy_run(self.max_num_tokens, skip_attn=False)
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
 
     def finish_requests(self, scheduler_output: SchedulerOutput) -> None:
         finished_req_ids = scheduler_output.finished_req_ids
diff --git a/vllm/v1/worker/gpu/warmup.py b/vllm/v1/worker/gpu/warmup.py
index ffe5b33f7..9d70a56f5 100644
--- a/vllm/v1/worker/gpu/warmup.py
+++ b/vllm/v1/worker/gpu/warmup.py
@@ -102,4 +102,4 @@ def warmup_kernels(model_runner: GPUModelRunner) -> None:
     cleanup_output.finished_req_ids = set(req_ids)
     model_runner.execute_model(cleanup_output)
     model_runner.kv_connector.set_disabled(False)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 29fe9ec83..29a5e46ab 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -928,7 +928,7 @@ class GPUModelRunner(
 
     # Note: used for model runner override.
     def _sync_device(self) -> None:
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
     def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
         """Update the cached states and the persistent batch with the scheduler
@@ -5345,7 +5345,7 @@ class GPUModelRunner(
                     cudagraph_runtime_mode=runtime_mode,
                 )
 
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
             end_free_gpu_memory = torch.cuda.mem_get_info()[0]
 
         # Disable cudagraph capturing globally, so any unexpected cudagraph
@@ -6266,13 +6266,13 @@ class GPUModelRunner(
         group_refs = group_lora_refs[current_item_idx : current_item_idx + num_items]
         group_request_ids = {req_id for req_id, _ in group_refs}
 
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         start_time = time.perf_counter()
 
         try:
             yield
         finally:
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
             elapsed = time.perf_counter() - start_time
 
             per_request_time = elapsed / max(len(group_request_ids), 1)
diff --git a/vllm/v1/worker/xpu_model_runner.py b/vllm/v1/worker/xpu_model_runner.py
index 540c9cb20..ddefa7495 100644
--- a/vllm/v1/worker/xpu_model_runner.py
+++ b/vllm/v1/worker/xpu_model_runner.py
@@ -29,9 +29,6 @@ class XPUModelRunner(GPUModelRunner):
         # FIXME: To be verified.
         self.cascade_attn_enabled = False
 
-    def _sync_device(self) -> None:
-        torch.xpu.synchronize()
-
 
 @contextmanager
 def _torch_cuda_wrapper():
@@ -42,7 +39,6 @@ def _torch_cuda_wrapper():
         torch.cuda.current_stream = torch.xpu.current_stream
         torch.cuda.stream = torch.xpu.stream
         torch.cuda.mem_get_info = torch.xpu.mem_get_info
-        torch.cuda.synchronize = torch.xpu.synchronize
         if supports_xpu_graph():
             torch.cuda.graph = torch.xpu.graph
             torch.cuda.CUDAGraph = torch.xpu.XPUGraph