diff --git a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py index ffcfa4572..867f55fa9 100644 --- a/benchmarks/attention_benchmarks/mla_runner.py +++ b/benchmarks/attention_benchmarks/mla_runner.py @@ -701,7 +701,7 @@ def _run_single_benchmark( # Warmup for _ in range(config.warmup_iters): forward_fn() - torch.cuda.synchronize() + torch.accelerator.synchronize() # Benchmark times = [] @@ -714,7 +714,7 @@ def _run_single_benchmark( forward_fn() end.record() - torch.cuda.synchronize() + torch.accelerator.synchronize() elapsed_ms = start.elapsed_time(end) times.append(elapsed_ms / 1000.0 / config.num_layers) diff --git a/benchmarks/attention_benchmarks/runner.py b/benchmarks/attention_benchmarks/runner.py index 6457a599a..9744b857d 100644 --- a/benchmarks/attention_benchmarks/runner.py +++ b/benchmarks/attention_benchmarks/runner.py @@ -391,7 +391,7 @@ def _run_single_benchmark( attn_metadata, output=out, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() # Benchmark times = [] @@ -412,7 +412,7 @@ def _run_single_benchmark( ) end.record() - torch.cuda.synchronize() + torch.accelerator.synchronize() elapsed_ms = start.elapsed_time(end) times.append(elapsed_ms / 1000.0 / config.num_layers) # seconds per layer diff --git a/benchmarks/benchmark_topk_topp.py b/benchmarks/benchmark_topk_topp.py index aa020e012..f1d59cbde 100644 --- a/benchmarks/benchmark_topk_topp.py +++ b/benchmarks/benchmark_topk_topp.py @@ -94,7 +94,7 @@ def create_logits( def measure_memory() -> tuple[int, int]: """Return (allocated, reserved) memory in bytes.""" - torch.cuda.synchronize() + torch.accelerator.synchronize() return torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated() @@ -123,7 +123,7 @@ def benchmark_function( for _ in range(warmup_iters): logits_copy = logits.clone() func(logits_copy, k, p) - torch.cuda.synchronize() + torch.accelerator.synchronize() # Reset memory stats before benchmark reset_memory_stats() @@ -140,7 +140,7 @@ def benchmark_function( func(logits_copy, k, p) end_events[i].record() - torch.cuda.synchronize() + torch.accelerator.synchronize() # Calculate timing times = [ diff --git a/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py index 04921dafb..8aaf82197 100644 --- a/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py +++ b/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py @@ -168,7 +168,7 @@ def bench_impl( # warmup for kwargs in kwargs_list: impl_type.get_impl()(**kwargs) - torch.cuda.synchronize() + torch.accelerator.synchronize() # Merge into a single kwargs and qualify arguments as ArgPool kwargs = {k: ArgPool([]) for k in kwargs_list[0]} diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py index bd116e36a..58ccfcc45 100644 --- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py +++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py @@ -171,7 +171,7 @@ def bench_run( activation=MoEActivation.SILU, global_num_experts=num_experts, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() # Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly) triton_stream = torch.cuda.Stream() @@ -187,14 +187,14 @@ def bench_run( topk_ids, quant_config=quant_config, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() def bench_cuda_graph(graph, num_warmup=5, num_iters=100): """Benchmark CUDA graph using events like benchmark_moe.py""" # Warmup for _ in range(num_warmup): graph.replay() - torch.cuda.synchronize() + torch.accelerator.synchronize() # Timing start_event = torch.Event(enable_timing=True) @@ -202,7 +202,7 @@ def bench_run( latencies = [] for _ in range(num_iters): - torch.cuda.synchronize() + torch.accelerator.synchronize() start_event.record() graph.replay() end_event.record() diff --git a/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py b/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py index cfb1489da..2d4afd38c 100644 --- a/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py +++ b/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py @@ -307,7 +307,7 @@ def bench_run( def replay_graph(graph, num_repeats): for _ in range(num_repeats): graph.replay() - torch.cuda.synchronize() + torch.accelerator.synchronize() cutlass_stream = torch.cuda.Stream() cutlass_graph = torch.cuda.CUDAGraph() @@ -330,7 +330,7 @@ def bench_run( e=num_experts, device=device, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() triton_stream = torch.cuda.Stream() triton_graph = torch.cuda.CUDAGraph() @@ -345,7 +345,7 @@ def bench_run( w2_fp8scale, a_fp8_scale, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() min_run_time = 5 num_warmup = 5 diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py index d1005461a..9b5ccac4e 100644 --- a/benchmarks/kernels/benchmark_device_communicators.py +++ b/benchmarks/kernels/benchmark_device_communicators.py @@ -342,7 +342,7 @@ class CommunicatorBenchmark: if not should_use_fn(tensor): return None - torch.cuda.synchronize() + torch.accelerator.synchronize() stream = torch.cuda.Stream() with torch.cuda.stream(stream): graph_input = tensor.clone() @@ -360,17 +360,17 @@ class CommunicatorBenchmark: for _ in range(CUDA_GRAPH_CAPTURE_CYCLES): allreduce_fn(graph_input) - torch.cuda.synchronize() + torch.accelerator.synchronize() for _ in range(num_warmup): graph.replay() - torch.cuda.synchronize() + torch.accelerator.synchronize() - torch.cuda.synchronize() + torch.accelerator.synchronize() start_time = time.perf_counter() for _ in range(num_trials): graph.replay() - torch.cuda.synchronize() + torch.accelerator.synchronize() end_time = time.perf_counter() diff --git a/benchmarks/kernels/benchmark_fused_collective.py b/benchmarks/kernels/benchmark_fused_collective.py index e18f6a758..2547f553f 100644 --- a/benchmarks/kernels/benchmark_fused_collective.py +++ b/benchmarks/kernels/benchmark_fused_collective.py @@ -385,7 +385,7 @@ def benchmark_operation( # Warmup before graph capture for _ in range(warmup): operation_func(*args, **kwargs) - torch.cuda.synchronize() + torch.accelerator.synchronize() # Create CUDA graph graph = torch.cuda.CUDAGraph() @@ -398,19 +398,19 @@ def benchmark_operation( operation_func(*args, **kwargs) # Graph warmup - torch.cuda.synchronize() + torch.accelerator.synchronize() for _ in range(warmup): graph.replay() # Benchmark with CUDA graph - torch.cuda.synchronize() + torch.accelerator.synchronize() start_time = time.perf_counter() for _ in range(trials // num_op_per_cudagraph): # operation_func(*args, **kwargs) graph.replay() - torch.cuda.synchronize() + torch.accelerator.synchronize() end_time = time.perf_counter() avg_time_ms = ((end_time - start_time) / trials) * 1000 diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index 60ec94b87..039eb2f29 100644 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -224,7 +224,7 @@ def bench_run( def replay_graph(graph, num_repeats): for _ in range(num_repeats): graph.replay() - torch.cuda.synchronize() + torch.accelerator.synchronize() cutlass_stream = torch.cuda.Stream() cutlass_graph = torch.cuda.CUDAGraph() @@ -239,7 +239,7 @@ def bench_run( topk_weights, topk_ids, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() triton_stream = torch.cuda.Stream() triton_graph = torch.cuda.CUDAGraph() @@ -254,7 +254,7 @@ def bench_run( w2_scale, a_scale, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() min_run_time = 5 num_warmup = 5 diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py index cc1c1cf09..a662e3ac4 100644 --- a/benchmarks/kernels/benchmark_layernorm.py +++ b/benchmarks/kernels/benchmark_layernorm.py @@ -34,14 +34,14 @@ def main( residual = torch.randn_like(x) * scale if add_residual else None def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: - torch.cuda.synchronize() + torch.accelerator.synchronize() if profile: torch.cuda.cudart().cudaProfilerStart() start_time = time.perf_counter() for _ in range(num_iters): layer(x, residual) - torch.cuda.synchronize() + torch.accelerator.synchronize() end_time = time.perf_counter() if profile: diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index 8ca3cf78f..ab930c59d 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -1035,7 +1035,7 @@ def bench_optype( # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up for kwargs in kwargs_list: op_type.bench_fn()(**kwargs) - torch.cuda.synchronize() + torch.accelerator.synchronize() # Merge into a single kwargs and qualify arguments as ArgPool kwargs = {k: ArgPool([]) for k in kwargs_list[0]} diff --git a/benchmarks/kernels/benchmark_mla_k_concat.py b/benchmarks/kernels/benchmark_mla_k_concat.py index fb3b6c8f1..7debf3634 100644 --- a/benchmarks/kernels/benchmark_mla_k_concat.py +++ b/benchmarks/kernels/benchmark_mla_k_concat.py @@ -47,13 +47,13 @@ def benchmark_method( # Warmup for _ in range(num_warmup): _ = method(k_nope, k_pe) - torch.cuda.synchronize() + torch.accelerator.synchronize() # Benchmark start = time.perf_counter() for _ in range(num_iters): _ = method(k_nope, k_pe) - torch.cuda.synchronize() + torch.accelerator.synchronize() end = time.perf_counter() return (end - start) / num_iters * 1000 # Convert to ms diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 3bd3e3f67..9ef825417 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -304,19 +304,19 @@ def benchmark_config( # JIT compilation & warmup run() - torch.cuda.synchronize() + torch.accelerator.synchronize() # Capture 10 invocations with CUDA graph graph = torch.cuda.CUDAGraph() with torch.cuda.graph(graph): for _ in range(10): run() - torch.cuda.synchronize() + torch.accelerator.synchronize() # Warmup for _ in range(5): graph.replay() - torch.cuda.synchronize() + torch.accelerator.synchronize() start_event = torch.Event(enable_timing=True) end_event = torch.Event(enable_timing=True) @@ -324,7 +324,7 @@ def benchmark_config( latencies: list[float] = [] for i in range(num_iters): prepare(i) - torch.cuda.synchronize() + torch.accelerator.synchronize() start_event.record() graph.replay() diff --git a/benchmarks/kernels/benchmark_moe_defaults.py b/benchmarks/kernels/benchmark_moe_defaults.py index 9527878bc..f6ad59366 100644 --- a/benchmarks/kernels/benchmark_moe_defaults.py +++ b/benchmarks/kernels/benchmark_moe_defaults.py @@ -131,7 +131,7 @@ def benchmark_config( topk_ids, quant_config=quant_config, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() # Benchmark start = torch.cuda.Event(enable_timing=True) @@ -149,7 +149,7 @@ def benchmark_config( quant_config=quant_config, ) end.record() - torch.cuda.synchronize() + torch.accelerator.synchronize() return start.elapsed_time(end) / num_iters * 1000 # ms -> us diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py index d9a1d3303..990be5932 100644 --- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -69,19 +69,19 @@ def benchmark_permute( # JIT compilation & warmup run() - torch.cuda.synchronize() + torch.accelerator.synchronize() # Capture 10 invocations with CUDA graph graph = torch.cuda.CUDAGraph() with torch.cuda.graph(graph): for _ in range(10): run() - torch.cuda.synchronize() + torch.accelerator.synchronize() # Warmup for _ in range(5): graph.replay() - torch.cuda.synchronize() + torch.accelerator.synchronize() start_event = torch.Event(enable_timing=True) end_event = torch.Event(enable_timing=True) @@ -89,7 +89,7 @@ def benchmark_permute( latencies: list[float] = [] for i in range(num_iters): prepare(i) - torch.cuda.synchronize() + torch.accelerator.synchronize() start_event.record() graph.replay() @@ -159,26 +159,26 @@ def benchmark_unpermute( # JIT compilation & warmup input = prepare() run(input) - torch.cuda.synchronize() + torch.accelerator.synchronize() # Capture 10 invocations with CUDA graph graph = torch.cuda.CUDAGraph() with torch.cuda.graph(graph): for _ in range(10): run(input) - torch.cuda.synchronize() + torch.accelerator.synchronize() # Warmup for _ in range(5): graph.replay() - torch.cuda.synchronize() + torch.accelerator.synchronize() start_event = torch.Event(enable_timing=True) end_event = torch.Event(enable_timing=True) latencies: list[float] = [] for i in range(num_iters): - torch.cuda.synchronize() + torch.accelerator.synchronize() start_event.record() graph.replay() end_event.record() diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py index 2c086870c..6548c74f8 100644 --- a/benchmarks/kernels/benchmark_mrope.py +++ b/benchmarks/kernels/benchmark_mrope.py @@ -135,14 +135,14 @@ def benchmark_mrope( key.clone(), ) - torch.cuda.synchronize() + torch.accelerator.synchronize() # Time reference implementation torch_times = [] for _ in range(benchmark_iter): query_clone = query.clone() key_clone = key.clone() - torch.cuda.synchronize() + torch.accelerator.synchronize() start_time = time.time() mrope_helper_class.forward_native( @@ -151,7 +151,7 @@ def benchmark_mrope( key_clone, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() torch_times.append(time.time() - start_time) # Time triton kernel implementation @@ -159,14 +159,14 @@ def benchmark_mrope( for _ in range(benchmark_iter): query_clone = query.clone() key_clone = key.clone() - torch.cuda.synchronize() + torch.accelerator.synchronize() start_time = time.time() mrope_helper_class.forward_cuda( positions, query_clone, key_clone, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() triton_times.append(time.time() - start_time) # Calculate statistics diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index be871d3d1..b6a0b7ad8 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -103,7 +103,7 @@ def main( max_logits = torch.empty_like(exp_sums) def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: - torch.cuda.synchronize() + torch.accelerator.synchronize() if profile: torch.cuda.cudart().cudaProfilerStart() start_time = time.perf_counter() @@ -173,7 +173,7 @@ def main( ) else: raise ValueError(f"Invalid version: {version}") - torch.cuda.synchronize() + torch.accelerator.synchronize() end_time = time.perf_counter() if profile: diff --git a/benchmarks/kernels/benchmark_per_token_group_quant.py b/benchmarks/kernels/benchmark_per_token_group_quant.py index eba4d5102..f2195a6d7 100644 --- a/benchmarks/kernels/benchmark_per_token_group_quant.py +++ b/benchmarks/kernels/benchmark_per_token_group_quant.py @@ -28,7 +28,7 @@ def _time_cuda( # warmup for _ in range(warmup_iters): fn() - torch.cuda.synchronize() + torch.accelerator.synchronize() start = torch.Event(enable_timing=True) end = torch.Event(enable_timing=True) @@ -37,7 +37,7 @@ def _time_cuda( for _ in range(bench_iters): fn() end.record() - torch.cuda.synchronize() + torch.accelerator.synchronize() return start.elapsed_time(end) / bench_iters # ms/iter diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py index 9a21cfe94..d01c7ac37 100644 --- a/benchmarks/kernels/benchmark_quant.py +++ b/benchmarks/kernels/benchmark_quant.py @@ -29,7 +29,7 @@ def main( scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: - torch.cuda.synchronize() + torch.accelerator.synchronize() if profile: torch.cuda.cudart().cudaProfilerStart() start_time = time.perf_counter() @@ -39,7 +39,7 @@ def main( ops.scaled_int8_quant(x, scale) else: ops.scaled_fp8_quant(x, scale) - torch.cuda.synchronize() + torch.accelerator.synchronize() end_time = time.perf_counter() if profile: diff --git a/benchmarks/kernels/benchmark_reshape_and_cache.py b/benchmarks/kernels/benchmark_reshape_and_cache.py index b4c949e4f..97af4ac97 100644 --- a/benchmarks/kernels/benchmark_reshape_and_cache.py +++ b/benchmarks/kernels/benchmark_reshape_and_cache.py @@ -84,16 +84,16 @@ def run_benchmark( g = torch.cuda.CUDAGraph() with torch.cuda.graph(g): function_under_test() - torch.cuda.synchronize() + torch.accelerator.synchronize() function_under_test = lambda: g.replay() def run_cuda_benchmark(n_iters: int) -> float: nonlocal key, value, key_cache, value_cache, slot_mapping - torch.cuda.synchronize() + torch.accelerator.synchronize() start = time.perf_counter() for _ in range(n_iters): function_under_test() - torch.cuda.synchronize() + torch.accelerator.synchronize() end = time.perf_counter() return (end - start) / n_iters diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py index 2a250620b..55c203725 100644 --- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py +++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py @@ -109,16 +109,16 @@ def run_benchmark( g = torch.cuda.CUDAGraph() with torch.cuda.graph(g): function_under_test() - torch.cuda.synchronize() + torch.accelerator.synchronize() function_under_test = lambda: g.replay() def run_cuda_benchmark(n_iters: int) -> float: nonlocal key, value, key_cache, value_cache, slot_mapping - torch.cuda.synchronize() + torch.accelerator.synchronize() start = time.perf_counter() for _ in range(n_iters): function_under_test() - torch.cuda.synchronize() + torch.accelerator.synchronize() end = time.perf_counter() return (end - start) / n_iters diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py index da32bc30c..13b97b769 100644 --- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py +++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py @@ -251,7 +251,7 @@ def benchmark( kernel( y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G ) - torch.cuda.synchronize() + torch.accelerator.synchronize() start_event = torch.Event(enable_timing=True) end_event = torch.Event(enable_timing=True) @@ -259,7 +259,7 @@ def benchmark( # Benchmark latencies: list[float] = [] for _ in range(runs): - torch.cuda.synchronize() + torch.accelerator.synchronize() start_event.record() for i in range(iterations_per_run): diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py index 1d0d6fbb9..89970e2b0 100644 --- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py @@ -126,7 +126,7 @@ def benchmark_decode( ) def time_fn(fn, warmup=10, trials=20): - torch.cuda.synchronize() + torch.accelerator.synchronize() start = torch.Event(enable_timing=True) end = torch.Event(enable_timing=True) times = [] @@ -136,7 +136,7 @@ def benchmark_decode( start.record() fn() end.record() - torch.cuda.synchronize() + torch.accelerator.synchronize() times.append(start.elapsed_time(end)) # ms return sum(times) / len(times), torch.std(torch.tensor(times)) diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py index 84bde723a..6b9d6b7f8 100644 --- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py @@ -138,7 +138,7 @@ def benchmark_prefill( ) def time_fn(fn, warmup=10, trials=20): - torch.cuda.synchronize() + torch.accelerator.synchronize() start = torch.Event(enable_timing=True) end = torch.Event(enable_timing=True) times = [] @@ -148,7 +148,7 @@ def benchmark_prefill( start.record() fn() end.record() - torch.cuda.synchronize() + torch.accelerator.synchronize() times.append(start.elapsed_time(end)) # ms return sum(times) / len(times), torch.std(torch.tensor(times)) diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py index 3a85c5c74..ceae12e98 100644 --- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ -177,18 +177,18 @@ def benchmark_config( def run(): w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype) - torch.cuda.synchronize() + torch.accelerator.synchronize() # JIT complication & warmup for _ in range(5): run() - torch.cuda.synchronize() + torch.accelerator.synchronize() start_event = torch.Event(enable_timing=True) end_event = torch.Event(enable_timing=True) latencies: list[float] = [] for i in range(num_iters): - torch.cuda.synchronize() + torch.accelerator.synchronize() start_event.record() run() end_event.record() diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py index 5a85526a1..4384d3e56 100644 --- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py +++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py @@ -35,7 +35,7 @@ def benchmark_shape( B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16) # Reference result in BF16 - torch.cuda.synchronize() + torch.accelerator.synchronize() C_ref = A @ B.t() # Pre-quantize B for all implementations @@ -121,14 +121,14 @@ def benchmark_shape( # Warmup for _ in range(warmup): func() - torch.cuda.synchronize() + torch.accelerator.synchronize() # Timing loop - torch.cuda.synchronize() + torch.accelerator.synchronize() start = time.time() for _ in range(repeat): func() - torch.cuda.synchronize() + torch.accelerator.synchronize() end = time.time() # Calculate timing and TFLOPS diff --git a/docs/design/model_runner_v2.md b/docs/design/model_runner_v2.md index 487368420..fb40d51ee 100644 --- a/docs/design/model_runner_v2.md +++ b/docs/design/model_runner_v2.md @@ -50,7 +50,7 @@ V1 was not originally designed with async scheduling in mind, and support requir ## 3. Removing Async Barrier -A key requirement for async execution is that CPU operations remain non-blocking. Both explicit sync (for example, `torch.cuda.synchronize`) and implicit sync (for example, unpinned `.to("cuda")`) must be avoided. +A key requirement for async execution is that CPU operations remain non-blocking. Both explicit sync (for example, `torch.accelerator.synchronize`) and implicit sync (for example, unpinned `.to("cuda")`) must be avoided. However, async execution can introduce race conditions when CPU and GPU concurrently touch the same memory. diff --git a/docs/usage/troubleshooting.md b/docs/usage/troubleshooting.md index b482e131d..bced53936 100644 --- a/docs/usage/troubleshooting.md +++ b/docs/usage/troubleshooting.md @@ -95,7 +95,7 @@ If GPU/CPU communication cannot be established, you can use the following Python torch.cuda.set_device(local_rank) data = torch.FloatTensor([1,] * 128).to("cuda") dist.all_reduce(data, op=dist.ReduceOp.SUM) - torch.cuda.synchronize() + torch.accelerator.synchronize() value = data.mean().item() world_size = dist.get_world_size() assert value == world_size, f"Expected {world_size}, got {value}" diff --git a/examples/offline_inference/rlhf_colocate.py b/examples/offline_inference/rlhf_colocate.py index 47dc86fa2..ea4b3a6b9 100644 --- a/examples/offline_inference/rlhf_colocate.py +++ b/examples/offline_inference/rlhf_colocate.py @@ -88,7 +88,7 @@ class RayTrainingActor: # Zero out all the parameters. for name, p in self.model.named_parameters(): p.data.zero_() - torch.cuda.synchronize() + torch.accelerator.synchronize() # The argument for `get_device_uuid` is the index of the GPU in the # list of visible devices. from vllm.platforms import current_platform @@ -151,7 +151,7 @@ class RayTrainingActor: p.data.view(-1).view(dtype=torch.uint8), non_blocking=True ) offset += get_size(p) - torch.cuda.synchronize() + torch.accelerator.synchronize() s.send_pyobj(named_tensors) s.recv() s.send_pyobj(None) diff --git a/examples/offline_inference/rlhf_utils.py b/examples/offline_inference/rlhf_utils.py index a515917f0..e9fc393bb 100644 --- a/examples/offline_inference/rlhf_utils.py +++ b/examples/offline_inference/rlhf_utils.py @@ -120,7 +120,7 @@ class ColocateWorkerExtension: process_weights_after_loading( self.model_runner.model, self.model_config, self.device ) - torch.cuda.synchronize() + torch.accelerator.synchronize() socket.send(b"") break if isinstance(payload, tuple): @@ -144,7 +144,7 @@ class ColocateWorkerExtension: weights.append((item["name"], tensor)) self.model_runner.model.load_weights(weights=weights) del weights - torch.cuda.synchronize() + torch.accelerator.synchronize() socket.send(b"") socket.close() diff --git a/tests/compile/test_dynamic_shapes_compilation.py b/tests/compile/test_dynamic_shapes_compilation.py index 3dcc3c3df..b63a4607c 100644 --- a/tests/compile/test_dynamic_shapes_compilation.py +++ b/tests/compile/test_dynamic_shapes_compilation.py @@ -100,7 +100,7 @@ def test_dynamic_shapes_compilation( del model gc.collect() torch.accelerator.empty_cache() - torch.cuda.synchronize() + torch.accelerator.synchronize() print("GPU memory cleared") diff --git a/tests/distributed/test_ca_buffer_sharing.py b/tests/distributed/test_ca_buffer_sharing.py index 1ddce64f8..acf2e8985 100644 --- a/tests/distributed/test_ca_buffer_sharing.py +++ b/tests/distributed/test_ca_buffer_sharing.py @@ -32,7 +32,7 @@ pointers = CustomAllreduce.create_shared_buffer(buffer_size_in_bytes) print(f"Rank {rank} has pointers {pointers}") dist.barrier() -torch.cuda.synchronize() +torch.accelerator.synchronize() if rank == 0: # the first rank tries to write to all buffers @@ -41,7 +41,7 @@ if rank == 0: lib.cudaMemset(pointer, byte_value, buffer_size_in_bytes) dist.barrier() -torch.cuda.synchronize() +torch.accelerator.synchronize() host_data = (ctypes.c_char * buffer_size_in_bytes)() @@ -59,6 +59,6 @@ for p in pointers: print(f"Rank {rank} verified all buffers") dist.barrier() -torch.cuda.synchronize() +torch.accelerator.synchronize() CustomAllreduce.free_shared_buffer(pointers) diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index 68abc2b98..5008c4de0 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -48,7 +48,7 @@ def graph_allreduce( data = torch.zeros(1) data = data.to(device=device) torch.distributed.all_reduce(data, group=group) - torch.cuda.synchronize() + torch.accelerator.synchronize() del data # we use the first group to communicate once @@ -68,7 +68,7 @@ def graph_allreduce( inp2 = torch.randint( 1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device() ) - torch.cuda.synchronize() + torch.accelerator.synchronize() graph = torch.cuda.CUDAGraph() with torch.cuda.graph(graph, stream=graph_capture_context.stream): for i in range(num_communication): diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index d20710335..3b5b45aa0 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -68,7 +68,7 @@ def worker_fn(): ) tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank) tensor = pynccl_comm.all_reduce(tensor) - torch.cuda.synchronize() + torch.accelerator.synchronize() assert torch.all(tensor == pynccl_comm.world_size).cpu().item() @@ -93,11 +93,11 @@ def multiple_allreduce_worker_fn(): if torch.distributed.get_rank() in [0, 1]: tensor = pynccl_comm.all_reduce(tensor) tensor = pynccl_comm.all_reduce(tensor) - torch.cuda.synchronize() + torch.accelerator.synchronize() assert torch.all(tensor == 4).cpu().item() else: tensor = pynccl_comm.all_reduce(tensor) - torch.cuda.synchronize() + torch.accelerator.synchronize() assert torch.all(tensor == 2).cpu().item() @@ -121,11 +121,11 @@ def multiple_allreduce_with_vllm_worker_fn(): if torch.distributed.get_rank() in [0, 1]: tensor = tensor_model_parallel_all_reduce(tensor) tensor = tensor_model_parallel_all_reduce(tensor) - torch.cuda.synchronize() + torch.accelerator.synchronize() assert torch.all(tensor == 4).cpu().item() else: tensor = tensor_model_parallel_all_reduce(tensor) - torch.cuda.synchronize() + torch.accelerator.synchronize() assert torch.all(tensor == 2).cpu().item() @@ -147,12 +147,12 @@ def worker_fn_with_cudagraph(): ) # run something in the default stream to initialize torch engine a = torch.ones((4, 4), device=f"cuda:{pynccl_comm.rank}") - torch.cuda.synchronize() + torch.accelerator.synchronize() with torch.cuda.graph(graph): a_out = pynccl_comm.all_reduce(a) - torch.cuda.synchronize() + torch.accelerator.synchronize() graph.replay() - torch.cuda.synchronize() + torch.accelerator.synchronize() assert torch.all(a_out == pynccl_comm.world_size).cpu().item() @@ -180,7 +180,7 @@ def all_gather_worker_fn(): ).to(device) pynccl_comm.all_gather(result, tensor) - torch.cuda.synchronize() + torch.accelerator.synchronize() torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8) @@ -215,7 +215,7 @@ def all_gatherv_worker_fn(): ).to(device) pynccl_comm.all_gatherv(result, tensor, sizes=sizes) - torch.cuda.synchronize() + torch.accelerator.synchronize() torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8) @@ -255,7 +255,7 @@ def reduce_scatter_worker_fn(): ).to(device) pynccl_comm.reduce_scatter(result, tensor) - torch.cuda.synchronize() + torch.accelerator.synchronize() torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8) @@ -293,7 +293,7 @@ def reduce_scatterv_worker_fn(): expected = sum(tensor[start:end] for tensor in all_tensors).to(device) pynccl_comm.reduce_scatterv(result, tensor, sizes=sizes) - torch.cuda.synchronize() + torch.accelerator.synchronize() torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8) @@ -325,7 +325,7 @@ def send_recv_worker_fn(): pynccl_comm.send(tensor, dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size) else: pynccl_comm.recv(tensor, src=(pynccl_comm.rank - 1) % pynccl_comm.world_size) - torch.cuda.synchronize() + torch.accelerator.synchronize() assert torch.all(tensor == 1).cpu().item() @@ -355,7 +355,7 @@ def multiple_send_recv_worker_fn(): pynccl_comm.send(tensor, dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size) else: pynccl_comm.recv(tensor, src=(pynccl_comm.rank - 1) % pynccl_comm.world_size) - torch.cuda.synchronize() + torch.accelerator.synchronize() if torch.distributed.get_rank() in [0, 2]: assert torch.all(tensor == 1).cpu().item() else: @@ -396,7 +396,7 @@ def broadcast_worker_fn(): pynccl_comm.broadcast(recv_tensors[i], src=i) # the broadcast op might be launched in a different stream # need to synchronize to make sure the tensor is ready - torch.cuda.synchronize() + torch.accelerator.synchronize() assert torch.all(recv_tensors[i] == i).cpu().item() diff --git a/tests/distributed/test_quick_all_reduce.py b/tests/distributed/test_quick_all_reduce.py index 53d906bbc..5af3101a9 100644 --- a/tests/distributed/test_quick_all_reduce.py +++ b/tests/distributed/test_quick_all_reduce.py @@ -52,7 +52,7 @@ def graph_quickreduce( data = torch.zeros(1) data = data.to(device=device) torch.distributed.all_reduce(data, group=group) - torch.cuda.synchronize() + torch.accelerator.synchronize() del data # we use the first group to communicate once @@ -71,7 +71,7 @@ def graph_quickreduce( inp2 = torch.randint( -23, 1, (sz,), dtype=dtype, device=torch.cuda.current_device() ) - torch.cuda.synchronize() + torch.accelerator.synchronize() graph = torch.cuda.CUDAGraph() with torch.cuda.graph(graph, stream=graph_capture_context.stream): for _ in range(num_communication): diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index 526b6749d..c2fea7c1d 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -79,11 +79,11 @@ def gpu_worker(rank, WORLD_SIZE, port1, port2): data = torch.tensor([rank]).cuda() pynccl1.all_reduce(data) pg1.barrier() - torch.cuda.synchronize() + torch.accelerator.synchronize() if rank <= 2: pynccl2.all_reduce(data) pg2.barrier() - torch.cuda.synchronize() + torch.accelerator.synchronize() item = data[0].item() print(f"rank: {rank}, item: {item}") if rank == 3: diff --git a/tests/distributed/test_weight_transfer.py b/tests/distributed/test_weight_transfer.py index b370721b3..def1e1dfd 100644 --- a/tests/distributed/test_weight_transfer.py +++ b/tests/distributed/test_weight_transfer.py @@ -251,7 +251,7 @@ def trainer_broadcast_tensor( dtype = getattr(torch, tensor_dtype) tensor_to_send = torch.ones(tensor_shape, dtype=dtype, device="cuda:0") comm.broadcast(tensor_to_send, src=0, stream=torch.cuda.current_stream()) - torch.cuda.synchronize() + torch.accelerator.synchronize() return True @@ -309,7 +309,7 @@ def inference_receive_tensor( shapes=[tensor_shape], ) engine.receive_weights(update_info, noop_load_weights) - torch.cuda.synchronize() + torch.accelerator.synchronize() # Verify we received the tensor success = False @@ -630,7 +630,7 @@ class TrainerActor: ipc_handle = reduce_tensor(self.tensor) gpu_uuid = get_physical_gpu_id(0) - torch.cuda.synchronize() + torch.accelerator.synchronize() self.ipc_handle_dict = { "ipc_handle": ipc_handle, @@ -704,7 +704,7 @@ def inference_receive_ipc_tensor( update_info = engine.parse_update_info(update_dict) engine.receive_weights(update_info, noop_load_weights) - torch.cuda.synchronize() + torch.accelerator.synchronize() # Verify we received the tensor success = False diff --git a/tests/kernels/attention/test_merge_attn_states.py b/tests/kernels/attention/test_merge_attn_states.py index a9f525cdc..6fccb8ccf 100644 --- a/tests/kernels/attention/test_merge_attn_states.py +++ b/tests/kernels/attention/test_merge_attn_states.py @@ -165,7 +165,7 @@ def test_merge_attn_states( suffix_lse_torch, output_lse_torch, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() for _ in range(repeat_times): start.record() @@ -178,7 +178,7 @@ def test_merge_attn_states( output_lse_torch, ) end.record() - torch.cuda.synchronize() + torch.accelerator.synchronize() total_time_torch_kernel += start.elapsed_time(end) avg_time_torch_kernel = total_time_torch_kernel / repeat_times @@ -200,7 +200,7 @@ def test_merge_attn_states( suffix_lse, output_lse_ref_triton, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() for _ in range(repeat_times): start.record() @@ -213,7 +213,7 @@ def test_merge_attn_states( output_lse_ref_triton, ) end.record() - torch.cuda.synchronize() + torch.accelerator.synchronize() total_time_triton_kernel += start.elapsed_time(end) avg_time_triton_kernel = total_time_triton_kernel / repeat_times @@ -232,7 +232,7 @@ def test_merge_attn_states( suffix_lse, output_lse_cuda, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() for _ in range(repeat_times): start.record() @@ -245,7 +245,7 @@ def test_merge_attn_states( output_lse_cuda, ) end.record() - torch.cuda.synchronize() + torch.accelerator.synchronize() total_time_cuda_kernel += start.elapsed_time(end) avg_time_cuda_kernel = total_time_cuda_kernel / repeat_times diff --git a/tests/kernels/attention/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py index 2dc4a3cd2..7aeeaf8b4 100644 --- a/tests/kernels/attention/test_prefix_prefill.py +++ b/tests/kernels/attention/test_prefix_prefill.py @@ -239,7 +239,7 @@ def test_contexted_kv_attention( v_scale, sliding_window=sliding_window, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() start_time = time.time() op( query, @@ -258,7 +258,7 @@ def test_contexted_kv_attention( v_scale, sliding_window=sliding_window, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() end_time = time.time() print(f"triton Time: {(end_time - start_time) * 1000:.2f} ms") @@ -298,7 +298,7 @@ def test_contexted_kv_attention( dropout_p=0.0, scale=scale, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() start_time = time.time() output_ref = F.scaled_dot_product_attention( query_sdpa, @@ -308,7 +308,7 @@ def test_contexted_kv_attention( dropout_p=0.0, scale=scale, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() end_time = time.time() print(f"PyTorch SDPA Time: {(end_time - start_time) * 1000:.2f} ms") @@ -482,7 +482,7 @@ def test_contexted_kv_attention_alibi( v_scale, alibi_slopes=alibi_slopes, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() start_time = time.time() op( query, @@ -501,7 +501,7 @@ def test_contexted_kv_attention_alibi( v_scale, alibi_slopes=alibi_slopes, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() end_time = time.time() print(f"triton Time: {(end_time - start_time) * 1000:.2f} ms") scale = float(1.0 / (head_size**0.5)) @@ -517,7 +517,7 @@ def test_contexted_kv_attention_alibi( output_ref = torch.empty_like(output) - torch.cuda.synchronize() + torch.accelerator.synchronize() start_time = time.time() query_start = 0 @@ -572,7 +572,7 @@ def test_contexted_kv_attention_alibi( query_start = query_end key_start = key_end - torch.cuda.synchronize() + torch.accelerator.synchronize() end_time = time.time() print(f"PyTorch SDPA Time: {(end_time - start_time) * 1000:.2f} ms") atol = 1e-3 if "fp8" in kv_cache_dtype else 1e-6 diff --git a/tests/kernels/core/test_layernorm.py b/tests/kernels/core/test_layernorm.py index 416395e59..2dca0da07 100644 --- a/tests/kernels/core/test_layernorm.py +++ b/tests/kernels/core/test_layernorm.py @@ -127,7 +127,7 @@ def test_fused_rms_norm_quant( out_quant, x_unfused.contiguous(), quant_scale_t ) - torch.cuda.synchronize() + torch.accelerator.synchronize() torch.testing.assert_close(residual_fused, residual, atol=1e-2, rtol=1e-2) opcheck( torch.ops._C.fused_add_rms_norm_static_fp8_quant, diff --git a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py index 2554c4fce..9f0f9f2ea 100644 --- a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py +++ b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py @@ -34,7 +34,7 @@ def do_profile( record_shapes=True, ) as tprof: fn(**fn_kwargs) - torch.cuda.synchronize(torch.cuda.current_device()) + torch.accelerator.synchronize(torch.cuda.current_device()) # TODO (varun): Add a descriptive trace file name tprof.export_chrome_trace( diff --git a/tests/kernels/moe/test_block_fp8.py b/tests/kernels/moe/test_block_fp8.py index a74e739c5..7011786f2 100644 --- a/tests/kernels/moe/test_block_fp8.py +++ b/tests/kernels/moe/test_block_fp8.py @@ -318,8 +318,8 @@ def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed, monkeypatch) out = deep_gemm_moe_fp8_fn( a, w1, w2, w1_s, w2_s, topk_weights, topk_ids ) - torch.cuda.synchronize() + torch.accelerator.synchronize() graph.replay() - torch.cuda.synchronize() + torch.accelerator.synchronize() torch.testing.assert_close(out, ref_out, atol=0.035, rtol=0.035) diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index 1ec2c614c..c1cf8b2d3 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -399,9 +399,9 @@ def test_cutlass_moe_8_bit_cuda_graph( mt, topk_weights, topk_ids, per_act_token, per_out_ch ) - torch.cuda.synchronize() + torch.accelerator.synchronize() graph.replay() - torch.cuda.synchronize() + torch.accelerator.synchronize() torch.testing.assert_close(triton_output, cutlass_output, atol=9e-2, rtol=1e-2) diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index f8e2a8b52..43bdd03cf 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -272,9 +272,9 @@ def run_moe_test( global_num_experts=global_num_experts, expert_map=expert_map, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() graph.replay() - torch.cuda.synchronize() + torch.accelerator.synchronize() torch.testing.assert_close(test_output, baseline_output, atol=atol, rtol=rtol) @@ -768,7 +768,7 @@ def test_mixtral_moe( F.pad(vllm_moe.experts.w2_weight, (0, 128), "constant", 0)[..., 0:-128], requires_grad=False, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() torch.accelerator.empty_cache() # FIXME (zyongye) fix this after we move self.kernel diff --git a/tests/kernels/quantization/test_allspark_gemm.py b/tests/kernels/quantization/test_allspark_gemm.py index 7f6adbd52..b6272557c 100644 --- a/tests/kernels/quantization/test_allspark_gemm.py +++ b/tests/kernels/quantization/test_allspark_gemm.py @@ -122,7 +122,7 @@ def test_gptq_allspark_gemm_ampere(mnk_factors, group_size, has_zp, dtype): ) output_ref = torch.matmul(input, w_ref) - torch.cuda.synchronize() + torch.accelerator.synchronize() max_diff = compute_max_diff(output, output_ref) assert max_diff < 0.04 diff --git a/tests/kernels/quantization/test_cutlass_w4a8_moe.py b/tests/kernels/quantization/test_cutlass_w4a8_moe.py index de0e347d8..5e6c170db 100644 --- a/tests/kernels/quantization/test_cutlass_w4a8_moe.py +++ b/tests/kernels/quantization/test_cutlass_w4a8_moe.py @@ -269,7 +269,7 @@ def test_cutlass_w4a8_moe_mm_end_to_end(shape, random_zero): setup.c_strides, setup.group_scale_strides, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() out_ref = compute_moe_reference_output(setup) torch.testing.assert_close(setup.out, out_ref, rtol=1e-2, atol=1e-2) diff --git a/tests/kernels/quantization/test_marlin_gemm.py b/tests/kernels/quantization/test_marlin_gemm.py index 3453753ec..f918212f7 100644 --- a/tests/kernels/quantization/test_marlin_gemm.py +++ b/tests/kernels/quantization/test_marlin_gemm.py @@ -260,7 +260,7 @@ def test_gptq_marlin_repack( marlin_q_w_2 = ops.gptq_marlin_repack( q_w_gptq, sort_indices, size_k, size_n, quant_type.size_bits, is_a_8bit ) - torch.cuda.synchronize() + torch.accelerator.synchronize() torch.testing.assert_close(marlin_q_w_1, marlin_q_w_2) @@ -308,7 +308,7 @@ def test_awq_marlin_repack(k_chunk, n_chunk, quant_type, is_a_8bit, nk_factors): marlin_q_w_2 = ops.awq_marlin_repack( q_w_awq, size_k, size_n, quant_type.size_bits, is_a_8bit ) - torch.cuda.synchronize() + torch.accelerator.synchronize() torch.testing.assert_close(marlin_q_w_1, marlin_q_w_2) @@ -564,7 +564,7 @@ def test_marlin_gemm_subset_input(): ) output_ref = torch.matmul(a_input, w_ref) - torch.cuda.synchronize() + torch.accelerator.synchronize() max_diff = compute_max_diff(output, output_ref) @@ -613,7 +613,7 @@ def test_marlin_gemm_with_bias(size_m): ) output_ref = torch.matmul(a_input, w_ref) + b_bias.view(1, -1) - torch.cuda.synchronize() + torch.accelerator.synchronize() max_diff = compute_max_diff(output, output_ref) diff --git a/tests/kernels/test_cache_kernels.py b/tests/kernels/test_cache_kernels.py index b5d66b4ed..4cc8e3b14 100644 --- a/tests/kernels/test_cache_kernels.py +++ b/tests/kernels/test_cache_kernels.py @@ -57,7 +57,7 @@ def test_gather_cache_oob(): seq_starts, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() assert True diff --git a/tests/kernels/test_top_k_per_row.py b/tests/kernels/test_top_k_per_row.py index 9b96e6dfc..f4bfc1666 100644 --- a/tests/kernels/test_top_k_per_row.py +++ b/tests/kernels/test_top_k_per_row.py @@ -219,7 +219,7 @@ def _run_top_k_per_row_decode_test( top_k, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() # Run reference implementation torch_indices = torch.empty((num_rows, top_k), dtype=torch.int32, device="cuda") diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index 1425bb044..b43ac453a 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -195,4 +195,4 @@ def test_models( # unit tests. On ROCm, when using AITER # the memory might not be deallocated completely # before running the next test case - torch.cuda.synchronize() + torch.accelerator.synchronize() diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index e5a047a7c..9d31a3f87 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -196,7 +196,7 @@ def test_compressed_tensors_w8a8_logprobs( ) if current_platform.is_rocm(): - torch.cuda.synchronize() + torch.accelerator.synchronize() def test_compressed_tensors_no_enforce_eager(vllm_runner): diff --git a/tools/pre_commit/check_torch_cuda.py b/tools/pre_commit/check_torch_cuda.py index f2e3cbf26..356650863 100644 --- a/tools/pre_commit/check_torch_cuda.py +++ b/tools/pre_commit/check_torch_cuda.py @@ -9,6 +9,7 @@ import regex as re # --------------------------------------------------------------------------- # _TORCH_CUDA_PATTERNS = [ r"\btorch\.cuda\.empty_cache\b", + r"\btorch\.cuda\.synchronize\b", ] ALLOWED_FILES = {"vllm/platforms/", "vllm/device_allocator/"} diff --git a/vllm/distributed/elastic_ep/elastic_execute.py b/vllm/distributed/elastic_ep/elastic_execute.py index f32ea39fb..516d2c256 100644 --- a/vllm/distributed/elastic_ep/elastic_execute.py +++ b/vllm/distributed/elastic_ep/elastic_execute.py @@ -217,7 +217,7 @@ class ElasticEPScalingExecutor: dp_group=standby_dp_group, expert_weights=model.expert_weights, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() def broadcast_expert_mapping(self) -> None: standby_dp_group = get_standby_dp_group() @@ -407,7 +407,7 @@ class ElasticEPScalingExecutor: reset_compile_wrapper(self.worker.model_runner.get_model()) gc.collect() - torch.cuda.synchronize() + torch.accelerator.synchronize() torch.accelerator.empty_cache() unlock_workspace() self.worker.compile_or_warm_up_model() @@ -446,7 +446,7 @@ class ElasticEPScalingExecutor: eplb_state.rearrange(rank_mapping=rank_mapping) # NOTE(yongji): check whether we need to synchronize here - torch.cuda.synchronize() + torch.accelerator.synchronize() # reset expert_rearrangement_step to ensure all ranks are synchronized eplb_state.expert_rearrangement_step = 0 eplb_state.num_valid_physical_experts = ( @@ -491,7 +491,7 @@ class ElasticEPScalingExecutor: dp_group=dp_group, expert_weights=model.expert_weights, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() def receive_expert_mapping(self) -> tuple[torch.Tensor, int, int]: dp_group = get_dp_group() diff --git a/vllm/distributed/eplb/rebalance_execute.py b/vllm/distributed/eplb/rebalance_execute.py index 777f9c553..7823ce4a3 100644 --- a/vllm/distributed/eplb/rebalance_execute.py +++ b/vllm/distributed/eplb/rebalance_execute.py @@ -622,7 +622,7 @@ def rearrange_expert_weights_inplace( # NOTE(bowen): We need this synchronize to run, but I don't know why. # If you figure out the reason, please let me know -- thank you! - torch.cuda.synchronize() + torch.accelerator.synchronize() old_global_expert_indices_cpu = old_global_expert_indices.cpu().numpy() new_global_expert_indices_cpu = new_global_expert_indices.cpu().numpy() diff --git a/vllm/model_executor/kernels/linear/mixed_precision/cutlass.py b/vllm/model_executor/kernels/linear/mixed_precision/cutlass.py index 553f3cb04..184a7f71d 100644 --- a/vllm/model_executor/kernels/linear/mixed_precision/cutlass.py +++ b/vllm/model_executor/kernels/linear/mixed_precision/cutlass.py @@ -77,7 +77,7 @@ class CutlassW4A8LinearKernel(MPLinearKernel): def transform_w_q(x): assert isinstance(x, BasevLLMParameter) convert_packed_uint4b8_to_signed_int4_inplace(x.data) - torch.cuda.synchronize() + torch.accelerator.synchronize() permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0) x.data = ops.cutlass_encode_and_reorder_int4b(x.data.t().contiguous().t()) return x diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py index db158e4fe..8cb65c4d2 100644 --- a/vllm/v1/worker/gpu/model_runner.py +++ b/vllm/v1/worker/gpu/model_runner.py @@ -457,7 +457,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): else: self._dummy_pooler_run(hidden_states) - torch.cuda.synchronize() + torch.accelerator.synchronize() del hidden_states, sample_hidden_states gc.collect() @@ -525,7 +525,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): # to trigger JIT compilation. if all("FLASHINFER" in b.get_name() for b in self.attn_backends.values()): self._dummy_run(self.max_num_tokens, skip_attn=False) - torch.cuda.synchronize() + torch.accelerator.synchronize() def finish_requests(self, scheduler_output: SchedulerOutput) -> None: finished_req_ids = scheduler_output.finished_req_ids diff --git a/vllm/v1/worker/gpu/warmup.py b/vllm/v1/worker/gpu/warmup.py index ffe5b33f7..9d70a56f5 100644 --- a/vllm/v1/worker/gpu/warmup.py +++ b/vllm/v1/worker/gpu/warmup.py @@ -102,4 +102,4 @@ def warmup_kernels(model_runner: GPUModelRunner) -> None: cleanup_output.finished_req_ids = set(req_ids) model_runner.execute_model(cleanup_output) model_runner.kv_connector.set_disabled(False) - torch.cuda.synchronize() + torch.accelerator.synchronize() diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 29fe9ec83..29a5e46ab 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -928,7 +928,7 @@ class GPUModelRunner( # Note: used for model runner override. def _sync_device(self) -> None: - torch.cuda.synchronize() + torch.accelerator.synchronize() def _update_states(self, scheduler_output: "SchedulerOutput") -> None: """Update the cached states and the persistent batch with the scheduler @@ -5345,7 +5345,7 @@ class GPUModelRunner( cudagraph_runtime_mode=runtime_mode, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() end_free_gpu_memory = torch.cuda.mem_get_info()[0] # Disable cudagraph capturing globally, so any unexpected cudagraph @@ -6266,13 +6266,13 @@ class GPUModelRunner( group_refs = group_lora_refs[current_item_idx : current_item_idx + num_items] group_request_ids = {req_id for req_id, _ in group_refs} - torch.cuda.synchronize() + torch.accelerator.synchronize() start_time = time.perf_counter() try: yield finally: - torch.cuda.synchronize() + torch.accelerator.synchronize() elapsed = time.perf_counter() - start_time per_request_time = elapsed / max(len(group_request_ids), 1) diff --git a/vllm/v1/worker/xpu_model_runner.py b/vllm/v1/worker/xpu_model_runner.py index 540c9cb20..ddefa7495 100644 --- a/vllm/v1/worker/xpu_model_runner.py +++ b/vllm/v1/worker/xpu_model_runner.py @@ -29,9 +29,6 @@ class XPUModelRunner(GPUModelRunner): # FIXME: To be verified. self.cascade_attn_enabled = False - def _sync_device(self) -> None: - torch.xpu.synchronize() - @contextmanager def _torch_cuda_wrapper(): @@ -42,7 +39,6 @@ def _torch_cuda_wrapper(): torch.cuda.current_stream = torch.xpu.current_stream torch.cuda.stream = torch.xpu.stream torch.cuda.mem_get_info = torch.xpu.mem_get_info - torch.cuda.synchronize = torch.xpu.synchronize if supports_xpu_graph(): torch.cuda.graph = torch.xpu.graph torch.cuda.CUDAGraph = torch.xpu.XPUGraph