From 6a348d543de00e0bea16f3ed93dac68bf621c308 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Wed, 13 May 2026 12:17:26 +0000 Subject: [PATCH] fix: use raw cudaDeviceSynchronize instead of DG_CUDA_CHECK macro --- csrc/jit_kernels/impls/sm100_fp8_nvfp4_mega_moe.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/jit_kernels/impls/sm100_fp8_nvfp4_mega_moe.hpp b/csrc/jit_kernels/impls/sm100_fp8_nvfp4_mega_moe.hpp index 52b245e..c76a67e 100644 --- a/csrc/jit_kernels/impls/sm100_fp8_nvfp4_mega_moe.hpp +++ b/csrc/jit_kernels/impls/sm100_fp8_nvfp4_mega_moe.hpp @@ -91,8 +91,8 @@ static void __instantiate_kernel() {{ static void launch_impl(const KernelHandle& kernel, const LaunchConfigHandle& config, Args args) { // Debug: sync before launch to flush TMA creation errors if (get_env("DG_JIT_DEBUG")) { - DG_CUDA_CHECK(cudaDeviceSynchronize()); - printf("[MEGA_MOE_LAUNCH_DEBUG] About to launch kernel, y=%p, num_tokens=%d\n", args.y, args.num_tokens); + auto err = cudaDeviceSynchronize(); + printf("[MEGA_MOE_LAUNCH_DEBUG] Pre-launch sync: %d, about to launch kernel, y=%p, num_tokens=%d\n", (int)err, args.y, args.num_tokens); fflush(stdout); } DG_CUDA_UNIFIED_CHECK(launch_kernel(kernel, config,