fix: call cudaDeviceSynchronize directly instead of via the DG_CUDA_CHECK macro, and print its result in the pre-launch debug log

This commit is contained in:
2026-05-13 12:17:26 +00:00
parent c08a28888d
commit 6a348d543d

View File

@@ -91,8 +91,8 @@ static void __instantiate_kernel() {{
static void launch_impl(const KernelHandle& kernel, const LaunchConfigHandle& config, Args args) {
// Debug: sync before launch to flush TMA creation errors
if (get_env<int>("DG_JIT_DEBUG")) {
DG_CUDA_CHECK(cudaDeviceSynchronize());
printf("[MEGA_MOE_LAUNCH_DEBUG] About to launch kernel, y=%p, num_tokens=%d\n", args.y, args.num_tokens);
auto err = cudaDeviceSynchronize();
printf("[MEGA_MOE_LAUNCH_DEBUG] Pre-launch sync: %d, about to launch kernel, y=%p, num_tokens=%d\n", (int)err, args.y, args.num_tokens);
fflush(stdout);
}
DG_CUDA_UNIFIED_CHECK(launch_kernel(kernel, config,