fix: use raw cudaDeviceSynchronize instead of DG_CUDA_CHECK macro
This commit is contained in:
@@ -91,8 +91,8 @@ static void __instantiate_kernel() {{
|
||||
static void launch_impl(const KernelHandle& kernel, const LaunchConfigHandle& config, Args args) {
|
||||
// Debug: sync before launch to flush TMA creation errors
|
||||
if (get_env<int>("DG_JIT_DEBUG")) {
|
||||
DG_CUDA_CHECK(cudaDeviceSynchronize());
|
||||
printf("[MEGA_MOE_LAUNCH_DEBUG] About to launch kernel, y=%p, num_tokens=%d\n", args.y, args.num_tokens);
|
||||
auto err = cudaDeviceSynchronize();
|
||||
printf("[MEGA_MOE_LAUNCH_DEBUG] Pre-launch sync: %d, about to launch kernel, y=%p, num_tokens=%d\n", (int)err, args.y, args.num_tokens);
|
||||
fflush(stdout);
|
||||
}
|
||||
DG_CUDA_UNIFIED_CHECK(launch_kernel(kernel, config,
|
||||
|
||||
Reference in New Issue
Block a user