From c08a28888dd5259d0abea844c3a54aad87e0a496 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Wed, 13 May 2026 12:15:49 +0000 Subject: [PATCH] debug: sync + printf before mega_moe kernel launch --- csrc/jit_kernels/impls/sm100_fp8_nvfp4_mega_moe.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/csrc/jit_kernels/impls/sm100_fp8_nvfp4_mega_moe.hpp b/csrc/jit_kernels/impls/sm100_fp8_nvfp4_mega_moe.hpp index f99de14..52b245e 100644 --- a/csrc/jit_kernels/impls/sm100_fp8_nvfp4_mega_moe.hpp +++ b/csrc/jit_kernels/impls/sm100_fp8_nvfp4_mega_moe.hpp @@ -89,6 +89,12 @@ static void __instantiate_kernel() {{ } static void launch_impl(const KernelHandle& kernel, const LaunchConfigHandle& config, Args args) { + // Debug: sync before launch to flush TMA creation errors + if (get_env("DG_JIT_DEBUG")) { + DG_CUDA_CHECK(cudaDeviceSynchronize()); + printf("[MEGA_MOE_LAUNCH_DEBUG] About to launch kernel, y=%p, num_tokens=%d\n", args.y, args.num_tokens); + fflush(stdout); + } DG_CUDA_UNIFIED_CHECK(launch_kernel(kernel, config, args.y, args.cumulative_local_expert_recv_stats,