Patch description (leading text; patch tools skip everything before the
"diff --git" header line).

Touches the pybind11 binding "blackwell_swizzle_32_4_4" in
dsv4/kernels/cuda/blackwell_swizzle.cu. Inside the bound lambda, the result of
at::cuda::getCurrentCUDAStream().stream() is now stored in a local named
`stream` before the launch, and that local is passed as the last argument of
launch_blackwell_swizzle(...). Previously the at::cuda::getCurrentCUDAStream()
result was passed directly as the last argument, without calling .stream().

The other arguments (input.data_ptr(), output.data_ptr(), rows, cols) and the
binding's name and docstring are untouched context.

NOTE(review): presumably launch_blackwell_swizzle takes a raw cudaStream_t as
its last parameter; its signature is not part of this hunk, so confirm.
NOTE(review): the indentation of the hunk lines below was reconstructed from a
whitespace-collapsed copy of this patch. Verify it against the target file, or
apply with whitespace-insensitive matching.

diff --git a/dsv4/kernels/cuda/blackwell_swizzle.cu b/dsv4/kernels/cuda/blackwell_swizzle.cu
index 9a2e5079..71f2029b 100644
--- a/dsv4/kernels/cuda/blackwell_swizzle.cu
+++ b/dsv4/kernels/cuda/blackwell_swizzle.cu
@@ -104,11 +104,12 @@ void launch_blackwell_swizzle(
 // Pybind11 bindings for torch.utils.cpp_extension.load
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
     m.def("blackwell_swizzle_32_4_4",
         [](at::Tensor input, at::Tensor output, int32_t rows, int32_t cols) {
+            auto stream = at::cuda::getCurrentCUDAStream().stream();
             launch_blackwell_swizzle(
                 input.data_ptr(), output.data_ptr(),
                 rows, cols,
-                at::cuda::getCurrentCUDAStream()
+                stream
             );
         }, "Blackwell 32_4_4 scale swizzle");
 }