From f68e3ea4e135c41b997d8226186509659b3d722c Mon Sep 17 00:00:00 2001
From: Jinwu <70835312+ayrnb@users.noreply.github.com>
Date: Sat, 31 Jan 2026 00:14:54 -0800
Subject: [PATCH] [BugFix] Add synchronize in CutlassW4A8LinearKernel to ensure data is ready for use. (#33078)

Co-authored-by: jinwuguo
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
---
 .../layers/quantization/kernels/mixed_precision/cutlass.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py
index c9c1a3abf..553f3cb04 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py
@@ -77,6 +77,7 @@ class CutlassW4A8LinearKernel(MPLinearKernel):
         def transform_w_q(x):
             assert isinstance(x, BasevLLMParameter)
             convert_packed_uint4b8_to_signed_int4_inplace(x.data)
+            torch.cuda.synchronize()
             permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
             x.data = ops.cutlass_encode_and_reorder_int4b(x.data.t().contiguous().t())
             return x