[BugFix] Re-fix async multimodal cpu tensor race condition (#31373)

Signed-off-by: Nick Hill <nickhill123@gmail.com>
Signed-off-by: njhill <nickhill123@gmail.com>
This commit is contained in:
Nick Hill
2025-12-28 03:05:08 -08:00
committed by GitHub
parent 573dd0e6f0
commit 094fcce250

View File

@@ -3058,8 +3058,10 @@ class GPUModelRunner(
         scheduler_output = deepcopy(scheduler_output)
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
-        with record_function_or_nullcontext("gpu_model_runner: preprocess"):
-            with self.synchronize_input_prep():
+        with (
+            record_function_or_nullcontext("gpu_model_runner: preprocess"),
+            self.synchronize_input_prep(),
+        ):
             # Update persistent batch states.
             self._update_states(scheduler_output)
@@ -3087,9 +3089,8 @@ class GPUModelRunner(
             if not has_kv_transfer_group():
                 # Return empty ModelRunnerOutput if no work to do.
                 return EMPTY_MODEL_RUNNER_OUTPUT
-            return self.kv_connector_no_forward(
-                scheduler_output, self.vllm_config
-            )
+            return self.kv_connector_no_forward(scheduler_output, self.vllm_config)
         if self.cache_config.kv_sharing_fast_prefill:
             assert not self.num_prompt_logprobs, (
                 "--kv-sharing-fast-prefill produces incorrect "
@@ -3104,10 +3105,7 @@ class GPUModelRunner(
         max_num_scheduled_tokens = int(num_scheduled_tokens_np.max())
         num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens
-        (
-            logits_indices,
-            spec_decode_metadata,
-        ) = self._prepare_inputs(
+        logits_indices, spec_decode_metadata = self._prepare_inputs(
             scheduler_output,
             num_scheduled_tokens_np,
         )
@@ -3169,7 +3167,7 @@ class GPUModelRunner(
         use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0
         ubatch_slices_attn = ubatch_slices_padded if pad_attn else ubatch_slices
-        (attn_metadata, spec_decode_common_attn_metadata) = (
+        attn_metadata, spec_decode_common_attn_metadata = (
             self._build_attention_metadata(
                 num_tokens=num_tokens_unpadded,
                 num_tokens_padded=num_tokens_padded if pad_attn else None,