[BugFix] Re-fix async multimodal cpu tensor race condition (#31373)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Signed-off-by: njhill <nickhill123@gmail.com>
This commit is contained in:
@@ -3058,8 +3058,10 @@ class GPUModelRunner(
|
|||||||
scheduler_output = deepcopy(scheduler_output)
|
scheduler_output = deepcopy(scheduler_output)
|
||||||
|
|
||||||
num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
|
num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
|
||||||
with record_function_or_nullcontext("gpu_model_runner: preprocess"):
|
with (
|
||||||
with self.synchronize_input_prep():
|
record_function_or_nullcontext("gpu_model_runner: preprocess"),
|
||||||
|
self.synchronize_input_prep(),
|
||||||
|
):
|
||||||
# Update persistent batch states.
|
# Update persistent batch states.
|
||||||
self._update_states(scheduler_output)
|
self._update_states(scheduler_output)
|
||||||
|
|
||||||
@@ -3087,9 +3089,8 @@ class GPUModelRunner(
|
|||||||
if not has_kv_transfer_group():
|
if not has_kv_transfer_group():
|
||||||
# Return empty ModelRunnerOutput if no work to do.
|
# Return empty ModelRunnerOutput if no work to do.
|
||||||
return EMPTY_MODEL_RUNNER_OUTPUT
|
return EMPTY_MODEL_RUNNER_OUTPUT
|
||||||
return self.kv_connector_no_forward(
|
return self.kv_connector_no_forward(scheduler_output, self.vllm_config)
|
||||||
scheduler_output, self.vllm_config
|
|
||||||
)
|
|
||||||
if self.cache_config.kv_sharing_fast_prefill:
|
if self.cache_config.kv_sharing_fast_prefill:
|
||||||
assert not self.num_prompt_logprobs, (
|
assert not self.num_prompt_logprobs, (
|
||||||
"--kv-sharing-fast-prefill produces incorrect "
|
"--kv-sharing-fast-prefill produces incorrect "
|
||||||
@@ -3104,10 +3105,7 @@ class GPUModelRunner(
|
|||||||
max_num_scheduled_tokens = int(num_scheduled_tokens_np.max())
|
max_num_scheduled_tokens = int(num_scheduled_tokens_np.max())
|
||||||
num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens
|
num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens
|
||||||
|
|
||||||
(
|
logits_indices, spec_decode_metadata = self._prepare_inputs(
|
||||||
logits_indices,
|
|
||||||
spec_decode_metadata,
|
|
||||||
) = self._prepare_inputs(
|
|
||||||
scheduler_output,
|
scheduler_output,
|
||||||
num_scheduled_tokens_np,
|
num_scheduled_tokens_np,
|
||||||
)
|
)
|
||||||
@@ -3169,7 +3167,7 @@ class GPUModelRunner(
|
|||||||
use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0
|
use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0
|
||||||
ubatch_slices_attn = ubatch_slices_padded if pad_attn else ubatch_slices
|
ubatch_slices_attn = ubatch_slices_padded if pad_attn else ubatch_slices
|
||||||
|
|
||||||
(attn_metadata, spec_decode_common_attn_metadata) = (
|
attn_metadata, spec_decode_common_attn_metadata = (
|
||||||
self._build_attention_metadata(
|
self._build_attention_metadata(
|
||||||
num_tokens=num_tokens_unpadded,
|
num_tokens=num_tokens_unpadded,
|
||||||
num_tokens_padded=num_tokens_padded if pad_attn else None,
|
num_tokens_padded=num_tokens_padded if pad_attn else None,
|
||||||
|
|||||||
Reference in New Issue
Block a user