[BugFix] Re-fix async multimodal cpu tensor race condition (#31373)
Signed-off-by: Nick Hill <nickhill123@gmail.com> Signed-off-by: njhill <nickhill123@gmail.com>
This commit is contained in:
@@ -3058,131 +3058,129 @@ class GPUModelRunner(
|
|||||||
scheduler_output = deepcopy(scheduler_output)
|
scheduler_output = deepcopy(scheduler_output)
|
||||||
|
|
||||||
num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
|
num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
|
||||||
with record_function_or_nullcontext("gpu_model_runner: preprocess"):
|
with (
|
||||||
with self.synchronize_input_prep():
|
record_function_or_nullcontext("gpu_model_runner: preprocess"),
|
||||||
# Update persistent batch states.
|
self.synchronize_input_prep(),
|
||||||
self._update_states(scheduler_output)
|
):
|
||||||
|
# Update persistent batch states.
|
||||||
|
self._update_states(scheduler_output)
|
||||||
|
|
||||||
if has_ec_transfer() and get_ec_transfer().is_producer:
|
if has_ec_transfer() and get_ec_transfer().is_producer:
|
||||||
with self.maybe_get_ec_connector_output(
|
with self.maybe_get_ec_connector_output(
|
||||||
scheduler_output,
|
|
||||||
encoder_cache=self.encoder_cache,
|
|
||||||
) as ec_connector_output:
|
|
||||||
self._execute_mm_encoder(scheduler_output)
|
|
||||||
return make_empty_encoder_model_runner_output(scheduler_output)
|
|
||||||
|
|
||||||
if not num_scheduled_tokens:
|
|
||||||
if (
|
|
||||||
self.parallel_config.distributed_executor_backend
|
|
||||||
== "external_launcher"
|
|
||||||
and self.parallel_config.data_parallel_size > 1
|
|
||||||
):
|
|
||||||
# this is a corner case when both external launcher
|
|
||||||
# and DP are enabled, num_scheduled_tokens could be
|
|
||||||
# 0, and has_unfinished_requests in the outer loop
|
|
||||||
# returns True. before returning early here we call
|
|
||||||
# dummy run to ensure coordinate_batch_across_dp
|
|
||||||
# is called into to avoid out of sync issues.
|
|
||||||
self._dummy_run(1)
|
|
||||||
if not has_kv_transfer_group():
|
|
||||||
# Return empty ModelRunnerOutput if no work to do.
|
|
||||||
return EMPTY_MODEL_RUNNER_OUTPUT
|
|
||||||
return self.kv_connector_no_forward(
|
|
||||||
scheduler_output, self.vllm_config
|
|
||||||
)
|
|
||||||
if self.cache_config.kv_sharing_fast_prefill:
|
|
||||||
assert not self.num_prompt_logprobs, (
|
|
||||||
"--kv-sharing-fast-prefill produces incorrect "
|
|
||||||
"logprobs for prompt tokens, tokens, please disable "
|
|
||||||
"it when the requests need prompt logprobs"
|
|
||||||
)
|
|
||||||
|
|
||||||
num_reqs = self.input_batch.num_reqs
|
|
||||||
req_ids = self.input_batch.req_ids
|
|
||||||
tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids]
|
|
||||||
num_scheduled_tokens_np = np.array(tokens, dtype=np.int32)
|
|
||||||
max_num_scheduled_tokens = int(num_scheduled_tokens_np.max())
|
|
||||||
num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens
|
|
||||||
|
|
||||||
(
|
|
||||||
logits_indices,
|
|
||||||
spec_decode_metadata,
|
|
||||||
) = self._prepare_inputs(
|
|
||||||
scheduler_output,
|
scheduler_output,
|
||||||
num_scheduled_tokens_np,
|
encoder_cache=self.encoder_cache,
|
||||||
|
) as ec_connector_output:
|
||||||
|
self._execute_mm_encoder(scheduler_output)
|
||||||
|
return make_empty_encoder_model_runner_output(scheduler_output)
|
||||||
|
|
||||||
|
if not num_scheduled_tokens:
|
||||||
|
if (
|
||||||
|
self.parallel_config.distributed_executor_backend
|
||||||
|
== "external_launcher"
|
||||||
|
and self.parallel_config.data_parallel_size > 1
|
||||||
|
):
|
||||||
|
# this is a corner case when both external launcher
|
||||||
|
# and DP are enabled, num_scheduled_tokens could be
|
||||||
|
# 0, and has_unfinished_requests in the outer loop
|
||||||
|
# returns True. before returning early here we call
|
||||||
|
# dummy run to ensure coordinate_batch_across_dp
|
||||||
|
# is called into to avoid out of sync issues.
|
||||||
|
self._dummy_run(1)
|
||||||
|
if not has_kv_transfer_group():
|
||||||
|
# Return empty ModelRunnerOutput if no work to do.
|
||||||
|
return EMPTY_MODEL_RUNNER_OUTPUT
|
||||||
|
return self.kv_connector_no_forward(scheduler_output, self.vllm_config)
|
||||||
|
|
||||||
|
if self.cache_config.kv_sharing_fast_prefill:
|
||||||
|
assert not self.num_prompt_logprobs, (
|
||||||
|
"--kv-sharing-fast-prefill produces incorrect "
|
||||||
|
"logprobs for prompt tokens, tokens, please disable "
|
||||||
|
"it when the requests need prompt logprobs"
|
||||||
)
|
)
|
||||||
|
|
||||||
cascade_attn_prefix_lens = None
|
num_reqs = self.input_batch.num_reqs
|
||||||
# Disable cascade attention when using microbatching (DBO)
|
req_ids = self.input_batch.req_ids
|
||||||
if self.cascade_attn_enabled and not self.parallel_config.use_ubatching:
|
tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids]
|
||||||
# Pre-compute cascade attention prefix lengths
|
num_scheduled_tokens_np = np.array(tokens, dtype=np.int32)
|
||||||
cascade_attn_prefix_lens = self._compute_cascade_attn_prefix_lens(
|
max_num_scheduled_tokens = int(num_scheduled_tokens_np.max())
|
||||||
num_scheduled_tokens_np,
|
num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens
|
||||||
self.input_batch.num_computed_tokens_cpu[:num_reqs],
|
|
||||||
scheduler_output.num_common_prefix_blocks,
|
|
||||||
)
|
|
||||||
|
|
||||||
(
|
logits_indices, spec_decode_metadata = self._prepare_inputs(
|
||||||
cudagraph_mode,
|
scheduler_output,
|
||||||
batch_desc,
|
num_scheduled_tokens_np,
|
||||||
should_ubatch,
|
)
|
||||||
num_tokens_across_dp,
|
|
||||||
cudagraph_stats,
|
cascade_attn_prefix_lens = None
|
||||||
) = self._determine_batch_execution_and_padding(
|
# Disable cascade attention when using microbatching (DBO)
|
||||||
|
if self.cascade_attn_enabled and not self.parallel_config.use_ubatching:
|
||||||
|
# Pre-compute cascade attention prefix lengths
|
||||||
|
cascade_attn_prefix_lens = self._compute_cascade_attn_prefix_lens(
|
||||||
|
num_scheduled_tokens_np,
|
||||||
|
self.input_batch.num_computed_tokens_cpu[:num_reqs],
|
||||||
|
scheduler_output.num_common_prefix_blocks,
|
||||||
|
)
|
||||||
|
|
||||||
|
(
|
||||||
|
cudagraph_mode,
|
||||||
|
batch_desc,
|
||||||
|
should_ubatch,
|
||||||
|
num_tokens_across_dp,
|
||||||
|
cudagraph_stats,
|
||||||
|
) = self._determine_batch_execution_and_padding(
|
||||||
|
num_tokens=num_tokens_unpadded,
|
||||||
|
num_reqs=num_reqs,
|
||||||
|
num_scheduled_tokens_np=num_scheduled_tokens_np,
|
||||||
|
max_num_scheduled_tokens=max_num_scheduled_tokens,
|
||||||
|
use_cascade_attn=cascade_attn_prefix_lens is not None,
|
||||||
|
num_encoder_reqs=len(scheduler_output.scheduled_encoder_inputs),
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.debug(
|
||||||
|
"Running batch with cudagraph_mode: %s, batch_descriptor: %s, "
|
||||||
|
"should_ubatch: %s, num_tokens_across_dp: %s",
|
||||||
|
cudagraph_mode,
|
||||||
|
batch_desc,
|
||||||
|
should_ubatch,
|
||||||
|
num_tokens_across_dp,
|
||||||
|
)
|
||||||
|
|
||||||
|
num_tokens_padded = batch_desc.num_tokens
|
||||||
|
num_reqs_padded = (
|
||||||
|
batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs
|
||||||
|
)
|
||||||
|
ubatch_slices, ubatch_slices_padded = maybe_create_ubatch_slices(
|
||||||
|
should_ubatch,
|
||||||
|
num_scheduled_tokens_np,
|
||||||
|
num_tokens_padded,
|
||||||
|
num_reqs_padded,
|
||||||
|
self.parallel_config.num_ubatches,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.debug(
|
||||||
|
"ubatch_slices: %s, ubatch_slices_padded: %s",
|
||||||
|
ubatch_slices,
|
||||||
|
ubatch_slices_padded,
|
||||||
|
)
|
||||||
|
|
||||||
|
pad_attn = cudagraph_mode == CUDAGraphMode.FULL
|
||||||
|
|
||||||
|
use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0
|
||||||
|
ubatch_slices_attn = ubatch_slices_padded if pad_attn else ubatch_slices
|
||||||
|
|
||||||
|
attn_metadata, spec_decode_common_attn_metadata = (
|
||||||
|
self._build_attention_metadata(
|
||||||
num_tokens=num_tokens_unpadded,
|
num_tokens=num_tokens_unpadded,
|
||||||
|
num_tokens_padded=num_tokens_padded if pad_attn else None,
|
||||||
num_reqs=num_reqs,
|
num_reqs=num_reqs,
|
||||||
num_scheduled_tokens_np=num_scheduled_tokens_np,
|
num_reqs_padded=num_reqs_padded if pad_attn else None,
|
||||||
max_num_scheduled_tokens=max_num_scheduled_tokens,
|
max_query_len=max_num_scheduled_tokens,
|
||||||
use_cascade_attn=cascade_attn_prefix_lens is not None,
|
ubatch_slices=ubatch_slices_attn,
|
||||||
num_encoder_reqs=len(scheduler_output.scheduled_encoder_inputs),
|
logits_indices=logits_indices,
|
||||||
)
|
use_spec_decode=use_spec_decode,
|
||||||
|
num_scheduled_tokens=scheduler_output.num_scheduled_tokens,
|
||||||
logger.debug(
|
cascade_attn_prefix_lens=cascade_attn_prefix_lens,
|
||||||
"Running batch with cudagraph_mode: %s, batch_descriptor: %s, "
|
|
||||||
"should_ubatch: %s, num_tokens_across_dp: %s",
|
|
||||||
cudagraph_mode,
|
|
||||||
batch_desc,
|
|
||||||
should_ubatch,
|
|
||||||
num_tokens_across_dp,
|
|
||||||
)
|
|
||||||
|
|
||||||
num_tokens_padded = batch_desc.num_tokens
|
|
||||||
num_reqs_padded = (
|
|
||||||
batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs
|
|
||||||
)
|
|
||||||
ubatch_slices, ubatch_slices_padded = maybe_create_ubatch_slices(
|
|
||||||
should_ubatch,
|
|
||||||
num_scheduled_tokens_np,
|
|
||||||
num_tokens_padded,
|
|
||||||
num_reqs_padded,
|
|
||||||
self.parallel_config.num_ubatches,
|
|
||||||
)
|
|
||||||
|
|
||||||
logger.debug(
|
|
||||||
"ubatch_slices: %s, ubatch_slices_padded: %s",
|
|
||||||
ubatch_slices,
|
|
||||||
ubatch_slices_padded,
|
|
||||||
)
|
|
||||||
|
|
||||||
pad_attn = cudagraph_mode == CUDAGraphMode.FULL
|
|
||||||
|
|
||||||
use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0
|
|
||||||
ubatch_slices_attn = ubatch_slices_padded if pad_attn else ubatch_slices
|
|
||||||
|
|
||||||
(attn_metadata, spec_decode_common_attn_metadata) = (
|
|
||||||
self._build_attention_metadata(
|
|
||||||
num_tokens=num_tokens_unpadded,
|
|
||||||
num_tokens_padded=num_tokens_padded if pad_attn else None,
|
|
||||||
num_reqs=num_reqs,
|
|
||||||
num_reqs_padded=num_reqs_padded if pad_attn else None,
|
|
||||||
max_query_len=max_num_scheduled_tokens,
|
|
||||||
ubatch_slices=ubatch_slices_attn,
|
|
||||||
logits_indices=logits_indices,
|
|
||||||
use_spec_decode=use_spec_decode,
|
|
||||||
num_scheduled_tokens=scheduler_output.num_scheduled_tokens,
|
|
||||||
cascade_attn_prefix_lens=cascade_attn_prefix_lens,
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
|
||||||
(
|
(
|
||||||
input_ids,
|
input_ids,
|
||||||
|
|||||||
Reference in New Issue
Block a user