diff --git a/tests/models/test_utils.py b/tests/models/test_utils.py index 7cc4ee3c1..3d719940e 100644 --- a/tests/models/test_utils.py +++ b/tests/models/test_utils.py @@ -4,9 +4,11 @@ import pytest import torch -from vllm.model_executor.models.utils import AutoWeightsLoader - -pytestmark = pytest.mark.cpu_test +from vllm.model_executor.models.utils import ( + AutoWeightsLoader, + _merge_multimodal_embeddings, +) +from vllm.platforms import current_platform class ModuleWithBatchNorm(torch.nn.Module): @@ -27,6 +29,7 @@ class ModuleWithNestedBatchNorm(torch.nn.Module): return self.nested_mod(x) +@pytest.mark.cpu_test def test_module_with_batchnorm_can_load(): """Ensure the auto weight loader can load batchnorm stats.""" mod = ModuleWithBatchNorm() @@ -52,6 +55,7 @@ def test_module_with_batchnorm_can_load(): assert new_mod.bn.num_batches_tracked.item() == 1 +@pytest.mark.cpu_test def test_module_with_child_containing_batchnorm_can_autoload(): """Ensure the auto weight loader can load nested modules batchnorm stats.""" mod = ModuleWithNestedBatchNorm() @@ -83,6 +87,7 @@ def test_module_with_child_containing_batchnorm_can_autoload(): assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1 +@pytest.mark.cpu_test def test_module_skip_prefix(): """Ensure the auto weight loader can skip prefix.""" mod = ModuleWithNestedBatchNorm() @@ -119,6 +124,7 @@ def test_module_skip_prefix(): assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1 +@pytest.mark.cpu_test def test_module_skip_substr(): """Ensure the auto weight loader can skip prefix.""" mod = ModuleWithNestedBatchNorm() @@ -155,3 +161,23 @@ def test_module_skip_substr(): ) assert torch.all(new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var) assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1 + + +class raise_if_cuda_sync: + def __enter__(self): + self.previous_debug_mode = torch.cuda.get_sync_debug_mode() + torch.cuda.set_sync_debug_mode("error") + + def __exit__(self, exception_type, exception_value, traceback): + torch.cuda.set_sync_debug_mode(self.previous_debug_mode) + + +@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda") +def test_merge_multimodal_embeddings_no_sync(): + inputs_embeds = torch.zeros([5, 10], dtype=torch.bfloat16, device="cuda:0") + multimodal_embeddings = [torch.ones([3, 10], dtype=torch.bfloat16, device="cuda:0")] + is_multimodal = torch.tensor([True, False, True, True, False], device="cpu") + with raise_if_cuda_sync(): + _merge_multimodal_embeddings( + inputs_embeds, multimodal_embeddings, is_multimodal + ) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index fa59931d0..df7170aab 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -362,7 +362,9 @@ class SupportsMultiModal(Protocol): # to ensure that any external configuration requiring offset tracking, # e.g., LoRA, are applied correctly regardless of whether or not # we have multimodal tokens. - in_vocab_ids = input_ids.masked_fill(is_multimodal, 0) + in_vocab_ids = input_ids.masked_fill( + is_multimodal.to(device=input_ids.device, non_blocking=True), 0 + ) return embed_input_ids(in_vocab_ids) return embed_input_ids(input_ids) diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 0c11a2109..90425417a 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -1215,7 +1215,6 @@ class NemotronH_Nano_VL_V2( These embeddings will replace the placeholder embeddings to create input_embeds for the LLM. """ - device = video_embeddings.device tokenizer = cached_tokenizer_from_config(self.model_config) # Generate video replacement token IDs using get_video_repl @@ -1234,10 +1233,10 @@ class NemotronH_Nano_VL_V2( ) # video_repl.full is a list of token IDs - repl_token_ids = torch.tensor(video_repl.full, device=device) + repl_token_ids = torch.tensor(video_repl.full) # Get embedding token IDs for image context (use pre-tokenized version) - embed_token_ids = torch.tensor(self._img_context_token_ids, device=device) + embed_token_ids = torch.tensor(self._img_context_token_ids) # Create mask for video embedding positions is_video_embed = torch.isin(repl_token_ids, embed_token_ids) diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index 8e106baec..7ce7dc831 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -211,15 +211,12 @@ def merge_interleaved_embeddings( # Scatter each modality to its positions if video_embeds: - video_positions = is_video.nonzero(as_tuple=True)[0] - inputs_embeds[video_positions] = torch.cat(video_embeds, dim=0) + inputs_embeds[is_video] = torch.cat(video_embeds, dim=0) if audio_embeds: - audio_positions = is_audio.nonzero(as_tuple=True)[0] - inputs_embeds[audio_positions] = torch.cat(audio_embeds, dim=0) + inputs_embeds[is_audio] = torch.cat(audio_embeds, dim=0) if other_embeds: other_mask = is_multimodal & ~is_video & ~is_audio - other_positions = other_mask.nonzero(as_tuple=True)[0] - inputs_embeds[other_positions] = torch.cat(other_embeds, dim=0) + inputs_embeds[other_mask] = torch.cat(other_embeds, dim=0) return inputs_embeds @@ -1457,8 +1454,9 @@ class Qwen2_5OmniThinkerForConditionalGeneration( video_token_id = self.config.video_token_index audio_token_id = self.config.audio_token_index - is_video = is_multimodal & (input_ids == video_token_id) - is_audio = is_multimodal & (input_ids == audio_token_id) + input_ids_cpu = input_ids.cpu() + is_video = is_multimodal & (input_ids_cpu == video_token_id) + is_audio = is_multimodal & (input_ids_cpu == audio_token_id) num_video = is_video.sum().item() num_audio = is_audio.sum().item() diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index d9bc02c65..5e113dac5 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -1869,8 +1869,9 @@ class Qwen3OmniMoeThinkerForConditionalGeneration( # both the deepstack path and the final embedding merge. video_token_id = self.config.video_token_id audio_token_id = self.config.audio_token_id - is_video = is_multimodal & (input_ids == video_token_id) - is_audio = is_multimodal & (input_ids == audio_token_id) + input_ids_cpu = input_ids.cpu() + is_video = is_multimodal & (input_ids_cpu == video_token_id) + is_audio = is_multimodal & (input_ids_cpu == audio_token_id) num_video = is_video.sum().item() num_audio = is_audio.sum().item() diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 418b75e38..431371c12 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -1977,7 +1977,6 @@ class Qwen3VLForConditionalGeneration( These embeddings will replace the placeholder embeddings to create input_embeds for the LLM. """ - device = video_embeddings.device # Generate video replacement token IDs using get_video_repl # This tokenizes each frame separator independently, then uses pre-tokenized @@ -1993,8 +1992,10 @@ class Qwen3VLForConditionalGeneration( select_token_id=self.is_multimodal_pruning_enabled, ) - repl_token_ids = torch.tensor(video_repl.full, device=device) - embed_token_id = _cached_tensor(self.config.video_token_id, device=device) + repl_token_ids = torch.tensor(video_repl.full) + embed_token_id = _cached_tensor( + self.config.video_token_id, repl_token_ids.device + ) is_video_embed = torch.isin(repl_token_ids, embed_token_id) # Get text embeddings for indicator tokens (has only `visual_dim``). diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 8abaa557f..1fd18b547 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -468,14 +468,8 @@ def _merge_multimodal_embeddings( input_dtype = inputs_embeds.dtype try: - # For debugging - # inputs_embeds[is_multimodal] = mm_embeds_flat.to(dtype=input_dtype) - - # NOTE: This can avoid D2H sync (#22105), but fails to - # raise an error if is_multimodal.sum() < len(mm_embeds_flat) - inputs_embeds.masked_scatter_( - is_multimodal.unsqueeze(-1), mm_embeds_flat.to(dtype=input_dtype) - ) + # If is_multimodal is on CPU this avoids a D2H sync + inputs_embeds[is_multimodal] = mm_embeds_flat.to(dtype=input_dtype) except RuntimeError as e: num_actual_tokens = len(mm_embeds_flat) num_expected_tokens = is_multimodal.sum().item() @@ -488,7 +482,7 @@ def _merge_multimodal_embeddings( f"multimodal tokens to {num_expected_tokens} placeholders" ) from e - raise ValueError("Error during masked scatter operation") from e + raise ValueError("Error during index put operation") from e return inputs_embeds diff --git a/vllm/v1/worker/gpu/mm/encoder_runner.py b/vllm/v1/worker/gpu/mm/encoder_runner.py index fb2a21ce4..1000dbe05 100644 --- a/vllm/v1/worker/gpu/mm/encoder_runner.py +++ b/vllm/v1/worker/gpu/mm/encoder_runner.py @@ -83,7 +83,7 @@ class EncoderRunner: mm_embeds: list[torch.Tensor] = [] is_mm_embed = torch.zeros( - total_num_scheduled_tokens, dtype=torch.bool, device="cpu", pin_memory=True + total_num_scheduled_tokens, dtype=torch.bool, device="cpu" ) for i, req_id in enumerate(req_ids): if not is_prefilling[i]: @@ -131,8 +131,6 @@ class EncoderRunner: ) mm_embeds.append(mm_embeds_item) - # Copy the is_mm_embed tensor to the GPU. - is_mm_embed = is_mm_embed.to(device=self.device, non_blocking=True) return mm_embeds, is_mm_embed @torch.inference_mode() diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 02450ef4e..979ff8d33 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -719,16 +719,6 @@ class GPUModelRunner( self.max_num_reqs, dtype=torch.int32 ) - # Only relevant for multimodal models - if self.supports_mm_inputs: - # Double buffer to avoid race condition: previous iteration's async - # copy may still be reading from CPU while current iteration writes. - self.is_mm_embed_buffers = [ - self._make_buffer(self.max_num_tokens, dtype=torch.bool), - self._make_buffer(self.max_num_tokens, dtype=torch.bool), - ] - self.is_mm_embed_idx = 0 - # Only relevant for models using M-RoPE (e.g, Qwen2-VL) if self.uses_mrope: # NOTE: `mrope_positions` is implemented with one additional dummy @@ -2910,14 +2900,10 @@ class GPUModelRunner( ) -> tuple[list[torch.Tensor], torch.Tensor]: total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens - # Swap to the other buffer to avoid race condition with previous - # iteration's async copy that may still be reading from CPU. - self.is_mm_embed_idx = 1 - self.is_mm_embed_idx - is_mm_embed_buf = self.is_mm_embed_buffers[self.is_mm_embed_idx] - mm_embeds = list[torch.Tensor]() - is_mm_embed = is_mm_embed_buf.cpu - is_mm_embed[:total_num_scheduled_tokens] = False + is_mm_embed = torch.zeros( + total_num_scheduled_tokens, dtype=torch.bool, device="cpu" + ) req_start_idx = 0 should_sync_mrope_positions = False @@ -3000,8 +2986,6 @@ class GPUModelRunner( mm_embeds.extend(mm_embeds_req) req_start_idx += num_scheduled_tokens - is_mm_embed = is_mm_embed_buf.copy_to_gpu(total_num_scheduled_tokens) - if should_sync_mrope_positions: self._calc_mrope_positions(scheduler_output) self.mrope_positions.copy_to_gpu(total_num_scheduled_tokens)