[V1] Use input_ids as input for text-only models (#11032)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
@@ -61,6 +61,7 @@ class GPUModelRunner:
|
|||||||
self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[
|
self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[
|
||||||
cache_config.cache_dtype]
|
cache_config.cache_dtype]
|
||||||
|
|
||||||
|
self.is_multimodal_model = model_config.is_multimodal_model
|
||||||
self.sliding_window = model_config.get_sliding_window()
|
self.sliding_window = model_config.get_sliding_window()
|
||||||
self.block_size = cache_config.block_size
|
self.block_size = cache_config.block_size
|
||||||
self.max_model_len = model_config.max_model_len
|
self.max_model_len = model_config.max_model_len
|
||||||
@@ -103,6 +104,11 @@ class GPUModelRunner:
|
|||||||
# The batch sizes in the config are in descending order.
|
# The batch sizes in the config are in descending order.
|
||||||
self.cudagraph_batch_sizes = list(
|
self.cudagraph_batch_sizes = list(
|
||||||
reversed(self.vllm_config.compilation_config.capture_sizes))
|
reversed(self.vllm_config.compilation_config.capture_sizes))
|
||||||
|
|
||||||
|
# Persistent buffers for CUDA graphs.
|
||||||
|
self.input_ids = torch.zeros(self.max_num_tokens,
|
||||||
|
dtype=torch.int32,
|
||||||
|
device=self.device)
|
||||||
self.positions = torch.zeros(self.max_num_tokens,
|
self.positions = torch.zeros(self.max_num_tokens,
|
||||||
dtype=torch.int64,
|
dtype=torch.int64,
|
||||||
device=self.device)
|
device=self.device)
|
||||||
@@ -310,7 +316,8 @@ class GPUModelRunner:
|
|||||||
seq_start_loc_np[0] = 0
|
seq_start_loc_np[0] = 0
|
||||||
np.cumsum(seq_lens, out=seq_start_loc_np[1:])
|
np.cumsum(seq_lens, out=seq_start_loc_np[1:])
|
||||||
|
|
||||||
input_ids = input_ids.to(self.device, non_blocking=True)
|
self.input_ids[:total_num_scheduled_tokens].copy_(input_ids,
|
||||||
|
non_blocking=True)
|
||||||
self.positions[:total_num_scheduled_tokens].copy_(positions,
|
self.positions[:total_num_scheduled_tokens].copy_(positions,
|
||||||
non_blocking=True)
|
non_blocking=True)
|
||||||
query_start_loc = query_start_loc.to(self.device, non_blocking=True)
|
query_start_loc = query_start_loc.to(self.device, non_blocking=True)
|
||||||
@@ -331,7 +338,7 @@ class GPUModelRunner:
|
|||||||
# token from the partial request.
|
# token from the partial request.
|
||||||
# TODO: Support prompt logprobs.
|
# TODO: Support prompt logprobs.
|
||||||
logits_indices = query_start_loc[1:] - 1
|
logits_indices = query_start_loc[1:] - 1
|
||||||
return input_ids, attn_metadata, logits_indices
|
return attn_metadata, logits_indices
|
||||||
|
|
||||||
def _prepare_sampling(
|
def _prepare_sampling(
|
||||||
self,
|
self,
|
||||||
@@ -427,13 +434,15 @@ class GPUModelRunner:
|
|||||||
) -> ModelRunnerOutput:
|
) -> ModelRunnerOutput:
|
||||||
self._update_states(scheduler_output)
|
self._update_states(scheduler_output)
|
||||||
|
|
||||||
# Run the encoder.
|
if self.is_multimodal_model:
|
||||||
|
# Run the multimodal encoder if any.
|
||||||
self._execute_encoder(scheduler_output)
|
self._execute_encoder(scheduler_output)
|
||||||
encoder_outputs = self._gather_encoder_outputs(scheduler_output)
|
encoder_outputs = self._gather_encoder_outputs(scheduler_output)
|
||||||
|
else:
|
||||||
|
encoder_outputs = []
|
||||||
|
|
||||||
# Prepare the decoder inputs.
|
# Prepare the decoder inputs.
|
||||||
input_ids, attn_metadata, logits_indices = self._prepare_inputs(
|
attn_metadata, logits_indices = self._prepare_inputs(scheduler_output)
|
||||||
scheduler_output)
|
|
||||||
num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
|
num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
|
||||||
if (self.use_cuda_graph
|
if (self.use_cuda_graph
|
||||||
and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]):
|
and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]):
|
||||||
@@ -444,29 +453,39 @@ class GPUModelRunner:
|
|||||||
else:
|
else:
|
||||||
# Eager mode.
|
# Eager mode.
|
||||||
num_input_tokens = num_scheduled_tokens
|
num_input_tokens = num_scheduled_tokens
|
||||||
|
|
||||||
attn_metadata.num_input_tokens = num_input_tokens
|
attn_metadata.num_input_tokens = num_input_tokens
|
||||||
|
|
||||||
# Get the inputs embeds.
|
if self.is_multimodal_model:
|
||||||
|
# NOTE(woosuk): To unify token ids and soft tokens (vision
|
||||||
|
# embeddings), we always use embeddings (rather than token ids)
|
||||||
|
# as input to the multimodal model, even when the input is text.
|
||||||
|
input_ids = self.input_ids[:num_scheduled_tokens]
|
||||||
if encoder_outputs:
|
if encoder_outputs:
|
||||||
inputs_embeds = self.model.get_input_embeddings(
|
inputs_embeds = self.model.get_input_embeddings(
|
||||||
input_ids, encoder_outputs)
|
input_ids, encoder_outputs)
|
||||||
else:
|
else:
|
||||||
inputs_embeds = self.model.get_input_embeddings(input_ids)
|
inputs_embeds = self.model.get_input_embeddings(input_ids)
|
||||||
# NOTE(woosuk): To unify token ids and soft tokens (vision embeddings),
|
|
||||||
# always use embeddings (rather than token ids) as input to the model.
|
|
||||||
# TODO(woosuk): Avoid the copy. Optimize.
|
# TODO(woosuk): Avoid the copy. Optimize.
|
||||||
self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds)
|
self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds)
|
||||||
|
inputs_embeds = self.inputs_embeds[:num_input_tokens]
|
||||||
|
input_ids = None
|
||||||
|
else:
|
||||||
|
# For text-only models, we use token ids as input.
|
||||||
|
# While it is possible to use embeddings as input just like the
|
||||||
|
# multimodal models, it is not desirable for performance since
|
||||||
|
# then the embedding layer is not included in the CUDA graph.
|
||||||
|
input_ids = self.input_ids[:num_input_tokens]
|
||||||
|
inputs_embeds = None
|
||||||
|
|
||||||
# Run the decoder.
|
# Run the decoder.
|
||||||
# Use persistent buffers for CUDA graphs.
|
# Use persistent buffers for CUDA graphs.
|
||||||
with set_forward_context(attn_metadata, self.vllm_config):
|
with set_forward_context(attn_metadata, self.vllm_config):
|
||||||
hidden_states = self.model(
|
hidden_states = self.model(
|
||||||
input_ids=None,
|
input_ids=input_ids,
|
||||||
positions=self.positions[:num_input_tokens],
|
positions=self.positions[:num_input_tokens],
|
||||||
kv_caches=self.kv_caches,
|
kv_caches=self.kv_caches,
|
||||||
attn_metadata=None,
|
attn_metadata=None,
|
||||||
inputs_embeds=self.inputs_embeds[:num_input_tokens],
|
inputs_embeds=inputs_embeds,
|
||||||
)
|
)
|
||||||
hidden_states = hidden_states[:num_scheduled_tokens]
|
hidden_states = hidden_states[:num_scheduled_tokens]
|
||||||
hidden_states = hidden_states[logits_indices]
|
hidden_states = hidden_states[logits_indices]
|
||||||
@@ -534,13 +553,20 @@ class GPUModelRunner:
|
|||||||
num_tokens: int,
|
num_tokens: int,
|
||||||
kv_caches: List[torch.Tensor],
|
kv_caches: List[torch.Tensor],
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
|
if self.is_multimodal_model:
|
||||||
|
input_ids = None
|
||||||
|
inputs_embeds = self.inputs_embeds[:num_tokens]
|
||||||
|
else:
|
||||||
|
input_ids = self.input_ids[:num_tokens]
|
||||||
|
inputs_embeds = None
|
||||||
with set_forward_context(None, self.vllm_config):
|
with set_forward_context(None, self.vllm_config):
|
||||||
hidden_states = model(
|
hidden_states = model(
|
||||||
input_ids=None,
|
input_ids=input_ids,
|
||||||
positions=self.positions[:num_tokens],
|
positions=self.positions[:num_tokens],
|
||||||
kv_caches=kv_caches,
|
kv_caches=kv_caches,
|
||||||
attn_metadata=None,
|
attn_metadata=None,
|
||||||
inputs_embeds=self.inputs_embeds[:num_tokens])
|
inputs_embeds=inputs_embeds,
|
||||||
|
)
|
||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
def profile_run(self) -> None:
|
def profile_run(self) -> None:
|
||||||
|
|||||||
Reference in New Issue
Block a user