[Bugfix] Fix RoBERTa position_ids accumulation on CUDA graph padding (#37884)
This commit is contained in:
@@ -79,7 +79,14 @@ class RobertaEmbedding(nn.Module):
|
|||||||
if inputs_embeds is None:
|
if inputs_embeds is None:
|
||||||
inputs_embeds = self.word_embeddings(input_ids)
|
inputs_embeds = self.word_embeddings(input_ids)
|
||||||
|
|
||||||
position_embeddings = self.position_embeddings(position_ids)
|
# RoBERTa positions start at padding_idx + 1 instead of 0.
|
||||||
|
# Use non-in-place add to avoid mutating the persistent positions
|
||||||
|
# buffer -- in-place += would accumulate on CUDA graph padding
|
||||||
|
# slots that aren't refreshed between requests, eventually
|
||||||
|
# overflowing max_position_embeddings.
|
||||||
|
position_embeddings = self.position_embeddings(
|
||||||
|
position_ids + self.padding_idx + 1
|
||||||
|
)
|
||||||
|
|
||||||
token_type_embeddings = self.token_type_embeddings(token_type_ids)
|
token_type_embeddings = self.token_type_embeddings(token_type_ids)
|
||||||
embeddings = inputs_embeds + token_type_embeddings + position_embeddings
|
embeddings = inputs_embeds + token_type_embeddings + position_embeddings
|
||||||
@@ -123,13 +130,6 @@ class RobertaEmbeddingModel(BertEmbeddingModel):
|
|||||||
intermediate_tensors: IntermediateTensors | None = None,
|
intermediate_tensors: IntermediateTensors | None = None,
|
||||||
inputs_embeds: torch.Tensor | None = None,
|
inputs_embeds: torch.Tensor | None = None,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
# Fix Roberta positions here outside of the CUDA graph.
|
|
||||||
# Because we need the to extract the sequences from
|
|
||||||
# input_ids the control flow is data dependent.
|
|
||||||
replace_roberta_positions(
|
|
||||||
input_ids=input_ids, position_ids=positions, padding_idx=self.padding_idx
|
|
||||||
)
|
|
||||||
|
|
||||||
return self.model(
|
return self.model(
|
||||||
input_ids=input_ids,
|
input_ids=input_ids,
|
||||||
positions=positions,
|
positions=positions,
|
||||||
@@ -324,9 +324,6 @@ class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding):
|
|||||||
inputs_embeds: torch.Tensor | None = None,
|
inputs_embeds: torch.Tensor | None = None,
|
||||||
token_type_ids: torch.Tensor | None = None,
|
token_type_ids: torch.Tensor | None = None,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
replace_roberta_positions(
|
|
||||||
input_ids=input_ids, position_ids=positions, padding_idx=self.padding_idx
|
|
||||||
)
|
|
||||||
if token_type_ids is not None:
|
if token_type_ids is not None:
|
||||||
assert self.roberta.config.vocab_size < (1 << TOKEN_TYPE_SHIFT)
|
assert self.roberta.config.vocab_size < (1 << TOKEN_TYPE_SHIFT)
|
||||||
assert input_ids is not None
|
assert input_ids is not None
|
||||||
@@ -337,16 +334,3 @@ class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding):
|
|||||||
inputs_embeds=inputs_embeds,
|
inputs_embeds=inputs_embeds,
|
||||||
intermediate_tensors=intermediate_tensors,
|
intermediate_tensors=intermediate_tensors,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def replace_roberta_positions(
|
|
||||||
input_ids: torch.Tensor, position_ids: torch.Tensor, padding_idx: int
|
|
||||||
) -> None:
|
|
||||||
# Replace position ids because in RoBERTa models
|
|
||||||
# they have to start at padding_idx + 1 and ignore
|
|
||||||
# existing padding tokens
|
|
||||||
# References:
|
|
||||||
# - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L133
|
|
||||||
# - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L1669
|
|
||||||
# vllm does not use padding tokens, let's make things simpler
|
|
||||||
position_ids += padding_idx + 1
|
|
||||||
|
|||||||
@@ -65,8 +65,10 @@ class LegacyMixin:
|
|||||||
inputs_embeds: torch.Tensor | None = None,
|
inputs_embeds: torch.Tensor | None = None,
|
||||||
) -> torch.Tensor | IntermediateTensors:
|
) -> torch.Tensor | IntermediateTensors:
|
||||||
if self.is_roberta:
|
if self.is_roberta:
|
||||||
# RoBERTa-specific positions padding
|
# RoBERTa positions start at padding_idx + 1.
|
||||||
positions += self.padding_idx + 1
|
# Non-in-place add to avoid mutating the persistent GPU buffer --
|
||||||
|
# in-place += would accumulate on CUDA graph padding slots.
|
||||||
|
positions = positions + self.padding_idx + 1
|
||||||
return super().forward(
|
return super().forward(
|
||||||
input_ids=input_ids,
|
input_ids=input_ids,
|
||||||
positions=positions,
|
positions=positions,
|
||||||
|
|||||||
Reference in New Issue
Block a user