[Bugfix] Fix RoBERTa position_ids accumulation on CUDA graph padding (#37884)

Yufeng He
2026-03-23 23:15:12 +08:00
committed by GitHub
parent 7151ae6528
commit ec2280611a
2 changed files with 12 additions and 26 deletions

@@ -65,8 +65,10 @@ class LegacyMixin:
         inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor | IntermediateTensors:
         if self.is_roberta:
-            # RoBERTa-specific positions padding
-            positions += self.padding_idx + 1
+            # RoBERTa positions start at padding_idx + 1.
+            # Non-in-place add to avoid mutating the persistent GPU buffer --
+            # in-place += would accumulate on CUDA graph padding slots.
+            positions = positions + self.padding_idx + 1
         return super().forward(
             input_ids=input_ids,
             positions=positions,
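
Why the one-character change matters: under CUDA graph capture, the same persistent positions buffer is replayed on every decode step, so an in-place += keeps adding the offset to the padding slots on each replay instead of applying it once. Below is a minimal standalone sketch of the failure mode in plain PyTorch; padding_idx, forward_inplace, and forward_fixed are hypothetical names for illustration, and no CUDA graph is required since any forward pass that reuses a buffer reproduces the drift.

    import torch

    padding_idx = 1  # RoBERTa's pad token id; positions start at padding_idx + 1

    # Stand-in for the persistent input buffer that CUDA graph replay reuses
    # on every decode step.
    persistent_positions = torch.zeros(4, dtype=torch.long)

    def forward_inplace(positions: torch.Tensor) -> torch.Tensor:
        # Buggy: += writes through to the shared buffer.
        positions += padding_idx + 1
        return positions

    def forward_fixed(positions: torch.Tensor) -> torch.Tensor:
        # Fixed: out-of-place add returns a fresh tensor; the buffer is untouched.
        positions = positions + padding_idx + 1
        return positions

    for step in range(3):
        forward_inplace(persistent_positions)
        print(f"in-place step {step}: buffer = {persistent_positions.tolist()}")
    # Buffer drifts: [2, 2, 2, 2] -> [4, 4, 4, 4] -> [6, 6, 6, 6]

    persistent_positions.zero_()
    for step in range(3):
        out = forward_fixed(persistent_positions)
        print(f"fixed step {step}: buffer = {persistent_positions.tolist()}, "
              f"out = {out.tolist()}")
    # Buffer stays [0, 0, 0, 0]; out is [2, 2, 2, 2] on every step.

The out-of-place add costs one small temporary tensor per forward pass, which is the usual trade made to keep graph-captured input buffers read-only from the model's point of view.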