[Models][Qwen3 ViT] Keep max_seqlen on CPU to prevent D2H sync (#37139)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
@@ -557,7 +557,6 @@ class Qwen3_VisionTransformer(nn.Module):
         max_seqlen = torch.tensor(
             MMEncoderAttention.compute_max_seqlen(self.attn_backend, cu_seqlens),
             dtype=torch.int32,
-            device=self.device,
         )
         cu_seqlens = MMEncoderAttention.maybe_recompute_cu_seqlens(
             self.attn_backend,
Reference in New Issue
Block a user