[Model Runner V2] Remove unnecessary copies in PW CUDA graph capture (#34849)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
This commit is contained in:
@@ -218,13 +218,11 @@ class CudaGraphManager:
 batch_descriptor=batch_descriptor,
 slot_mapping=slot_mappings,
 ):
-hidden_states = model(
+model(
 input_ids=input_ids,
 positions=positions,
 inputs_embeds=inputs_embeds,
 )
-assert self.hidden_states is not None
-self.hidden_states[:num_tokens] = hidden_states
 
 @torch.inference_mode()
 def capture(
Reference in New Issue
Block a user