[Model Runner V2] Remove unused warmup_for_prefill method (#36762)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
This commit is contained in:
@@ -532,13 +532,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         )
         return cuda_graph_size
 
-    def warmup_for_prefill(self) -> None:
-        # For FlashInfer, we would like to execute a dummy prefill run
-        # to trigger JIT compilation.
-        if all("FLASHINFER" in b.get_name() for b in self.attn_backends.values()):
-            self._dummy_run(self.max_num_tokens, skip_attn=False)
-            torch.accelerator.synchronize()
-
     def finish_requests(self, scheduler_output: SchedulerOutput) -> None:
         finished_req_ids = scheduler_output.finished_req_ids
         preempted_req_ids = scheduler_output.preempted_req_ids
Reference in New Issue
Block a user