[Perf] Optimize Preparing Inputs for GPU Model Runner (#16484)
Signed-off-by: snowcharm <snowcharmqq@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
@@ -484,14 +484,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         self.input_batch.block_table.commit(num_reqs)
 
         # Get the number of scheduled tokens for each request.
-        # TODO: The Python loop can be slow. Optimize.
-        num_scheduled_tokens = np.empty(num_reqs, dtype=np.int32)
-        max_num_scheduled_tokens = 0
-        for i, req_id in enumerate(self.input_batch.req_ids):
-            num_tokens = scheduler_output.num_scheduled_tokens[req_id]
-            num_scheduled_tokens[i] = num_tokens
-            max_num_scheduled_tokens = max(max_num_scheduled_tokens,
-                                           num_tokens)
+        req_ids = self.input_batch.req_ids
+        tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids]
+        num_scheduled_tokens = np.array(tokens, dtype=np.int32)
+        max_num_scheduled_tokens = max(tokens)
 
         # Get request indices.
         # E.g., [2, 5, 3] -> [0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
Reference in New Issue
Block a user