[Revert] Fix performance regression for GLM-4.7-GPTQ decode and MTP acceptance rate (#33771)
Signed-off-by: aabbccddwasd <aabbccddwasd@qq.com>
This commit is contained in:
@@ -919,9 +919,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
|
||||
# Guard access to seq_lens_cpu, which may not always be needed
|
||||
# and can be expensive to retrieve in async mode.
|
||||
needs_seq_lens_cpu = self.use_dcp or use_cascade or not is_only_trtllm_decode
|
||||
- seq_lens_cpu = (
|
||||
- common_attn_metadata.seq_lens.cpu() if needs_seq_lens_cpu else None
|
||||
- )
|
||||
+ seq_lens_cpu = common_attn_metadata.seq_lens_cpu if needs_seq_lens_cpu else None
|
||||
seq_lens_np = seq_lens_cpu.numpy() if seq_lens_cpu is not None else None
|
||||
num_blocks_np = (
|
||||
(seq_lens_np + (page_size - 1)) // page_size
|
||||
|
||||
Reference in New Issue
Block a user