[Bugfix][CI] fix typos (#34934)
Signed-off-by: 1195343015 <1195343015@qq.com>
Signed-off-by: Jiayi Yan <66017932+1195343015@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
@@ -174,7 +174,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]
 query_start_loc = query_start_loc[: num_decodes + 1]
 block_table_tensor = block_table_tensor[:num_decodes]

-sheduler_metadata = ops.cpu_attn_get_scheduler_metadata(
+scheduler_metadata = ops.cpu_attn_get_scheduler_metadata(
     num_reqs=num_reqs,
     num_heads=self.num_heads,
     num_kv_heads=self.num_kv_heads,
@@ -197,7 +197,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]
     seq_lens=seq_lens,
     block_table=block_table_tensor,
     slot_mapping=slot_mapping,
-    scheduler_metadata=sheduler_metadata,
+    scheduler_metadata=scheduler_metadata,
     causal=causal,
     use_sdpa_prefill=self.use_sdpa_prefill,
     num_decode_tokens=num_decode_tokens,
@@ -383,7 +383,7 @@ class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC):
 # Return a tensor of shape (#requests, #max blocks)
 state_indices_tensor = common_attn_metadata.block_table_tensor
-# Additional cache-related varaiables:
+# Additional cache-related variables:
 mamba_block_size = self.kv_cache_spec.block_size
 (
     block_idx_last_computed_token,
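The hunk above treats the block table as a (num_requests, max_blocks) tensor and reads a per-layer mamba_block_size. A standalone sketch of how a last-computed-token position could map to a block index under those shapes follows; the arithmetic and names other than the ones in the hunk are illustrative assumptions, not the vLLM code path.

import torch

num_requests, max_blocks, mamba_block_size = 3, 8, 16
# Stand-in for common_attn_metadata.block_table_tensor: one row of block ids per request.
state_indices_tensor = torch.arange(num_requests * max_blocks).reshape(num_requests, max_blocks)

# Illustrative assumption: the block holding a request's last computed token
# is floor(token_index / block_size).
last_computed_token_idx = torch.tensor([0, 17, 40])
block_idx_last_computed_token = last_computed_token_idx // mamba_block_size
print(block_idx_last_computed_token.tolist())  # [0, 1, 2]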
@@ -49,14 +49,14 @@ if TYPE_CHECKING:
 logger = init_logger(__name__)

-# For FP8 sparse attention we have two impelementations:
+# For FP8 sparse attention we have two implementations:
 # 1. Mixed batch mode: use the FP8 decode kernel for both prefill and decode this is
 #    done by treating all tokens as single batch.
 # 2. Separate prefill and decode mode: use the BF16 prefill kernel for prefill
 #    (upconverting the FP8 cache to BF16 then calling the prefill kernel) and using
 #    the FP8 decode kernel for decode.
 # Currently we use #1 when the number of heads per rank is low (i.e. TP) since the BF16
-# prefill kernel requires padding the numer of heads to 128 while the decode does not
+# prefill kernel requires padding the number of heads to 128 while the decode does not
 # so when the per ranke head count is below MIN_HEADS_FOR_BF16_PREFILL we use the mixed
 # batch mode (#2).
 MIN_HEADS_FOR_BF16_PREFILL = 32
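The comment block in this hunk describes a head-count-based dispatch between the two FP8 sparse attention modes. A minimal sketch of that decision follows; the helper name and call pattern are hypothetical and not taken from this diff, only the MIN_HEADS_FOR_BF16_PREFILL = 32 constant is.

MIN_HEADS_FOR_BF16_PREFILL = 32  # value from the hunk above

def use_mixed_batch_mode(num_heads_per_rank: int) -> bool:
    # Hypothetical helper: with few heads per rank (e.g. under high TP),
    # padding heads up to 128 for the BF16 prefill kernel is wasteful,
    # so route everything through the FP8 decode kernel (mixed batch mode).
    return num_heads_per_rank < MIN_HEADS_FOR_BF16_PREFILL

# Example: 128 total heads sharded over TP=8 ranks -> 16 heads per rank.
print(use_mixed_batch_mode(128 // 8))  # True -> mixed batch mode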
@@ -126,7 +126,7 @@ class FlashMLASparseBackend(AttentionBackend):
     cache_dtype_str: str = "auto",
 ) -> tuple[int, ...]:
     if cache_dtype_str == "fp8_ds_mla":
-        # custom storage fromat is 656 bytes
+        # custom storage format is 656 bytes
         # see FlashMLA readme.md for details
         return (num_blocks, block_size, 656)
     else:
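The 656-byte figure in the hunk above is asserted, not derived, in the diff. One plausible accounting is sketched below; treat the exact split as an assumption about the fp8_ds_mla entry layout rather than a fact recorded by this commit.

# Assumed per-token entry layout for the fp8_ds_mla cache (illustrative only;
# the diff itself only records the 656-byte total):
kv_lora_rank = 512       # latent dims stored as FP8 -> 1 byte each
scale_bytes = 16         # per-tile FP32 scales
qk_rope_head_dim = 64    # rope dims kept in BF16 -> 2 bytes each

entry_bytes = kv_lora_rank * 1 + scale_bytes + qk_rope_head_dim * 2
print(entry_bytes)  # 656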
@@ -370,7 +370,7 @@ class AiterFlashAttentionMetadata:
 slot_mapping: torch.Tensor
 block_table: torch.Tensor

-# prefill and deocde split
+# prefill and decode split
 num_decodes: int
 num_decode_tokens: int
 num_prefills: int
@@ -1099,7 +1099,7 @@ class AiterFlashAttentionImpl(AttentionImpl):
 extend_tokens_slice = slice(
     num_decode_tokens, num_decode_tokens + num_extend_tokens
 )
-extend_querys = query[extend_tokens_slice]
+extend_queries = query[extend_tokens_slice]
 extend_keys = key[extend_tokens_slice]
 extend_values = value[extend_tokens_slice]
 extend_outputs = output[extend_tokens_slice]
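The renamed extend_queries sits in code that splits a packed token batch into decode and extend (prefill) parts by slicing. A self-contained sketch of that pattern with made-up shapes follows; none of the surrounding vLLM metadata is modeled.

import torch

num_decode_tokens, num_extend_tokens, num_heads, head_dim = 4, 12, 8, 64
query = torch.randn(num_decode_tokens + num_extend_tokens, num_heads, head_dim)

# Decode tokens are packed first, extend (prefill) tokens follow.
decode_queries = query[:num_decode_tokens]
extend_tokens_slice = slice(num_decode_tokens, num_decode_tokens + num_extend_tokens)
extend_queries = query[extend_tokens_slice]

print(decode_queries.shape)  # torch.Size([4, 8, 64])
print(extend_queries.shape)  # torch.Size([12, 8, 64])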
@@ -1110,7 +1110,7 @@ class AiterFlashAttentionImpl(AttentionImpl):
     v_scale = attn_metadata.v_scale
     self.extend_forward(
         attn_metadata=attn_metadata,
-        query=extend_querys,
+        query=extend_queries,
         key=extend_keys,
         value=extend_values,
         key_cache=key_cache,