[Bugfix][CI] fix typos (#34934)

Signed-off-by: 1195343015 <1195343015@qq.com>
Signed-off-by: Jiayi Yan <66017932+1195343015@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Author: Jiayi Yan
Date: 2026-03-06 01:05:46 +08:00
Committed by: GitHub
Parent: 8c760b6ab6
Commit: 6a895197fa
98 changed files with 227 additions and 366 deletions

@@ -174,7 +174,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]
query_start_loc = query_start_loc[: num_decodes + 1]
block_table_tensor = block_table_tensor[:num_decodes]
- sheduler_metadata = ops.cpu_attn_get_scheduler_metadata(
+ scheduler_metadata = ops.cpu_attn_get_scheduler_metadata(
num_reqs=num_reqs,
num_heads=self.num_heads,
num_kv_heads=self.num_kv_heads,
@@ -197,7 +197,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]
seq_lens=seq_lens,
block_table=block_table_tensor,
slot_mapping=slot_mapping,
- scheduler_metadata=sheduler_metadata,
+ scheduler_metadata=scheduler_metadata,
causal=causal,
use_sdpa_prefill=self.use_sdpa_prefill,
num_decode_tokens=num_decode_tokens,

@@ -383,7 +383,7 @@ class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC):
# Return a tensor of shape (#requests, #max blocks)
state_indices_tensor = common_attn_metadata.block_table_tensor
- # Additional cache-related varaiables:
+ # Additional cache-related variables:
mamba_block_size = self.kv_cache_spec.block_size
(
block_idx_last_computed_token,

@@ -49,14 +49,14 @@ if TYPE_CHECKING:
logger = init_logger(__name__)
- # For FP8 sparse attention we have two impelementations:
+ # For FP8 sparse attention we have two implementations:
# 1. Mixed batch mode: use the FP8 decode kernel for both prefill and decode this is
# done by treating all tokens as single batch.
# 2. Separate prefill and decode mode: use the BF16 prefill kernel for prefill
# (upconverting the FP8 cache to BF16 then calling the prefill kernel) and using
# the FP8 decode kernel for decode.
# Currently we use #1 when the number of heads per rank is low (i.e. TP) since the BF16
- # prefill kernel requires padding the numer of heads to 128 while the decode does not
+ # prefill kernel requires padding the number of heads to 128 while the decode does not
# so when the per ranke head count is below MIN_HEADS_FOR_BF16_PREFILL we use the mixed
# batch mode (#2).
MIN_HEADS_FOR_BF16_PREFILL = 32
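
For illustration only (not part of this commit): a minimal sketch of the mode selection the comment above describes, assuming a hypothetical helper name and a caller that already knows its per-rank head count.

    # Hypothetical helper, not an identifier from this file.
    MIN_HEADS_FOR_BF16_PREFILL = 32  # value shown in the hunk above

    def use_mixed_batch_mode(num_heads_per_rank: int) -> bool:
        # Below the threshold, padding heads to 128 for the BF16 prefill
        # kernel would be wasteful, so all tokens go through the FP8
        # decode kernel as one mixed batch.
        return num_heads_per_rank < MIN_HEADS_FOR_BF16_PREFILL
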
@@ -126,7 +126,7 @@ class FlashMLASparseBackend(AttentionBackend):
cache_dtype_str: str = "auto",
) -> tuple[int, ...]:
if cache_dtype_str == "fp8_ds_mla":
- # custom storage fromat is 656 bytes
+ # custom storage format is 656 bytes
# see FlashMLA readme.md for details
return (num_blocks, block_size, 656)
else:
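
For illustration (an assumption-based sketch, not code from the diff): with the fp8_ds_mla layout above, each block stores block_size entries of 656 bytes, so the cache footprint follows directly from the returned shape.

    # Rough size arithmetic for the fp8_ds_mla layout; the block counts
    # below are made-up example values, not values from this commit.
    num_blocks, block_size = 1024, 64
    bytes_per_entry = 656  # custom storage format, per the comment above
    total_bytes = num_blocks * block_size * bytes_per_entry
    print(f"{total_bytes / 2**20:.1f} MiB")  # 41.0 MiB for these example values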

@@ -370,7 +370,7 @@ class AiterFlashAttentionMetadata:
slot_mapping: torch.Tensor
block_table: torch.Tensor
- # prefill and deocde split
+ # prefill and decode split
num_decodes: int
num_decode_tokens: int
num_prefills: int
@@ -1099,7 +1099,7 @@ class AiterFlashAttentionImpl(AttentionImpl):
extend_tokens_slice = slice(
num_decode_tokens, num_decode_tokens + num_extend_tokens
)
- extend_querys = query[extend_tokens_slice]
+ extend_queries = query[extend_tokens_slice]
extend_keys = key[extend_tokens_slice]
extend_values = value[extend_tokens_slice]
extend_outputs = output[extend_tokens_slice]
@@ -1110,7 +1110,7 @@ class AiterFlashAttentionImpl(AttentionImpl):
v_scale = attn_metadata.v_scale
self.extend_forward(
attn_metadata=attn_metadata,
- query=extend_querys,
+ query=extend_queries,
key=extend_keys,
value=extend_values,
key_cache=key_cache,
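
As a closing aside (a hedged sketch, not code from this commit): the hunks above assume a flattened batch laid out with decode tokens first and extend (prefill) tokens after them, which is why the extend slice starts at num_decode_tokens. The example values below are made up.

    import torch

    num_decode_tokens, num_extend_tokens = 4, 12  # example values only
    num_heads, head_size = 16, 64
    query = torch.randn(num_decode_tokens + num_extend_tokens, num_heads, head_size)

    # Same slicing pattern as in the hunk above: extend tokens sit
    # immediately after the decode tokens in the flattened batch.
    extend_tokens_slice = slice(
        num_decode_tokens, num_decode_tokens + num_extend_tokens
    )
    extend_queries = query[extend_tokens_slice]
    assert extend_queries.shape[0] == num_extend_tokens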