diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
index 98bde9d83..df2b713e4 100644
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@@ -56,7 +56,7 @@ def w8a8_block_matmul(
         Bs: The per-block quantization scale for `B`.
         block_size: The block size for per-block quantization. It should
         be 2-dim, e.g., [128, 128].
-        output_dytpe: The dtype of the returned tensor.
+        output_dtype: The dtype of the returned tensor.
 
     Returns:
         torch.Tensor: The result of matmul.
diff --git a/csrc/cpu/cpu_types_vxe.hpp b/csrc/cpu/cpu_types_vxe.hpp
index ab8cbbbf4..51bca37e6 100644
--- a/csrc/cpu/cpu_types_vxe.hpp
+++ b/csrc/cpu/cpu_types_vxe.hpp
@@ -12,7 +12,7 @@ namespace vec_op {
 #define vec_sub(a, b) ((a) - (b))
 #define vec_mul(a, b) ((a) * (b))
 #define vec_div(a, b) ((a) / (b))
-#define vec_sr(a, b) ((a) >> (b))  // Vector Shift Right Algebaic
+#define vec_sr(a, b) ((a) >> (b))  // Vector Shift Right Algebraic
 #define vec_sl(a, b) ((a) << (b))  // Vector Shift Left
 
 // FIXME: FP16 is not fully supported in Torch-CPU
diff --git a/csrc/cpu/sgl-kernels/moe.cpp b/csrc/cpu/sgl-kernels/moe.cpp
index beeccff78..94b24c2f1 100644
--- a/csrc/cpu/sgl-kernels/moe.cpp
+++ b/csrc/cpu/sgl-kernels/moe.cpp
@@ -215,7 +215,7 @@ int moe_align_block_size(
       offsets[mb + 1] = sorted_id_size(sorted_ids + mb * BLOCK_M);
     }
   });
-  // TODO: do we need to vecterize this ?
+  // TODO: do we need to vectorize this ?
   for (int mb = 0; mb < num_token_blocks; ++mb) {
     offsets[mb + 1] += offsets[mb];
   }
diff --git a/docs/design/multiprocessing.md b/docs/design/multiprocessing.md
index 247072d1c..6e92b20d2 100644
--- a/docs/design/multiprocessing.md
+++ b/docs/design/multiprocessing.md
@@ -8,7 +8,7 @@ page for information on known issues and how to solve them.
 ## Introduction
 
 !!! important
-    The source code references are to the state of the code at the time of writing in December, 2024.
+    The source code references are to the state of the code at the time of writing in December 2024.
 
 The use of Python multiprocessing in vLLM is complicated by:
 
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index d8cb208c4..78c768f92 100755
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -901,7 +901,7 @@ def _get_query_key_seq_metadata(
                 attn_metadata.encoder_seq_start_loc,
                 attn_metadata.max_encoder_seq_len)
     elif attn_type == AttentionType.ENCODER:
-        # For encoder attention both the query and the key are same i.e the
+        # For encoder attention both the query and the key are same i.e. the
         # encoder sequence.
         return (attn_metadata.encoder_seq_start_loc,
                 attn_metadata.max_encoder_seq_len,
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 32820b026..bf9e87198 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -551,7 +551,7 @@ class RandomDataset(BenchmarkDataset):
         [6880, 6881] -> ['Ġcalls', 'here'] ->
         [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
         To avoid uncontrolled change of the prompt length,
-        the encoded sequence is truncated before being decode again.
+        the encoded sequence is truncated before being decoded again.
         """
         # Build the inner sequence by sampling sequentially from the vocab
         inner_seq = ((offset + index + np.arange(input_len))
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index c8ecbd28e..4dcb1f3f1 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -242,7 +242,7 @@ def get_logits_processors(processors: Optional[LogitsProcessors],
     elif processors:
         raise ValueError(
             "The `logits_processors` argument is not supported by this "
-            "server. See --logits-processor-pattern engine argugment "
+            "server. See --logits-processor-pattern engine argument "
             "for more information.")
     return None
 
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py
index 04ebdbca8..1623a2fd5 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer2.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py
@@ -324,7 +324,7 @@ class MambaMixer2(MambaBase, CustomOp):
             # - the weight already has a "weight_loader" attribute
             #   which set_weight_attrs will raise if we do not
             #   delete before trying to override it
-            # - ditto for the otther two weights below
+            # - ditto for the other two weights below
             delattr(self.conv1d.bias, "weight_loader")
             set_weight_attrs(
                 self.conv1d.bias,
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 04176c558..9b2d84e32 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -1117,7 +1117,7 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
 
     def _process_multimodal_inputs(self, modalities: dict):
         # The result multimodal_embeddings is tuple of tensors, with each
-        # tensor correspoending to a multimodal data item (image or video).
+        # tensor corresponding to a multimodal data item (image or video).
         multimodal_embeddings: tuple[torch.Tensor, ...] = ()
 
         # NOTE: It is important to iterate over the keys in this dictionary
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index a17c8783d..6572e421b 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2659,7 +2659,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             num_tokens += num_pad
 
         # If cudagraph_mode.decode_mode() == FULL and
-        # cudagraph_mode.seperate_routine(). This means that we are using
+        # cudagraph_mode.separate_routine(). This means that we are using
         # different graphs and/or modes for mixed prefill-decode batches vs.
         # uniform decode batches. A uniform decode batch means that all
         # requests have identical query length, except a potential virtual
diff --git a/vllm/v1/worker/tpu_input_batch.py b/vllm/v1/worker/tpu_input_batch.py
index 81c798685..dfa54d0ad 100644
--- a/vllm/v1/worker/tpu_input_batch.py
+++ b/vllm/v1/worker/tpu_input_batch.py
@@ -392,7 +392,7 @@ class InputBatch:
         # NOTE: the following is unsafe
         # self.token_ids_cpu[i1, ...], self.token_ids_cpu[i2, ...], =\
         #     self.token_ids_cpu[i2, ...], self.token_ids_cpu[i1, ...]
-        # instead, we need to temporiarily copy the data for one of the indices
+        # instead, we need to temporarily copy the data for one of the indices
         # TODO(lucas): optimize this by only copying valid indices
         tmp = self.token_ids_cpu[i1, ...].copy()
         self.token_ids_cpu[i1, ...] = self.token_ids_cpu[i2, ...]