diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
index 98bde9d83..df2b713e4 100644
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@@ -56,7 +56,7 @@ def w8a8_block_matmul(
         Bs: The per-block quantization scale for `B`.
         block_size: The block size for per-block quantization. It should
         be 2-dim, e.g., [128, 128].
-        output_dytpe: The dtype of the returned tensor.
+        output_dtype: The dtype of the returned tensor.
 
     Returns:
         torch.Tensor: The result of matmul.
diff --git a/csrc/cpu/cpu_types_vxe.hpp b/csrc/cpu/cpu_types_vxe.hpp
index ab8cbbbf4..51bca37e6 100644
--- a/csrc/cpu/cpu_types_vxe.hpp
+++ b/csrc/cpu/cpu_types_vxe.hpp
@@ -12,7 +12,7 @@ namespace vec_op {
 #define vec_sub(a, b) ((a) - (b))
 #define vec_mul(a, b) ((a) * (b))
 #define vec_div(a, b) ((a) / (b))
-#define vec_sr(a, b) ((a) >> (b))  // Vector Shift Right Algebaic
+#define vec_sr(a, b) ((a) >> (b))  // Vector Shift Right Algebraic
 #define vec_sl(a, b) ((a) << (b))  // Vector Shift Left
 
 // FIXME: FP16 is not fully supported in Torch-CPU
diff --git a/csrc/cpu/sgl-kernels/moe.cpp b/csrc/cpu/sgl-kernels/moe.cpp
index beeccff78..94b24c2f1 100644
--- a/csrc/cpu/sgl-kernels/moe.cpp
+++ b/csrc/cpu/sgl-kernels/moe.cpp
@@ -215,7 +215,7 @@ int moe_align_block_size(
       offsets[mb + 1] = sorted_id_size(sorted_ids + mb * BLOCK_M);
     }
   });
-  // TODO: do we need to vecterize this ?
+  // TODO: do we need to vectorize this ?
   for (int mb = 0; mb < num_token_blocks; ++mb) {
     offsets[mb + 1] += offsets[mb];
   }
diff --git a/docs/design/multiprocessing.md b/docs/design/multiprocessing.md
index 247072d1c..6e92b20d2 100644
--- a/docs/design/multiprocessing.md
+++ b/docs/design/multiprocessing.md
@@ -8,7 +8,7 @@ page for information on known issues and how to solve them.
 ## Introduction
 
 !!! important
-    The source code references are to the state of the code at the time of writing in December, 2024.
+    The source code references are to the state of the code at the time of writing in December 2024.
 
 The use of Python multiprocessing in vLLM is complicated by:
 
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index d8cb208c4..78c768f92 100755
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -901,7 +901,7 @@ def _get_query_key_seq_metadata(
                 attn_metadata.encoder_seq_start_loc,
                 attn_metadata.max_encoder_seq_len)
     elif attn_type == AttentionType.ENCODER:
-        # For encoder attention both the query and the key are same i.e the
+        # For encoder attention both the query and the key are same i.e. the
         # encoder sequence.
         return (attn_metadata.encoder_seq_start_loc,
                 attn_metadata.max_encoder_seq_len,
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 32820b026..bf9e87198 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -551,7 +551,7 @@ class RandomDataset(BenchmarkDataset):
         [6880, 6881] -> ['Ġcalls', 'here'] ->
         [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
         To avoid uncontrolled change of the prompt length,
-        the encoded sequence is truncated before being decode again.
+        the encoded sequence is truncated before being decoded again.
         """
         # Build the inner sequence by sampling sequentially from the vocab
         inner_seq = ((offset + index + np.arange(input_len))
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index c8ecbd28e..4dcb1f3f1 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -242,7 +242,7 @@ def get_logits_processors(processors: Optional[LogitsProcessors],
     elif processors:
         raise ValueError(
             "The `logits_processors` argument is not supported by this "
-            "server. See --logits-processor-pattern engine argugment "
+            "server. See --logits-processor-pattern engine argument "
             "for more information.")
     return None
 
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py
index 04ebdbca8..1623a2fd5 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer2.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py
@@ -324,7 +324,7 @@ class MambaMixer2(MambaBase, CustomOp):
             # - the weight already has a "weight_loader" attribute
             #   which set_weight_attrs will raise if we do not
             #   delete before trying to override it
-            # - ditto for the otther two weights below
+            # - ditto for the other two weights below
             delattr(self.conv1d.bias, "weight_loader")
             set_weight_attrs(
                 self.conv1d.bias,
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 04176c558..9b2d84e32 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -1117,7 +1117,7 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
 
     def _process_multimodal_inputs(self, modalities: dict):
         # The result multimodal_embeddings is tuple of tensors, with each
-        # tensor correspoending to a multimodal data item (image or video).
+        # tensor corresponding to a multimodal data item (image or video).
         multimodal_embeddings: tuple[torch.Tensor, ...] = ()
 
         # NOTE: It is important to iterate over the keys in this dictionary
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index a17c8783d..6572e421b 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2659,7 +2659,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             num_tokens += num_pad
 
         # If cudagraph_mode.decode_mode() == FULL and
-        # cudagraph_mode.seperate_routine(). This means that we are using
+        # cudagraph_mode.separate_routine(). This means that we are using
         # different graphs and/or modes for mixed prefill-decode batches vs.
         # uniform decode batches. A uniform decode batch means that all
         # requests have identical query length, except a potential virtual
diff --git a/vllm/v1/worker/tpu_input_batch.py b/vllm/v1/worker/tpu_input_batch.py
index 81c798685..dfa54d0ad 100644
--- a/vllm/v1/worker/tpu_input_batch.py
+++ b/vllm/v1/worker/tpu_input_batch.py
@@ -392,7 +392,7 @@ class InputBatch:
         # NOTE: the following is unsafe
         # self.token_ids_cpu[i1, ...], self.token_ids_cpu[i2, ...], =\
         #     self.token_ids_cpu[i2, ...], self.token_ids_cpu[i1, ...]
-        # instead, we need to temporiarily copy the data for one of the indices
+        # instead, we need to temporarily copy the data for one of the indices
         # TODO(lucas): optimize this by only copying valid indices
         tmp = self.token_ids_cpu[i1, ...].copy()
         self.token_ids_cpu[i1, ...] = self.token_ids_cpu[i2, ...]