diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 315bac73f..50d0df589 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -285,7 +285,7 @@ class CompilerManager:
         with self.compile_context(compile_range):
             # There is a compilation time optimization here.
             #
-            # If the (input metdata, graph, compiler config) are the same, then
+            # If the (input metadata, graph, compiler config) are the same, then
             # we want to avoid compiling the same artifact again. If we didn't
             # do this optimization, the backend compilation (InductorAdaptor or
             # InductorStandaloneAdaptor)
diff --git a/vllm/model_executor/layers/fused_moe/oracle/fp8.py b/vllm/model_executor/layers/fused_moe/oracle/fp8.py
index 3dd32f5af..50b89eb35 100644
--- a/vllm/model_executor/layers/fused_moe/oracle/fp8.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/fp8.py
@@ -420,7 +420,7 @@ def make_fp8_moe_quant_config(
     per_out_ch_quant: bool = False,
 ) -> FusedMoEQuantConfig | None:
     """
-    Create FusedMoEQuantConfig for the specifed FP8 Backend.
+    Create FusedMoEQuantConfig for the specified FP8 Backend.
 
     The FusedMoEQuantConfig holds the scales that are used at runtime
     by the Modular Kernel abstraction.
diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py
index e68d35b31..c0d23964c 100644
--- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py
+++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py
@@ -151,7 +151,7 @@ class DefaultMoERunner(MoERunner):
     kernels for different parallel execution modes.
 
     Eventually, this class will be split up and specialized for different
-    configurations, e.g. the presense or absence of shared experts, a gate, etc.
+    configurations, e.g. the presence or absence of shared experts, a gate, etc.
     """
 
     def __init__(
diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py
index 503bcd3d0..fd7050861 100644
--- a/vllm/model_executor/models/gpt_oss.py
+++ b/vllm/model_executor/models/gpt_oss.py
@@ -586,7 +586,7 @@ class GptOssModel(nn.Module):
             parts = name.split(".")
             ids = [s for s in parts if s.isdigit()]
 
-            # for amd-quark format that each expert is seperated
+            # for the amd-quark format, where each expert is separated,
             # need to extract the parameter name with experts fused.
             # example model: amd/gpt-oss-20b-MoE-Quant-W-MXFP4-A-FP8-KV-FP8
             if len(ids) == 2:
diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py
index fe047e0df..17a0ddd6d 100644
--- a/vllm/utils/torch_utils.py
+++ b/vllm/utils/torch_utils.py
@@ -567,8 +567,8 @@ def current_stream() -> torch.cuda.Stream:
     return _current_stream_tls.value
 
 
-# Global auxilary stream for running operations in background streams.
-# We have single global auxilary stream to avoid an explosion of streams
+# Global auxiliary stream for running operations in background streams.
+# We have a single global auxiliary stream to avoid an explosion of streams
 # for every layer (and make profiling look sane).
 #
 # aux_stream() is currently used for:
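
A note on the backends.py hunk: the comment it touches describes memoizing compilation on the triple (input metadata, graph, compiler config) so the same artifact is never built twice. Below is a minimal sketch of that caching idea only; make_cache_key, compile_cached, and _artifact_cache are hypothetical names for illustration, not the actual CompilerManager API.

import hashlib
import json
from typing import Any, Callable

# Hypothetical in-memory artifact cache (illustration only).
_artifact_cache: dict[str, Any] = {}

def make_cache_key(input_metadata: dict, graph_repr: str, compiler_config: dict) -> str:
    # The cache key covers everything that determines the compiled artifact.
    payload = json.dumps(
        {"meta": input_metadata, "graph": graph_repr, "config": compiler_config},
        sort_keys=True,
    )
    return hashlib.sha256(payload.encode()).hexdigest()

def compile_cached(
    input_metadata: dict,
    graph_repr: str,
    compiler_config: dict,
    compile_fn: Callable[[], Any],
) -> Any:
    key = make_cache_key(input_metadata, graph_repr, compiler_config)
    if key not in _artifact_cache:
        # Pay the backend compilation cost (e.g. Inductor) only on a miss.
        _artifact_cache[key] = compile_fn()
    return _artifact_cache[key]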
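
The gpt_oss.py hunk sits in weight loading, where digit components of a dotted parameter name distinguish per-expert (amd-quark style) checkpoint keys from fused ones. Here is a worked illustration of that parsing; the parameter name is hypothetical, and only the split/isdigit logic and the len(ids) == 2 check come from the diff.

# Hypothetical per-expert checkpoint key (real keys may differ).
name = "model.layers.3.mlp.experts.7.down_proj.weight"

parts = name.split(".")
ids = [s for s in parts if s.isdigit()]

# Two digit components (layer index and expert index) signal a
# per-expert weight that must be folded into the fused experts tensor.
assert ids == ["3", "7"]
assert len(ids) == 2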
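
The torch_utils.py hunk documents the pattern of sharing one global auxiliary stream across all layers rather than creating one per layer. A minimal sketch of that pattern using the public torch.cuda stream API follows; aux_stream() is the name used in the comment, but this body is an illustration, not vLLM's implementation.

import torch

_aux_stream: torch.cuda.Stream | None = None

def aux_stream() -> torch.cuda.Stream:
    # Lazily create a single auxiliary stream and reuse it everywhere,
    # which avoids a stream per layer and keeps profiles readable.
    global _aux_stream
    if _aux_stream is None:
        _aux_stream = torch.cuda.Stream()
    return _aux_stream

if torch.cuda.is_available():
    x = torch.randn(64, 64, device="cuda")
    w = torch.randn(64, 64, device="cuda")
    # Queue work on the auxiliary stream, then make the current stream
    # wait for it before consuming the result.
    with torch.cuda.stream(aux_stream()):
        y = x @ w
    torch.cuda.current_stream().wait_stream(aux_stream())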