diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py
index e9e849b25..512b71284 100644
--- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py
+++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py
@@ -264,7 +264,7 @@ class DefaultMoERunner(MoERunner):
         )
 
         # Record that the shared_experts_input will be used in the
-        # shared_experts_stream to to avoid gc issue from
+        # shared_experts_stream to avoid gc issue from
         # deallocation. For more details:
         # https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html  # noqa: E501
         # NOTE: We don't need shared_output.record_stream(current_stream())
diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
index a8be1d61a..322b3a6e8 100644
--- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
@@ -50,7 +50,7 @@ def swap_w13_to_w31(x: torch.Tensor) -> torch.Tensor:
 def rotate_weights_for_fi_trtllm_fp8_per_tensor_moe(
     gemm1_weights: torch.Tensor, gemm2_weights: torch.Tensor, is_gated_activation: bool
 ):
-    """Shuffle weights for for FI TRT-LLM Format"""
+    """Shuffle weights for FI TRT-LLM Format"""
     from flashinfer import reorder_rows_for_gated_act_gemm, shuffle_matrix_a
 
     epilogue_tile_m = 128
diff --git a/vllm/model_executor/models/transformers/pooling.py b/vllm/model_executor/models/transformers/pooling.py
index 8f3173c33..f4fa4b496 100644
--- a/vllm/model_executor/models/transformers/pooling.py
+++ b/vllm/model_executor/models/transformers/pooling.py
@@ -57,7 +57,7 @@ class SequenceClassificationMixin(SupportsCrossEncoding, VllmModelForPooling):
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        # Certain information about the the model and classifier can only be
+        # Certain information about the model and classifier can only be
         # inferred from the `ForSequenceClassification` class. Therefore, we
         # instantiate it on the "meta" device to avoid allocating GPU memory.
         with torch.device("meta"):
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 4e9db1ed2..901021514 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -952,7 +952,7 @@ class OpenCVDynamicOpenPanguVideoBackend(VideoLoader, OpenCVVideoBackendMixin):
             frame_recovery=frame_recovery,
         )
 
-        # Use transformers transformers.video_utils.VideoMetadata format
+        # Use transformers.video_utils.VideoMetadata format
         metadata = cls.create_hf_metadata(
             source=source,
             video_backend="opencv_dynamic",
diff --git a/vllm/tokenizers/mistral.py b/vllm/tokenizers/mistral.py
index bf460bb79..49b4272ee 100644
--- a/vllm/tokenizers/mistral.py
+++ b/vllm/tokenizers/mistral.py
@@ -44,7 +44,7 @@ def maybe_serialize_tool_calls(request: "MistralChatCompletionRequest"):
     # SEE: https://github.com/vllm-project/vllm/pull/9951
     # Credits go to: @gcalmettes
     # NOTE: There is currently a bug in pydantic where attributes
-    # declared as iterables are replaced in in the instances by
+    # declared as iterables are replaced in the instances by
     # pydantic-core ValidatorIterator instance. In particular, this
     # affects tool_calls defined in ChatCompletionAssistantMessageParam
     # model:
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index a98525cf4..b0e13d609 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -1055,6 +1055,6 @@ def init_worker_distributed_environment(
         parallel_config.decode_context_parallel_size,
     )
 
-    # Init ec connector here before KV caches caches init
+    # Init ec connector here before KV caches init
     # NOTE: We do not init KV caches for Encoder-only instance in EPD disagg mode
     ensure_ec_transfer_initialized(vllm_config)