diff --git a/vllm/distributed/device_communicators/quick_all_reduce.py b/vllm/distributed/device_communicators/quick_all_reduce.py index c61231e2d..836241910 100644 --- a/vllm/distributed/device_communicators/quick_all_reduce.py +++ b/vllm/distributed/device_communicators/quick_all_reduce.py @@ -78,7 +78,7 @@ class QuickAllReduce: group: the process group to work on. If None, it will use the default process group. device: the device to bind the CustomAllreduce to. If None, - it will be bind to f"cuda:{local_rank}". + it will be bound to f"cuda:{local_rank}". It is the caller's responsibility to make sure each communicator is bind to a unique device, and all communicators in this group are in the same node. diff --git a/vllm/distributed/device_communicators/ray_communicator.py b/vllm/distributed/device_communicators/ray_communicator.py index 46cc1c2f5..8cd8c459a 100644 --- a/vllm/distributed/device_communicators/ray_communicator.py +++ b/vllm/distributed/device_communicators/ray_communicator.py @@ -186,7 +186,7 @@ class RayPPCommunicator(Communicator): """ Receive a torch.Tensor from a peer and synchronize the current stream. - After this call returns, the receive buffer is safe to read from from + After this call returns, the receive buffer is safe to read from any stream. An RayChannelError will be raised if an error occurred (e.g., remote actor died), and the buffer is not safe to read. diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 01551a8c7..fa813550e 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -161,7 +161,7 @@ async def write_local_file(output_path: str, batch_outputs: The list of batch outputs to write. """ # We should make this async, but as long as run_batch runs as a - # standalone program, blocking the event loop won't effect performance. + # standalone program, blocking the event loop won't affect performance. with open(output_path, "w", encoding="utf-8") as f: for o in batch_outputs: print(o.model_dump_json(), file=f) diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 6a676cfe1..4c15de303 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -728,7 +728,7 @@ class OpenAIServingResponses(OpenAIServing): for response_msg in request.input: messages.append( parse_response_input(response_msg, prev_outputs)) - # User passes in a a tool call request and its output. We need + # User passes in a tool call request and its output. We need # to add the tool call request to prev_outputs so that the # parse_response_input can find the tool call request when # parsing the tool call output. diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 4b2a15afb..0bdeb2856 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -223,7 +223,7 @@ def _wait_until_pg_ready(current_placement_group: "PlacementGroup"): """ # Wait until PG is ready - this will block until all - # requested resources are available, and will timeout + # requested resources are available, and will time out # if they cannot be provisioned. placement_group_specs = current_placement_group.bundle_specs diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py index 6154fca2e..f4ff875ad 100644 --- a/vllm/model_executor/layers/quantization/utils/quant_utils.py +++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py @@ -116,7 +116,7 @@ def _normalize_quant_group_shape(x: torch.Tensor, group_shape: GroupShape): # then we would expand a to: # a = [[1, 1, 2, 2], # [3, 3, 4, 4]] -# NOTE this function this function does not explicitly broadcast dimensions +# NOTE this function does not explicitly broadcast dimensions # with an extent of 1, since this can be done implicitly by pytorch def group_broadcast(t, shape): for i, s in enumerate(shape): diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index edb7f2421..f236040bb 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -185,7 +185,7 @@ _EMBEDDING_MODELS = { "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501 # Technically PrithviGeoSpatialMAE is a model that works on images, both in - # input and output. I am adding it here because it piggy-backs on embedding + # input and output. I am adding it here because it piggybacks on embedding # models for the time being. "PrithviGeoSpatialMAE": ("prithvi_geospatial_mae", "PrithviGeoSpatialMAE"), } diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 56f0f0984..2315f9dad 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -97,7 +97,7 @@ class SamplingMetadataCache: class SamplingMetadata: """Metadata for input sequences. Used in sampler. - The usage is as follow; + The usage is as follows; ``` hidden_states = execute_model(...) logits = hidden_states[sampling_metadata.selected_token_indices] diff --git a/vllm/scalar_type.py b/vllm/scalar_type.py index 6f11ab8e0..055f28914 100644 --- a/vllm/scalar_type.py +++ b/vllm/scalar_type.py @@ -269,7 +269,7 @@ class ScalarType: @classmethod def uint(cls, size_bits: int, bias: Optional[int]) -> 'ScalarType': - """Create a unsigned integer scalar type.""" + """Create an unsigned integer scalar type.""" ret = cls(0, size_bits, False, bias if bias else 0) ret.id # noqa B018: make sure the id is cached return ret diff --git a/vllm/sequence.py b/vllm/sequence.py index 4b8e1f464..24114c0bb 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -1193,7 +1193,7 @@ class HiddenStates(msgspec.Struct, array_like=True, seq_ids are the sequence ids of each entry of the batch dimension of the hidden_states tensor""" # Scorer hidden states. For prefill step, it is used for hidden states of - # all tokens, whereas for decode step, it use used for last accepted tokens. + # all tokens, whereas for decode step, it is used for last accepted tokens. hidden_states: torch.Tensor # The sequence group metadata list. Only needed for decode step. seq_group_metadata_list: Optional[list[SequenceGroupMetadata]] = None diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index e07d53ff8..8322fa733 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -815,7 +815,7 @@ class Scheduler(SchedulerInterface): # NOTE: structured_output_request_ids maps # a request's (request that uses structured output) # request_id to its index in the batch. - # This will helps us determine to slice the grammar bitmask + # This will help us determine to slice the grammar bitmask # and only applies valid mask for requests that # uses structured decoding. structured_output_request_ids: dict[str, int] = {} @@ -923,7 +923,7 @@ class Scheduler(SchedulerInterface): request): # NOTE: structured_output_request # should not be None if use_structured_output, we have - # check above, so safe to ignore type warning + # checked above, so safe to ignore type warning request.structured_output_request.grammar.accept_tokens( # type: ignore[union-attr] req_id, new_token_ids) @@ -1242,7 +1242,7 @@ class Scheduler(SchedulerInterface): finished_sending reqs to the output. * if finished_sending: free the blocks # if finished_recving: add to state so we can - scheduler the request during the next step. + schedule the request during the next step. """ if self.connector is not None: diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 9a8046026..95094bda6 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -59,7 +59,7 @@ class RequestStateStats: num_generation_tokens: int = 0 - # This is a engine frontend timestamp (wall-clock) + # This is an engine frontend timestamp (wall-clock) arrival_time: float = 0.0 # These are engine core timestamps (monotonic)