diff --git a/.buildkite/scripts/upload-nightly-wheels.sh b/.buildkite/scripts/upload-nightly-wheels.sh index 5efcb89bf..071939df9 100644 --- a/.buildkite/scripts/upload-nightly-wheels.sh +++ b/.buildkite/scripts/upload-nightly-wheels.sh @@ -72,7 +72,7 @@ obj_json="objects.json" aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json" mkdir -p "$INDICES_OUTPUT_DIR" -# call script to generate indicies for all existing wheels +# call script to generate indices for all existing wheels # this indices have relative paths that could work as long as it is next to the wheel directory in s3 # i.e., the wheels are always in s3://vllm-wheels// # and indices can be placed in //, or /nightly/, or // diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 9130026e1..6eda7bce9 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -467,7 +467,7 @@ steps: - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine -# TODO: Add the "V1 Test attetion (MI300)" test group +# TODO: Add the "V1 Test attention (MI300)" test group - label: V1 Test attention (H100) # 10min mirror_hardwares: [amdexperimental, amdproduction] @@ -2174,7 +2174,7 @@ steps: - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine -# TODO: Add the "V1 Test attetion (MI300)" test group +# TODO: Add the "V1 Test attention (MI300)" test group - label: V1 Test attention (H100) # 10min mirror_hardwares: [amdexperimental] diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a480eeff0..0ea8ca3c3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -13,7 +13,7 @@ repos: args: [--output-format, github, --fix] - id: ruff-format - repo: 
https://github.com/crate-ci/typos - rev: v1.38.1 + rev: v1.43.5 hooks: - id: typos args: [--force-exclude] diff --git a/benchmarks/attention_benchmarks/common.py b/benchmarks/attention_benchmarks/common.py index 6bba93e50..9fa22c8d5 100644 --- a/benchmarks/attention_benchmarks/common.py +++ b/benchmarks/attention_benchmarks/common.py @@ -30,7 +30,7 @@ def batch_spec_sort_key(spec: str) -> tuple[int, int, int]: max_kv_len = max(r.kv_len for r in requests) if requests else 0 return (batch_size, max_q_len, max_kv_len) except Exception: - # Fallback for unparseable specs + # Fallback for unparsable specs return (0, 0, 0) diff --git a/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py index 8aaf82197..0dd5c6d84 100644 --- a/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py +++ b/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py @@ -202,7 +202,7 @@ def test_correctness(T: int, N: int): # reference output ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE) - # test ouptut + # test output out_q, out_s = output_from_impl( ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR ) diff --git a/csrc/cpu/cpu_attn_amx.hpp b/csrc/cpu/cpu_attn_amx.hpp index 8da458b99..1c8644d52 100644 --- a/csrc/cpu/cpu_attn_amx.hpp +++ b/csrc/cpu/cpu_attn_amx.hpp @@ -420,7 +420,7 @@ class AttentionImpl { const int64_t block_size, const int64_t block_size_stride) { // For AMX 2D tiles, size of each line is 64 bytes constexpr int64_t amx_tile_row_size = AMX_TILE_ROW_BYTES; - // For AMX B martix, N always is 16 + // For AMX B matrix, N always is 16 constexpr int64_t amx_b_tile_n_size = AMX_TILE_ROW_BYTES / 4; constexpr int64_t amx_b_tile_k_size = amx_tile_row_size / sizeof(scalar_t); // For now suppose block_size is divisible by amx_tile_column_num diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index 2ea482148..d011ff038 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -4,7 
+4,7 @@ #include -// Note: overwrite the external defination for sharing same name between +// Note: overwrite the external definition for sharing same name between // libraries use different ISAs. #define TORCH_EXTENSION_NAME _C diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu index e3539ff40..b4b3c793b 100644 --- a/csrc/moe/moe_align_sum_kernels.cu +++ b/csrc/moe/moe_align_sum_kernels.cu @@ -35,11 +35,11 @@ __global__ void batched_moe_align_block_size_kernel( int32_t const block_ids_size = sorted_ids_size / block_size; int32_t const SENTINEL = num_batches * max_tokens_per_batch; // To denote invalid entries. - // Intialize sorted_ids + // Initialize sorted_ids for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) { sorted_ids[i] = SENTINEL; } - // Intialize expert_ids with -1 + // Initialize expert_ids with -1 for (size_t i = threadIdx.x; i < block_ids_size; i += stride) { block_ids[i] = -1; } diff --git a/csrc/quantization/activation_kernels.cu b/csrc/quantization/activation_kernels.cu index 0c3bcf3b6..c0153bb41 100644 --- a/csrc/quantization/activation_kernels.cu +++ b/csrc/quantization/activation_kernels.cu @@ -542,7 +542,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel( if (!lane_id) { // Store scales. if constexpr (std::is_same::value) { - // Packed UE8MO format. Remove Mantissa. + // Packed UE8M0 format. Remove Mantissa. *y_s_ptr = reinterpret_cast(y_s) >> 7; bool const jump_pack = (current_group_id + 1) % 4 == 0; diff --git a/csrc/rocm/skinny_gemms.cu b/csrc/rocm/skinny_gemms.cu index 19bb324bd..9e776296f 100644 --- a/csrc/rocm/skinny_gemms.cu +++ b/csrc/rocm/skinny_gemms.cu @@ -1476,7 +1476,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) #endif // B[] staging is cooperative across GrpsShrB, so sync here before reading - // back. This wait is currently inserted by compiler, but not gauranteed. + // back. This wait is currently inserted by compiler, but not guaranteed. 
asm volatile("s_waitcnt 0"); __syncthreads(); diff --git a/docs/design/cuda_graphs.md b/docs/design/cuda_graphs.md index b27c8d34e..6f6fb2493 100644 --- a/docs/design/cuda_graphs.md +++ b/docs/design/cuda_graphs.md @@ -98,7 +98,7 @@ The goal of this structure is to uniquely identify a (padded) batch with minimal ### `CudagraphDispatcher` -The [CudagraphDispatcher][vllm.v1.cudagraph_dispatcher.CudagraphDispatcher] takes responsibility for maintaining two sets of valid dispatching keys, one set for `FULL` runtime mode and one set for `PIECEWISE` runtime mode, and dispatches the correct runtime mode and the dispatching keys before executing the model's forwards. It will take in the initial key (a rough batch_descriptor for the padded input) and return the selected runtime mode and the final batch_descriptor, then tell the CUDAGraphWarpper instances that decision through forward contexts. Notice that `CudagraphDispatcher` is the only source of truth for available CUDA Graph keys and `CUDAGraphWrapper` instances can blindly trust the forward context on what CUDA Graphs to dispatch to. This lets us simplify the wrapper code and centralize the logic in the dispatcher. +The [CudagraphDispatcher][vllm.v1.cudagraph_dispatcher.CudagraphDispatcher] takes responsibility for maintaining two sets of valid dispatching keys, one set for `FULL` runtime mode and one set for `PIECEWISE` runtime mode, and dispatches the correct runtime mode and the dispatching keys before executing the model's forwards. It will take in the initial key (a rough batch_descriptor for the padded input) and return the selected runtime mode and the final batch_descriptor, then tell the CUDAGraphWrapper instances that decision through forward contexts. Notice that `CudagraphDispatcher` is the only source of truth for available CUDA Graph keys and `CUDAGraphWrapper` instances can blindly trust the forward context on what CUDA Graphs to dispatch to. 
This lets us simplify the wrapper code and centralize the logic in the dispatcher. The dispatching keys are initialized through the dispatcher's `initialize_cudagraph_keys` method, which is called by the gpu_model_runner after all possible attention backends are initialized. This is where we can get much fancier in the future and “prepare” all kinds of CUDA Graphs combinations. For now, we just append available keys based on the valid combos of `decode_mode`/`mixed_mode` of `cudagraph_mode` and `cudagraph_capture_sizes` in the compilation config. diff --git a/docs/design/fused_moe_modular_kernel.md b/docs/design/fused_moe_modular_kernel.md index 7f356262b..090bb729b 100644 --- a/docs/design/fused_moe_modular_kernel.md +++ b/docs/design/fused_moe_modular_kernel.md @@ -47,7 +47,7 @@ The TopK Weight Application and Reduction components happen right after the Unpe Please find the implementations of TopKWeightAndReduce [here](../../vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py). `FusedMoEPrepareAndFinalizeModular::finalize()` method accepts a `TopKWeightAndReduce` argument that is invoked inside the method. -The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEExpertsModular` and `FusedMoEPerpareAndFinalize` implementations to determine where the TopK Weight Application and Reduction happens. +The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEExpertsModular` and `FusedMoEPrepareAndFinalize` implementations to determine where the TopK Weight Application and Reduction happens. * `FusedMoEExpertsModular::finalize_weight_and_reduce_impl` method returns `TopKWeightAndReduceNoOp` if the `FusedMoEExpertsModular` implementation does the weight application and reduction itself. 
* `FusedMoEExpertsModular::finalize_weight_and_reduce_impl` method returns `TopKWeightAndReduceContiguous` / `TopKWeightAndReduceNaiveBatched` / `TopKWeightAndReduceDelegate` if the `FusedMoEExpertsModular` implementation needs the `FusedMoEPrepareAndFinalizeModular::finalize()` to do the weight application and reduction. diff --git a/docs/design/logits_processors.md b/docs/design/logits_processors.md index af1d7b6bb..980001156 100644 --- a/docs/design/logits_processors.md +++ b/docs/design/logits_processors.md @@ -352,7 +352,7 @@ The `BatchUpdate` abstraction models the persistent batch as a list of requests, (s, d, UNIDIRECTIONAL or SWAP) ``` - * If the Move specifies `UNIDRECTIONAL`: + * If the Move specifies `UNIDIRECTIONAL`: * The request at index `s` is moved to index `d`; index `s` becomes an empty slot diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md index 1f491a3a4..d674f7740 100644 --- a/docs/design/plugin_system.md +++ b/docs/design/plugin_system.md @@ -141,7 +141,7 @@ Every plugin has three parts: - triton ops Custom way doesn't work for triton ops now. -7. (optional) Implement other plugable modules, such as lora, graph backend, quantization, mamba attention backend, etc. +7. (optional) Implement other pluggable modules, such as lora, graph backend, quantization, mamba attention backend, etc. 
## Compatibility Guarantee diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index d43557a29..b53f0fad2 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -641,7 +641,7 @@ Then you obtain the sparse embeddings like this: curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{ "model": "BAAI/bge-m3", "task": "token_classify", - "input": ["What is BGE M3?", "Defination of BM25"] + "input": ["What is BGE M3?", "Definition of BM25"] }' ``` @@ -657,7 +657,7 @@ You can obtain the colbert embeddings like this: curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{ "model": "BAAI/bge-m3", "task": "token_embed", - "input": ["What is BGE M3?", "Defination of BM25"] + "input": ["What is BGE M3?", "Definition of BM25"] }' ``` diff --git a/examples/online_serving/dashboards/grafana/query_statistics.json b/examples/online_serving/dashboards/grafana/query_statistics.json index 880f6c5d7..e40ee276c 100644 --- a/examples/online_serving/dashboards/grafana/query_statistics.json +++ b/examples/online_serving/dashboards/grafana/query_statistics.json @@ -349,7 +349,7 @@ "defaults": { "color": { "mode": "thresholds" }, "mappings": [ - { "options": { "Calcultion": { "index": 0, "text": "Last (not null)" } }, "type": "value" } + { "options": { "Calculation": { "index": 0, "text": "Last (not null)" } }, "type": "value" } ], "thresholds": { "mode": "absolute", diff --git a/pyproject.toml b/pyproject.toml index b4b9334f8..d4fb554d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -124,193 +124,54 @@ python = "./.venv" [tool.typos.files] # these files may be written in non english words -extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*", - "benchmarks/sonnet.txt", "tests/lora/data/*", "build/*", - "vllm/third_party/*", "vllm/entrypoints/serve/instrumentator/static/*", - "docs/governance/process.md"] -ignore-hidden = true -ignore-files = true -ignore-dot = true -ignore-vcs = 
true -ignore-global = true -ignore-parent = true +extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*", "tests/tokenizers_/*", + "benchmarks/sonnet.txt", "tests/lora/data/*", "examples/pooling/token_embed/*", "build/*", + "vllm/third_party/*", "vllm/entrypoints/serve/instrumentator/static/*", "tests/entrypoints/openai/test_transcription_validation.py", + "docs/governance/process.md", "tests/v1/engine/test_fast_incdec_prefix_err.py", ".git/*"] +ignore-hidden = false [tool.typos.default] -binary = false -check-filename = false -check-file = true -unicode = true -ignore-hex = true -identifier-leading-digits = false -locale = "en" -extend-ignore-identifiers-re = ["NVML_*", ".*Unc.*", ".*_thw", - ".*UE8M0.*", ".*[UE4M3|ue4m3].*", ".*eles.*", - ".*[Tt]h[rR].*"] -extend-ignore-words-re = [] -extend-ignore-re = [] +extend-ignore-identifiers-re = [".*[Uu][Ee][0-9][Mm][0-9].*"] [tool.typos.default.extend-identifiers] bbc5b7ede = "bbc5b7ede" -womens_doubles = "womens_doubles" -v_2nd = "v_2nd" -# splitted_input = "splitted_input" NOOPs = "NOOPs" -typ = "typ" nin_shortcut = "nin_shortcut" -UperNetDecoder = "UperNetDecoder" -subtile = "subtile" cudaDevAttrMaxSharedMemoryPerBlockOptin = "cudaDevAttrMaxSharedMemoryPerBlockOptin" -SFOuput = "SFOuput" -# huggingface transformers repo uses these words + depthwise_seperable_out_channel = "depthwise_seperable_out_channel" -DepthWiseSeperableConv1d = "DepthWiseSeperableConv1d" -depthwise_seperable_CNN = "depthwise_seperable_CNN" +pard_token = "pard_token" +ptd_token_id = "ptd_token_id" +ser_de = "ser_de" +shared_memory_per_block_optin = "shared_memory_per_block_optin" +FoPE = "FoPE" +k_ot = "k_ot" +view_seperator = "view_seperator" +inverse_std_variences = "inverse_std_variences" [tool.typos.default.extend-words] iy = "iy" -tendencias = "tendencias" indx = "indx" # intel cpu features tme = "tme" dout = "dout" Pn = "Pn" arange = "arange" +thw = "thw" +subtile = "subtile" +HSA = "HSA" +setp = "setp" +CPY = "CPY" +thr = "thr" 
+Thr = "Thr" PARD = "PARD" pard = "pard" AKS = "AKS" - -[tool.typos.type.py] -extend-glob = [] -extend-ignore-identifiers-re = [] -extend-ignore-words-re = [] -extend-ignore-re = [] - -[tool.typos.type.py.extend-identifiers] -arange = "arange" -NDArray = "NDArray" -EOFError = "EOFError" +ba = "ba" fo = "fo" -ba = "ba" - -[tool.typos.type.py.extend-words] -ba = "ba" nd = "nd" - -[tool.typos.type.cpp] -extend-glob = ["*.cu"] -extend-ignore-identifiers-re = [] -extend-ignore-words-re = [] -extend-ignore-re = [] - -[tool.typos.type.cpp.extend-identifiers] -countr_one = "countr_one" -k_ot = "k_ot" -ot = "ot" - -[tool.typos.type.cpp.extend-words] - -[tool.typos.type.rust] -extend-glob = [] -extend-ignore-identifiers-re = [] -extend-ignore-words-re = [] -extend-ignore-re = [] - -[tool.typos.type.rust.extend-identifiers] -flate2 = "flate2" - -[tool.typos.type.rust.extend-words] -ser = "ser" - -[tool.typos.type.lock] -extend-glob = [] -check-file = false -extend-ignore-identifiers-re = [] -extend-ignore-words-re = [] -extend-ignore-re = [] - -[tool.typos.type.lock.extend-identifiers] - -[tool.typos.type.lock.extend-words] - -[tool.typos.type.jl] -extend-glob = [] -extend-ignore-identifiers-re = [] -extend-ignore-words-re = [] -extend-ignore-re = [] - -[tool.typos.type.jl.extend-identifiers] - -[tool.typos.type.jl.extend-words] -modul = "modul" -egals = "egals" -usig = "usig" -egal = "egal" - -[tool.typos.type.go] -extend-glob = [] -extend-ignore-identifiers-re = [] -extend-ignore-words-re = [] -extend-ignore-re = [] - -[tool.typos.type.go.extend-identifiers] -flate = "flate" - -[tool.typos.type.go.extend-words] - -[tool.typos.type.css] -extend-glob = [] -extend-ignore-identifiers-re = [] -extend-ignore-words-re = [] -extend-ignore-re = [] - -[tool.typos.type.css.extend-identifiers] -nd = "nd" - -[tool.typos.type.css.extend-words] - -[tool.typos.type.man] -extend-glob = [] -extend-ignore-identifiers-re = [] -extend-ignore-words-re = [] -extend-ignore-re = [] - 
-[tool.typos.type.man.extend-identifiers] -Nd = "Nd" - -[tool.typos.type.man.extend-words] - -[tool.typos.type.cert] -extend-glob = [] -check-file = false -extend-ignore-identifiers-re = [] -extend-ignore-words-re = [] -extend-ignore-re = [] - -[tool.typos.type.cert.extend-identifiers] - -[tool.typos.type.cert.extend-words] - -[tool.typos.type.sh] -extend-glob = [] -extend-ignore-identifiers-re = [] -extend-ignore-words-re = [] -extend-ignore-re = [] - -[tool.typos.type.sh.extend-identifiers] -ot = "ot" - -[tool.typos.type.sh.extend-words] - -[tool.typos.type.vimscript] -extend-glob = [] -extend-ignore-identifiers-re = [] -extend-ignore-words-re = [] -extend-ignore-re = [] - -[tool.typos.type.vimscript.extend-identifiers] -windo = "windo" - -[tool.typos.type.vimscript.extend-words] +eles = "eles" +datas = "datas" [tool.uv] no-build-isolation-package = ["torch"] diff --git a/tests/compile/test_decorator.py b/tests/compile/test_decorator.py index 1850cc8f1..6763a6dff 100644 --- a/tests/compile/test_decorator.py +++ b/tests/compile/test_decorator.py @@ -234,7 +234,7 @@ def test_conditional_compile_enable_if(use_inductor_graph_partition, monkeypatch expected_num_backend_compilations = 4 # A has support_torch_compile but enable_if fn returns False - # enalbe_if will be True for B, so we expect mod1 and mod2 + # enable_if will be True for B, so we expect mod1 and mod2 # to be compiled with compilation_counter.expect( num_graphs_seen=2, diff --git a/tests/compile/test_wrapper.py b/tests/compile/test_wrapper.py index 356cac7af..5e0755ff7 100644 --- a/tests/compile/test_wrapper.py +++ b/tests/compile/test_wrapper.py @@ -95,7 +95,7 @@ def test_torch_compile_wrapper(use_bytecode_hook, monkeypatch): f"Expected {expected1}, got {result1}" ) - # Second call should triger another compilation + # Second call should trigger another compilation x2 = torch.tensor([1, 2, 3]) result2 = wrapper(x2) expected2 = torch.tensor([100, 200, 300]) diff --git 
a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py index e3b612123..a14b80b32 100644 --- a/tests/kernels/attention/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -444,7 +444,7 @@ def ref_multi_query_kv_attention( @pytest.mark.parametrize("attention_cls", [Attention, MMEncoderAttention]) -def test_num_heads_not_divisble_by_num_kv_heads(attention_cls: type) -> None: +def test_num_heads_not_divisible_by_num_kv_heads(attention_cls: type) -> None: head_size = 64 scale = float(1.0 / (head_size**0.5)) num_heads = 16 diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py index cac22a185..53aed1032 100644 --- a/tests/kernels/moe/test_modular_kernel_combinations.py +++ b/tests/kernels/moe/test_modular_kernel_combinations.py @@ -162,7 +162,7 @@ Ns = [1024] TOPKs = [4, 1] Es = [32] DTYPEs = [torch.bfloat16] -FUSED_MOE_CHUNK_SIZEs = [None, 16] +FUSED_MOE_CHUNK_SIZES = [None, 16] def is_nyi_config(config: Config) -> bool: @@ -192,7 +192,7 @@ def generate_valid_test_cases( DTYPEs, MK_QUANT_CONFIGS, product(prepare_finalize_types, MK_FUSED_EXPERT_TYPES), - FUSED_MOE_CHUNK_SIZEs, + FUSED_MOE_CHUNK_SIZES, ): total = total + 1 @@ -266,7 +266,7 @@ def test_modular_kernel_combinations_multigpu( if cuda_device_count_stateless() < world_size: pytest.skip( f"Not enough GPUs available to run, got " - f"{cuda_device_count_stateless()} exepected " + f"{cuda_device_count_stateless()} expected " f"{world_size}." ) diff --git a/tests/models/language/generation/test_mistral.py b/tests/models/language/generation/test_mistral.py index 0ef4ba257..bc85d6f72 100644 --- a/tests/models/language/generation/test_mistral.py +++ b/tests/models/language/generation/test_mistral.py @@ -87,7 +87,7 @@ MSGS = [ { "role": "user", "content": "Could you please rewrite the below article? 
\n\n My English needs " - "improvving, maybe I make errors.", + "improving, maybe I make errors.", }, { "role": "assistant", @@ -98,7 +98,7 @@ MSGS = [ "type": "function", "function": { "name": "rewrite", - "arguments": '{"text":"My English needs improvving, maybe ' + "arguments": '{"text":"My English needs improving, maybe ' 'I make errors."}', }, } diff --git a/tests/models/language/pooling/test_bge_m3.py b/tests/models/language/pooling/test_bge_m3.py index 2c0c0de34..80ed4eb47 100644 --- a/tests/models/language/pooling/test_bge_m3.py +++ b/tests/models/language/pooling/test_bge_m3.py @@ -14,7 +14,7 @@ MAX_MODEL_LEN = 512 # Example from https://huggingface.co/BAAI/bge-m3 -sentences_1 = ["What is BGE M3?", "Defination of BM25"] +sentences_1 = ["What is BGE M3?", "Definition of BM25"] sentences_2 = [ "BGE M3 is an embedding model supporting dense retrieval, " "lexical matching and multi-vector interaction.", diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py index a48644e6b..311c78545 100644 --- a/tests/models/multimodal/generation/vlm_utils/model_utils.py +++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py @@ -719,7 +719,7 @@ def isaac_patch_hf_runner(hf_model: HfRunner) -> HfRunner: # Convert to tuple or None all_hidden_states = tuple(hidden_states_list) if output_hidden_states else None - # Include hiden_states for compatibility with hidden_states_to_seq_logprobs() + # Include hidden_states for compatibility with hidden_states_to_seq_logprobs() return BaseModelOutputWithPast( last_hidden_state=hidden_states, past_key_values=past_key_values, @@ -1226,7 +1226,7 @@ def voxtral_patch_hf_runner(hf_model: "HfRunner") -> "HfRunner": dicts (accepting ``url``, ``path``, or ``base64`` audio) rather than the standard ``processor(text=, audio=, sampling_rate=)`` interface. 2. 
HfRunner.get_inputs cannot handle multi-audio per prompt because it - mis-unpacks ``[(arr1, sr1), (arr2, sr2)]`` via a ``len == 2`` check. + incorrectly unpacks ``[(arr1, sr1), (arr2, sr2)]`` via a ``len == 2`` check. We override ``get_inputs`` to build conversation dicts and call ``apply_chat_template`` directly, bypassing both issues. We also wrap diff --git a/tests/quantization/test_blackwell_moe.py b/tests/quantization/test_blackwell_moe.py index fe44017a0..3af08e026 100644 --- a/tests/quantization/test_blackwell_moe.py +++ b/tests/quantization/test_blackwell_moe.py @@ -25,7 +25,7 @@ def set_test_environment(): os.environ["FLASHINFER_NVCC_THREADS"] = "16" -# Overide the backbone layers to 4 for faster startup +# Override the backbone layers to 4 for faster startup HF_OVERRIDE_TEXT = { "num_layers": 4, "num_hidden_layers": 4, diff --git a/tests/renderers/test_hf.py b/tests/renderers/test_hf.py index b6afcc559..236557ddf 100644 --- a/tests/renderers/test_hf.py +++ b/tests/renderers/test_hf.py @@ -206,8 +206,8 @@ def test_resolve_chat_template_kwargs(sample_json_schema, model, expected_kwargs chat_template_kwargs = { # both unused - "unsed_kwargs_1": 123, - "unsed_kwargs_2": "abc", + "unused_kwargs_1": 123, + "unused_kwargs_2": "abc", # should not appear "chat_template": "{% Hello world! 
%}", "tokenize": True, diff --git a/tests/test_config.py b/tests/test_config.py index 0abfef76f..f98b30f99 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -853,7 +853,7 @@ def test_vllm_config_defaults_are_none(): @pytest.mark.parametrize( - ("model_id", "compiliation_config", "optimization_level"), + ("model_id", "compilation_config", "optimization_level"), [ ( None, @@ -895,7 +895,7 @@ def test_vllm_config_defaults_are_none(): ("RedHatAI/DeepSeek-V2.5-1210-FP8", CompilationConfig(), OptimizationLevel.O3), ], ) -def test_vllm_config_defaults(model_id, compiliation_config, optimization_level): +def test_vllm_config_defaults(model_id, compilation_config, optimization_level): """Test that optimization-level defaults are correctly applied.""" model_config = None @@ -903,12 +903,12 @@ def test_vllm_config_defaults(model_id, compiliation_config, optimization_level) model_config = ModelConfig(model_id) vllm_config = VllmConfig( model_config=model_config, - compilation_config=compiliation_config, + compilation_config=compilation_config, optimization_level=optimization_level, ) else: vllm_config = VllmConfig( - compilation_config=compiliation_config, + compilation_config=compilation_config, optimization_level=optimization_level, ) # Use the global optimization level defaults diff --git a/tests/tool_parsers/test_seed_oss_tool_parser.py b/tests/tool_parsers/test_seed_oss_tool_parser.py index 88cc736f6..87e71a12f 100644 --- a/tests/tool_parsers/test_seed_oss_tool_parser.py +++ b/tests/tool_parsers/test_seed_oss_tool_parser.py @@ -106,7 +106,7 @@ def test_extract_tool_calls_no_tools(seed_oss_tool_parser): @pytest.mark.parametrize( ids=[ "tool_call_0_thinking_budget", - "tool_call_512_thinkg_budget", + "tool_call_512_thinking_budget", "tool_call_unlimited_thinking_budget", ], argnames=["model_output", "expected_tool_calls", "expected_content"], @@ -308,7 +308,7 @@ def stream_delta_message_generator( @pytest.mark.parametrize( ids=[ "tool_call_0_thinking_budget", 
- "tool_call_512_thinkg_budget", + "tool_call_512_thinking_budget", "tool_call_unlimited_thinking_budget", ], argnames=["model_output", "expected_tool_calls", "expected_content"], diff --git a/tests/transformers_utils/test_repo_utils.py b/tests/transformers_utils/test_repo_utils.py index e17e3de84..6da4256cb 100644 --- a/tests/transformers_utils/test_repo_utils.py +++ b/tests/transformers_utils/test_repo_utils.py @@ -34,10 +34,10 @@ def test_list_filtered_repo_files( subfolder.mkdir() (path_tmp_dir / "json_file.json").touch() (path_tmp_dir / "correct_2.txt").touch() - (path_tmp_dir / "uncorrect.txt").touch() - (path_tmp_dir / "uncorrect.jpeg").touch() + (path_tmp_dir / "incorrect.txt").touch() + (path_tmp_dir / "incorrect.jpeg").touch() (subfolder / "correct.txt").touch() - (subfolder / "uncorrect_sub.txt").touch() + (subfolder / "incorrect_sub.txt").touch() def _glob_path() -> list[str]: return [ @@ -86,7 +86,7 @@ def test_one_filtered_repo_files(allow_patterns: list[str], expected_bool: bool) path_tmp_dir = Path(tmp_dir) subfolder = path_tmp_dir / "subfolder" subfolder.mkdir() - (path_tmp_dir / "uncorrect.jpeg").touch() + (path_tmp_dir / "incorrect.jpeg").touch() (subfolder / "correct.txt").touch() def _glob_path() -> list[str]: diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index c609bc1b8..2c4dab3f8 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -308,7 +308,7 @@ def test_free_kv_cache_block_queue_append_n(): # Create an empty FreeKVCacheBlockQueue invalid_queue = FreeKVCacheBlockQueue([]) - # set prev_free_block to None and this will cause assertation in append_n + # set prev_free_block to None and this will cause assertion in append_n invalid_queue.fake_free_list_tail.prev_free_block = None with pytest.raises(AssertionError): # Append 1 block diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 182ed0f27..28355eb54 100644 --- 
a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -2304,22 +2304,22 @@ def test_block_lookup_cache_single_block_per_key(): assert cache.get_one_block(key0) is block0 assert cache.get_one_block(key1) is block1 assert cache.get_one_block(key2) is None - # No block poped due to block_id mismatch + # No block popped due to block_id mismatch assert cache.pop(key0, 100) is None assert cache.get_one_block(key0) is block0 assert cache.get_one_block(key1) is block1 assert cache.get_one_block(key2) is None - # block poped with (key0, block ID 0) + # block popped with (key0, block ID 0) assert cache.pop(key0, 0) is block0 assert cache.get_one_block(key0) is None assert cache.get_one_block(key1) is block1 assert cache.get_one_block(key2) is None - # No block poped due to block_id mismatch + # No block popped due to block_id mismatch assert cache.pop(key0, 1) is None assert cache.get_one_block(key0) is None assert cache.get_one_block(key1) is block1 assert cache.get_one_block(key2) is None - # block poped with (key1, block ID 1) + # block popped with (key1, block ID 1) assert cache.pop(key1, 1) is block1 assert cache.get_one_block(key0) is None assert cache.get_one_block(key1) is None diff --git a/tests/v1/core/test_priority_scheduler_random.py b/tests/v1/core/test_priority_scheduler_random.py index 1d03bd104..6fbe0e350 100644 --- a/tests/v1/core/test_priority_scheduler_random.py +++ b/tests/v1/core/test_priority_scheduler_random.py @@ -140,7 +140,7 @@ def _mock_draft_token_ids( return DraftTokenIds(req_ids=request_ids, draft_token_ids=sampled_token_ids) -def _chech_valid_scheduler_output( +def _check_valid_scheduler_output( scheduler_output: SchedulerOutput, seen_request_ids: set[str], seen_mm_hashes: set[str], @@ -242,7 +242,7 @@ def test_priority_scheduling_blast( ) scheduler.add_request(req) scheduler_output = scheduler.schedule() - _chech_valid_scheduler_output( + _check_valid_scheduler_output( scheduler_output, seen_request_ids, 
seen_mm_hashes ) model_output = _mock_execute_model( diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index fdd10182a..24edfadb9 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -1116,7 +1116,7 @@ def _step_until_done( def _step_until_kv_transfer_finished(scheduler: Scheduler, req_ids: list[str]): - """Cycle requests through a KV transfer cyle.""" + """Cycle requests through a KV transfer cycle.""" # Requests should first transition to WAITING_FOR_REMOTE_KVS output = scheduler.schedule() @@ -2714,7 +2714,7 @@ def _assert_right_encoder_inputs( if expected_total_reqs == 0: return - # Number of expected enocder inputs should match number of requests + # Number of expected encoder inputs should match number of requests if expected_encoder_inputs: assert check_exist and requests is not None # only support expect input exist assert len(requests) == len(expected_encoder_inputs) @@ -2964,7 +2964,7 @@ def test_ec_connector_with_partial_cache_hit_multi_round(use_kv_connector): ) scheduler.update_from_output(output, model_output) - # request1 is finished after outputing 1 token + # request1 is finished after outputting 1 token # Finish request scheduler.finish_requests(request1.request_id, RequestStatus.FINISHED_LENGTH_CAPPED) @@ -3060,14 +3060,14 @@ def test_ec_connector_schedule_multiple_requests(cache_exist, use_kv_connector): for request in requests: scheduler.add_request(request) - # Set up to test different encoder cache exsistence scenario after preemption + # Set up to test different encoder cache existence scenario after preemption # Order of getting encoder cache should be: local cache -> connector-> compute scheduler.ec_connector.update_state_after_alloc = Mock( wraps=scheduler.ec_connector.update_state_after_alloc ) if cache_exist == "local": - # Allocate cache to cache manager manually to mimick + # Allocate cache to cache manager manually to mimic for req in requests: 
scheduler.encoder_cache_manager.allocate(req, 0) else: @@ -3384,13 +3384,13 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption( pooler_output=[], ) # Finish the requests to make room for the preempted requests to resume - # req_high is finished after outputing 2 tokens + # req_high is finished after outputting 2 tokens scheduler.update_from_output(output, model_output) scheduler.finish_requests( request_high.request_id, RequestStatus.FINISHED_LENGTH_CAPPED ) - # Set up to test different encoder cache exsistence scenario after preemption + # Set up to test different encoder cache existence scenario after preemption # Order of getting encoder cache should be: local cache -> connector-> compute # By default, the cache should still exist in local in this test case if cache_exist != "local": @@ -3483,7 +3483,7 @@ def test_ec_connector_allocate_encoder_tokens_with_external_load(use_kv_connecto ec_role="ec_consumer", ) - # Limit the number of availiable slots of EncoderCacheManager + # Limit the number of available slots of EncoderCacheManager scheduler.encoder_cache_manager = EncoderCacheManager(cache_size=32) # Create MM request1 @@ -3574,7 +3574,7 @@ def test_ec_connector_allocate_encoder_tokens_with_external_load(use_kv_connecto ) scheduler.update_from_output(output, model_output) - # request1 is finished after outputing 1 token + # request1 is finished after outputting 1 token # Finish request scheduler.finish_requests(request1.request_id, RequestStatus.FINISHED_LENGTH_CAPPED) assert scheduler.get_num_unfinished_requests() == 1 diff --git a/tests/v1/e2e/test_mamba_prefix_cache.py b/tests/v1/e2e/test_mamba_prefix_cache.py index 3ba7651c3..d69088772 100644 --- a/tests/v1/e2e/test_mamba_prefix_cache.py +++ b/tests/v1/e2e/test_mamba_prefix_cache.py @@ -76,11 +76,11 @@ def get_fake_sample_fn() -> SamplerOutput: ), logprobs_tensors=None, ) - accpeted_tokens = prompt_token_ids[ + accepted_tokens = prompt_token_ids[ first_token_id_index : 
first_token_id_index + min(num_accepted_tokens, logits.shape[0]) ] - sampled_token_ids = accpeted_tokens + sampled_token_ids = accepted_tokens return SamplerOutput( sampled_token_ids=torch.tensor( [sampled_token_ids], device="cuda", dtype=torch.int32 diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index aa084eee8..70c6d250b 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -911,7 +911,7 @@ def test_structured_output_with_structural_tag(backend: str): ), ) - prompt = "Hello and repete hello 10 times, do not say anything else. Only say hello hello hello, now start" + prompt = "Hello and repeat hello 10 times, do not say anything else. Only say hello hello hello, now start" outputs = llm.generate(prompt, sampling_params=sampling_params, use_tqdm=True) assert outputs is not None for output in outputs: diff --git a/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py b/tests/v1/kv_connector/unit/test_kv_connector_lifecycle.py similarity index 100% rename from tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py rename to tests/v1/kv_connector/unit/test_kv_connector_lifecycle.py diff --git a/tests/v1/kv_connector/unit/test_moriio_connector.py b/tests/v1/kv_connector/unit/test_moriio_connector.py index 17d951b91..7aa824609 100644 --- a/tests/v1/kv_connector/unit/test_moriio_connector.py +++ b/tests/v1/kv_connector/unit/test_moriio_connector.py @@ -99,7 +99,7 @@ def _setup_kv_transfer_request(request, remote_host="127.0.0.1", fake_port=4789) return request -class FakeMorIIOWrapper: +class FakeMoRIIOWrapper: # A fake MoRIIOWrapper for testing purposes def __init__(self, *args, **kwargs): pass @@ -168,7 +168,7 @@ class FakeMorIIOWrapper: pass -class FakeMorIIOConnectorWorker(MoRIIOConnectorWorker): +class FakeMoRIIOConnectorWorker(MoRIIOConnectorWorker): # Define a fake remote engine id for testing 
REMOTE_ENGINE_ID = "remote_engine" @@ -373,7 +373,7 @@ def test_read_mode_loads_remote_block_ids(moriio_read_mode): # Set remote block ids to be fetched. request.kv_transfer_params["remote_block_ids"] = block_list - # Remote Prefill, triggers MorIIOConnectorMetadata. + # Remote Prefill, triggers MoRIIOConnectorMetadata. scheduler_output = scheduler.schedule() kv_connector_metadata = scheduler_output.kv_connector_metadata @@ -451,7 +451,7 @@ def test_register_kv_caches(mock_parallel_groups): with set_current_vllm_config(vllm_config): connector = MoRIIOConnector(vllm_config, KVConnectorRole.WORKER) - connector.connector_worker = FakeMorIIOConnectorWorker( + connector.connector_worker = FakeMoRIIOConnectorWorker( vllm_config, connector.engine_id, hand_shake_latency=0 ) @@ -528,7 +528,7 @@ def test_moriio_handshake_returns_metadata(mock_parallel_groups): with ( patch( "vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_engine.MoRIIOWrapper", - FakeMorIIOWrapper, + FakeMoRIIOWrapper, ), ): handshake_port = _find_free_port() diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 1975d2226..15ca74db3 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -460,9 +460,9 @@ class FakeNixlConnectorWorker(NixlConnectorWorker): # When remote tp_size > local tp_size, handshake with multiple # remote ranks. - num_hanshakes = 1 if tp_ratio > 0 else -tp_ratio + num_handshakes = 1 if tp_ratio > 0 else -tp_ratio remote_agents: dict[int, str] = {} - for remote_tp_rank in range(num_hanshakes): + for remote_tp_rank in range(num_handshakes): remote_agent_name = self.add_remote_agent( NixlAgentMetadata( engine_id=self.REMOTE_ENGINE_ID, @@ -688,7 +688,7 @@ class TestNixlHandshake: ) check_handshake(2) - # NOTE flexiblity: a second remote with higher number of ranks is + # NOTE flexibility: a second remote with higher number of ranks is # discovered. 
This is not a scenario we actively support right now, but # the connector allows it. worker.REMOTE_ENGINE_ID = "remote_engine_2" @@ -1766,7 +1766,7 @@ def test_aborted_request_removed_from_worker_in_batch(default_vllm_config, dist_ req = create_request(request_id=1, do_remote_decode=True, max_tokens=1) scheduler.add_request(req) - # First scheduling pass - examinate build_connector_meta output + # First scheduling pass - examine build_connector_meta output sched_out = scheduler.schedule() kv_meta = sched_out.kv_connector_metadata assert kv_meta is not None diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 3a83f835c..df2fac85e 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -36,7 +36,7 @@ SAMPLE_PROMPT = BatchLogprobsComposition.SAMPLE_PROMPT # non-associative and sensitive to batch geometry. The ref LLM (no spec # decode, default scheduling) and the spec-decode LLM (chunked prefill, # different effective batch sizes) follow different reduction orders, -# producing numerically divergent logprobs that get mis-attributed to +# producing numerically divergent logprobs that get misattributed to # spec-decode incorrectness. 
# # Force LLM instances into an identical, deterministic execution diff --git a/tests/v1/sample/test_rejection_sampler.py b/tests/v1/sample/test_rejection_sampler.py index 38ffc58e2..552a27fe2 100644 --- a/tests/v1/sample/test_rejection_sampler.py +++ b/tests/v1/sample/test_rejection_sampler.py @@ -726,7 +726,7 @@ def test_frequency_penalties(rejection_sampler): spec_tokens = [[1, 1, 1], [], [1, 1, 1]] output_tokens = [[1, 1, 1, 1], [7], [1, 1, 1, 1]] # 1, 7 and 1 are the bonus tokens - num_requsts = len(spec_tokens) + num_requests = len(spec_tokens) logits = create_logits_tensor(output_tokens, token_idx_to_override=15) metadata = create_sampling_metadata( all_greedy=True, @@ -734,8 +734,8 @@ def test_frequency_penalties(rejection_sampler): spec_token_ids=spec_tokens, prompt_token_ids=torch.tensor([[5, 6, 7], [6, 7, 8], [7, 8, 9]], device=DEVICE), frequency_penalties=[1.5, 1.5, 0.7], - presence_penalties=[0.0] * num_requsts, - repetition_penalties=[1.0] * num_requsts, + presence_penalties=[0.0] * num_requests, + repetition_penalties=[1.0] * num_requests, ) bonus_token_tensor = torch.tensor( [output_tokens[i][-1] for i in range(len(output_tokens))], device=logits.device diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 45e016d1a..e03a4c149 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -3106,7 +3106,7 @@ def cpu_attn_get_scheduler_metadata( isa: str, enable_kv_split: bool, ) -> torch.Tensor: - sheduler_metadata = torch.ops._C.get_scheduler_metadata( + scheduler_metadata = torch.ops._C.get_scheduler_metadata( num_reqs, num_heads, num_kv_heads, @@ -3119,7 +3119,7 @@ def cpu_attn_get_scheduler_metadata( isa, enable_kv_split, ) - return sheduler_metadata + return scheduler_metadata def cpu_attn_reshape_and_cache( diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 8f3808166..c46460959 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -872,7 +872,7 @@ class CompilationConfig: ) # Currently only 
eager and inductor backend are supported. - # for piecewise compilation. Custom backends are not suppported for + # for piecewise compilation. Custom backends are not supported for # piecewise compilation. Update when more backends are supported. if self.mode == CompilationMode.VLLM_COMPILE and self.backend not in [ "", diff --git a/vllm/config/observability.py b/vllm/config/observability.py index 7293cf11c..84e83c6d4 100644 --- a/vllm/config/observability.py +++ b/vllm/config/observability.py @@ -59,7 +59,7 @@ class ObservabilityConfig: enable_layerwise_nvtx_tracing: bool = False """Enable layerwise NVTX tracing. This traces the execution of each layer or - module in the model and attach informations such as input/output shapes to + module in the model and attach information such as input/output shapes to nvtx range markers. Noted that this doesn't work with CUDA graphs enabled.""" enable_mfu_metrics: bool = False diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index fd5e3b464..4df1015c0 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -592,7 +592,7 @@ class VllmConfig: If the user configuration does not specify a value for a default field and if the default field is still None after all user selections are - applied, then default values will be applied to the field. User speciied + applied, then default values will be applied to the field. User specified fields will not be overridden by the default. 
Args: diff --git a/vllm/distributed/eplb/policy/default.py b/vllm/distributed/eplb/policy/default.py index b9cfcae01..1154f98ec 100644 --- a/vllm/distributed/eplb/policy/default.py +++ b/vllm/distributed/eplb/policy/default.py @@ -44,7 +44,7 @@ class DefaultEplbPolicy(AbstractEplbPolicy): rank_in_pack = np.zeros_like(pack_index, dtype=np.int64) return pack_index, rank_in_pack - # Sort and get indices in decending order + # Sort and get indices in descending order indices = np.argsort(-weight, axis=-1) pack_index = np.full((num_layers, num_groups), -1, dtype=np.int64) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index c0968272f..3d9027adf 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -129,7 +129,7 @@ class KVConnectorRole(enum.Enum): class KVConnectorHandshakeMetadata(ABC): # noqa: B024 """ Metadata used for out of band connector handshake between - P/D workers. This needs to serializeable. + P/D workers. This needs to be serializable. 
""" pass diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py index ee475e16a..51af1958b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py @@ -398,7 +398,7 @@ class ReqMeta: ) -def need_gpu_interm_buffer(lmcache_config: LMCacheEngineConfig): +def need_gpu_interim_buffer(lmcache_config: LMCacheEngineConfig): return not lmcache_config.enable_pd @@ -497,7 +497,7 @@ def _init_lmcache_engine( use_mla, ) - use_gpu = need_gpu_interm_buffer(lmcache_config) + use_gpu = need_gpu_interim_buffer(lmcache_config) vllm_gpu_connector: ( VLLMBufferLayerwiseGPUConnector | VLLMPagedMemGPUConnectorV2 diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/mooncake/mooncake_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake/mooncake_connector.py index f105d3492..d986f6866 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/mooncake/mooncake_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake/mooncake_connector.py @@ -481,7 +481,7 @@ class MooncakeConnectorWorker: ) self._remote_agents: dict[EngineId, dict[int, dict[int, str]]] = {} - self._pending_bootstrap_querys: dict[str, asyncio.Event] = {} + self._pending_bootstrap_queries: dict[str, asyncio.Event] = {} self.side_channel_port: int = 0 # we will bind it in register_kv_caches() self.engine_id: EngineId = engine_id self.tp_rank = get_tensor_model_parallel_rank() @@ -1077,7 +1077,7 @@ class MooncakeConnectorWorker: response = self._xfer_resp_decoder.decode(ret_msg) if response.status == MooncakeXferResponseStatus.ERROR: logger.error( - "Error happens during tranfering kvcache for %s: %s", + "Error happens during transferring kvcache for %s: %s", req_ids, response.err_msg, ) @@ -1140,8 +1140,8 @@ class MooncakeConnectorWorker: 
) # Always notify others regardless of connection success or failure. - self._pending_bootstrap_querys[remote_bootstrap_addr].set() - del self._pending_bootstrap_querys[remote_bootstrap_addr] + self._pending_bootstrap_queries[remote_bootstrap_addr].set() + del self._pending_bootstrap_queries[remote_bootstrap_addr] def receive_kv( self, @@ -1171,11 +1171,11 @@ class MooncakeConnectorWorker: pull_metas: dict[ReqId, PullReqMeta], ): remote_bootstrap_addr = next(iter(pull_metas.values())).remote_bootstrap_addr - if remote_bootstrap_addr not in self._pending_bootstrap_querys: - self._pending_bootstrap_querys[remote_bootstrap_addr] = asyncio.Event() + if remote_bootstrap_addr not in self._pending_bootstrap_queries: + self._pending_bootstrap_queries[remote_bootstrap_addr] = asyncio.Event() await self._connect_to_prefiller_bootstrap(remote_bootstrap_addr) else: - await self._pending_bootstrap_querys[remote_bootstrap_addr].wait() + await self._pending_bootstrap_queries[remote_bootstrap_addr].wait() if remote_engine_id not in self._remote_agents: logger.error( diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py index fd99c1a74..0c467fa14 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py @@ -720,7 +720,7 @@ class OffloadPromMetrics(KVConnectorPromMetrics): per_engine_labelvalues: dict[int, list[object]], ): super().__init__(vllm_config, metric_types, labelnames, per_engine_labelvalues) - # (engine_idx, transfer_tupe) -> (metric with bounded labels) + # (engine_idx, transfer_type) -> (metric with bounded labels) self.histogram_transfer_size: dict[tuple[int, str], PromMetricT] = {} self.counter_kv_bytes: dict[tuple[int, str], PromMetricT] = {} self.counter_kv_transfer_time: dict[tuple[int, str], PromMetricT] = {} diff --git a/vllm/entrypoints/openai/responses/serving.py 
b/vllm/entrypoints/openai/responses/serving.py index 03a926d9e..25438a8f2 100644 --- a/vllm/entrypoints/openai/responses/serving.py +++ b/vllm/entrypoints/openai/responses/serving.py @@ -1647,9 +1647,9 @@ class OpenAIServingResponses(OpenAIServing): # TODO: in streaming, we noticed this bug: # https://github.com/vllm-project/vllm/issues/25697 await self._initialize_tool_sessions(request, context, exit_stack) - processer = self._process_harmony_streaming_events + processor = self._process_harmony_streaming_events else: - processer = self._process_simple_streaming_events + processor = self._process_simple_streaming_events # TODO Hanchen make sampling params to include the structural tag initial_response = ResponsesResponse.from_request( @@ -1677,7 +1677,7 @@ class OpenAIServingResponses(OpenAIServing): ) try: - async for event_data in processer( + async for event_data in processor( request, sampling_params, result_generator, diff --git a/vllm/envs.py b/vllm/envs.py index 598545d23..66ddd7918 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1520,7 +1520,7 @@ environment_variables: dict[str, Callable[[], Any]] = { os.getenv("VLLM_DEEPEP_BUFFER_SIZE_MB", "1024") ), # Force DeepEP to use intranode kernel for inter-node communication in - # high throughput mode. This is useful archive higher prefill throuhgput + # high throughput mode. This is useful to achieve higher prefill throughput # on system supports multi-node nvlink (e.g GB200). "VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE": lambda: bool( int(os.getenv("VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE", "0")) diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 15e3263ba..bf0f9da6e 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -175,7 +175,7 @@ class DPMetadata: # Get the cumulative tokens across sequence parallel ranks. # In this case the input to the MoEs will be distributed w.r.t both # DP and TP rank. - # When sp_size==1, this is just the cummulative num tokens across DP. 
+ # When sp_size==1, this is just the cumulative num tokens across DP. def cu_tokens_across_sp(self, sp_size: int) -> torch.Tensor: num_tokens_across_sp_cpu = ( self.num_tokens_across_dp_cpu - 1 + sp_size diff --git a/vllm/lora/layers/row_parallel_linear.py b/vllm/lora/layers/row_parallel_linear.py index 958aa6af3..8de5822db 100644 --- a/vllm/lora/layers/row_parallel_linear.py +++ b/vllm/lora/layers/row_parallel_linear.py @@ -57,10 +57,10 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA): input_parallel = input_ else: # TODO: simplify code below - splitted_input = split_tensor_along_last_dim( + split_input = split_tensor_along_last_dim( input_, num_partitions=self.tp_size ) - input_parallel = splitted_input[self.tp_rank].contiguous() + input_parallel = split_input[self.tp_rank].contiguous() # Matrix multiply. bias_ = ( diff --git a/vllm/lora/lora_model.py b/vllm/lora/lora_model.py index e9e0a711a..7c1dd39bb 100644 --- a/vllm/lora/lora_model.py +++ b/vllm/lora/lora_model.py @@ -11,7 +11,7 @@ from vllm.lora.lora_weights import LoRALayerWeights from vllm.lora.peft_helper import PEFTHelper from vllm.lora.utils import ( get_lora_id, - is_base_embeddding_weights, + is_base_embedding_weights, parse_fine_tuned_lora_name, ) from vllm.model_executor.model_loader.tensorizer import TensorizerConfig @@ -86,7 +86,7 @@ class LoRAModel: pin_memory = str(device) == "cpu" and is_pin_memory_available() loras: dict[str, LoRALayerWeights] = {} for tensor_name, tensor in tensors.items(): - if is_base_embeddding_weights(tensor_name): + if is_base_embedding_weights(tensor_name): continue # Skip modules based on model-defined prefixes (e.g., MTP layers) if skip_prefixes and cls._should_skip_module(tensor_name, skip_prefixes): @@ -162,7 +162,7 @@ class LoRAModel: def check_unexpected_modules(modules: dict): for lora_module in modules.keys(): # noqa - if is_base_embeddding_weights(lora_module): + if is_base_embedding_weights(lora_module): continue # Handle PEFT file format where 
experts.base_layer is the # gate_up_proj and experts is the down_proj diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 9b23d7e0c..6fef61dba 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -193,7 +193,7 @@ def parse_fine_tuned_lora_name( raise ValueError(f"{name} is unsupported LoRA weight") -def is_base_embeddding_weights(name: str) -> bool: +def is_base_embedding_weights(name: str) -> bool: # hardcoded subfixes for input & output embedding weights embedding_suffixes = ( ".embed_tokens.base_layer.weight", diff --git a/vllm/model_executor/kernels/linear/mixed_precision/cpu.py b/vllm/model_executor/kernels/linear/mixed_precision/cpu.py index 5a9d7c372..d5ca625f0 100644 --- a/vllm/model_executor/kernels/linear/mixed_precision/cpu.py +++ b/vllm/model_executor/kernels/linear/mixed_precision/cpu.py @@ -82,7 +82,7 @@ class CPUWNA16LinearKernel(MPLinearKernel): weight = weight.permute(0, 2, 1).reshape(input_size, output_size).contiguous() weight = pack_quantized_values_into_int32(weight, self.config.weight_type, 1) # make 16 output channel as a block and transpose to the make - # the block contigous + # the block contiguous weight = ( weight.view(input_size, -1, 16 // pack_factor) .permute(1, 0, 2) diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py index 25bc57de6..926e8892e 100644 --- a/vllm/model_executor/layers/attention/mla_attention.py +++ b/vllm/model_executor/layers/attention/mla_attention.py @@ -2540,7 +2540,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): ) # workspace # |------- N tokens --------|--------- N*dcp_size tokens ----------| - # |<- use for loca_gather ->|<--------- use for allgather -------->| + # |<- use for local_gather ->|<--------- use for allgather -------->| allgather_offset = workspace.shape[0] // (dcp_world_size + 1) assert allgather_offset * (dcp_world_size + 1) == workspace.shape[0] assert toks <= allgather_offset diff --git 
a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py index 02c31fd39..4ee2aab25 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py @@ -394,5 +394,5 @@ class FlashInferExperts(mk.FusedMoEExpertsModular): def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None: # No support for LoRA in flashinfer_cutlass_fused_moe. - # See TODOs in flashinfer functions runMoe and runMoeMinLantency. + # See TODOs in flashinfer functions runMoe and runMoeMinLatency. raise NotImplementedError("LoRA is not supported for flashinfer_cutlass_moe") diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 280d09079..5370b9e28 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -409,7 +409,7 @@ def batched_fused_marlin_moe( Note that the moe_align_block_size function indicates, - What rows of the A matrix (hidden_states) to access during the matmul, via sorted_ids output. - - What expert_id to use for each block matmul, via expert_ids ouptut. + - What expert_id to use for each block matmul, via expert_ids output. In the batched version, the tokens are already grouped/batched by experts they subscribe to. Due to this, we can represent the batched hidden_states diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 7b49282fd..1f495169b 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -606,7 +606,7 @@ class FusedMoEExperts(ABC): """ Whether the kernel supports deployment in particular parallel config. 
- Can be overriden if a kernel does not support EP, SP or some other + Can be overridden if a kernel does not support EP, SP or some other configuration. """ raise NotImplementedError @@ -620,7 +620,7 @@ class FusedMoEExperts(ABC): """ Whether the kernel supports a routing method (e.g. GroupedTopK). - Can be overriden by monolithic kernels that execute the router + Can be overridden by monolithic kernels that execute the router in addition to the experts if certain routers are not supported. """ return True @@ -633,7 +633,7 @@ class FusedMoEExperts(ABC): """ Whether a kernel supports a particular dtype for router logits input. - Can be overriden by monolithic kernels that execute the router + Can be overridden by monolithic kernels that execute the router in addition to the experts if certain dtypes are not supported. """ return True diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index dfe180883..3d0430c31 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -1502,10 +1502,10 @@ class RowParallelLinear(LinearBase): if self.input_is_parallel: input_parallel = input_ else: - splitted_input = split_tensor_along_last_dim( + split_input = split_tensor_along_last_dim( input_, num_partitions=self.tp_size ) - input_parallel = splitted_input[self.tp_rank].contiguous() + input_parallel = split_input[self.tp_rank].contiguous() # Matrix multiply. assert self.quant_method is not None diff --git a/vllm/model_executor/layers/mla.py b/vllm/model_executor/layers/mla.py index d0701b6d1..1d3e987b7 100644 --- a/vllm/model_executor/layers/mla.py +++ b/vllm/model_executor/layers/mla.py @@ -35,7 +35,7 @@ class MultiHeadLatentAttentionWrapper(PluggableLayer): """Pluggable MLA layer which allows OOT backends to add custom implementations of the outer MLA layer (including rope & o_proj). 
Note that currently oot platforms can still use CustomOp.register_oot to - replace MLA layer entirly, although we use PluggableLayer to register + replace MLA layer entirely, although we use PluggableLayer to register this layer now. This class takes positions and hidden_states as input. diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 00a17596a..4fcc468c6 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -191,7 +191,7 @@ class CompressedTensorsConfig(QuantizationConfig): """ Helper function to update target_scheme_map since linear layers get fused into FusedMoE - targetting 'Linear' needs to also match + targeting 'Linear' needs to also match FusedMoE modules. """ if ( diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index f6c0009a5..f3ed9a628 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -2445,7 +2445,7 @@ class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod): w2_scale=layer.w2_weight_scale, # group scale g1_alphas=layer.w13_weight_chan_scale, g2_alphas=layer.w2_weight_chan_scale, - per_act_token_quant=True, # always use dynamc per-token + per_act_token_quant=True, # always use dynamic per-token per_out_ch_quant=True, # always use per-channel ) diff --git a/vllm/model_executor/layers/quantization/cpu_wna16.py b/vllm/model_executor/layers/quantization/cpu_wna16.py index 406b86ab2..21e59a6f1 100644 --- a/vllm/model_executor/layers/quantization/cpu_wna16.py +++ 
b/vllm/model_executor/layers/quantization/cpu_wna16.py @@ -261,7 +261,7 @@ class CPUAWQLinearMethod(LinearMethodBase): zeros = pack_cols(zeros, bits, group_num, output_size).contiguous() # make 16 output channel as a block and transpose to - # the make the block contigous + # make the block contiguous weight = pack_cols(weight, bits, input_size, output_size) weight = ( weight.view(input_size, -1, 16 // pack_factor) diff --git a/vllm/model_executor/layers/quantization/torchao.py b/vllm/model_executor/layers/quantization/torchao.py index f195efbbc..3c6fdf043 100644 --- a/vllm/model_executor/layers/quantization/torchao.py +++ b/vllm/model_executor/layers/quantization/torchao.py @@ -199,7 +199,7 @@ class TorchAOConfig(QuantizationConfig): @classmethod def from_config_dict_json(cls, config_dict_json: str) -> "TorchAOConfig": - """Iniitalize class from a config_dict json string, got from + """Initialize class from a config_dict json string, got from torchao_config_object = some AOBaseConfig object json.dumps(config_to_dict(torchao_config_object)) """ diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 41d44e0c4..78b123402 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -255,7 +255,7 @@ def _flashinfer_fp8_blockscale_gemm_impl( This batch-size-dependent selection is essential for maintaining model accuracy. Benchmarks on GSM8K show a significant accuracy gap (88% vs 95%) for DeepSeek-V3.1 - when using FlashInfer's DeepGEMM on M>=32. The M < 32 strategy fixes the accurracy + when using FlashInfer's DeepGEMM on M>=32. The M < 32 strategy fixes the accuracy drop. 
Args: diff --git a/vllm/model_executor/layers/quantization/utils/machete_utils.py b/vllm/model_executor/layers/quantization/utils/machete_utils.py index ccfcdac1e..95d8102ea 100644 --- a/vllm/model_executor/layers/quantization/utils/machete_utils.py +++ b/vllm/model_executor/layers/quantization/utils/machete_utils.py @@ -39,7 +39,7 @@ def query_machete_supported_group_sizes(act_type: torch.dtype) -> list[int]: def check_machete_supports_shape( - in_features: int, out_featrues: int + in_features: int, out_features: int ) -> tuple[bool, str | None]: if in_features % MACHETE_PREPACKED_BLOCK_SHAPE[0] != 0: return ( @@ -47,7 +47,7 @@ def check_machete_supports_shape( "Input features size must be divisible by " f"{MACHETE_PREPACKED_BLOCK_SHAPE[0]}", ) - if out_featrues % MACHETE_PREPACKED_BLOCK_SHAPE[1] != 0: + if out_features % MACHETE_PREPACKED_BLOCK_SHAPE[1] != 0: return ( False, "Output features size must be divisible by " diff --git a/vllm/model_executor/layers/rotary_embedding/common.py b/vllm/model_executor/layers/rotary_embedding/common.py index 2cca86b05..e0576ee8e 100644 --- a/vllm/model_executor/layers/rotary_embedding/common.py +++ b/vllm/model_executor/layers/rotary_embedding/common.py @@ -237,7 +237,7 @@ class ApplyRotaryEmb(CustomOp): Arguments of apply_rotary_emb() in vllm_flash_attn: x: [batch_size, seq_len, nheads, headdim] cos, sin: [seqlen_rotary, rotary_dim / 2] - interleaved: defalut as False (Neox-style). + interleaved: default as False (Neox-style). ... """ interleaved = not self.is_neox_style @@ -259,7 +259,7 @@ class ApplyRotaryEmb(CustomOp): Arguments of apply_rotary() in flash_attn: x: [batch_size, seq_len, nheads, headdim] cos, sin: [seqlen_rotary, rotary_dim / 2] - interleaved: defalut as False (Neox-style). + interleaved: default as False (Neox-style). ... 
""" interleaved = not self.is_neox_style diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py index 376de71ad..418fdcfa0 100644 --- a/vllm/model_executor/models/ernie45_vl_moe.py +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -342,7 +342,7 @@ class Ernie4_5_VLMoeMoE(nn.Module): visual_token_mask = visual_token_mask.repeat(1, self.hidden_size).bool() text_token_mask = ~visual_token_mask final_experts_hidden_states = torch.zeros_like(hidden_states) - final_shared_ouput = ( + final_shared_output = ( torch.zeros_like(hidden_states) if self.has_shared_experts else None ) @@ -356,26 +356,26 @@ class Ernie4_5_VLMoeMoE(nn.Module): text_router_logits, _ = self.text_experts_gate( text_hidden_states.to(dtype=torch.float32) ) - text_shared_ouput, text_experts_output = self.text_experts( + text_shared_output, text_experts_output = self.text_experts( hidden_states=text_hidden_states, router_logits=text_router_logits ) final_experts_hidden_states[text_token_mask] = text_experts_output.flatten() if self.has_shared_experts: - final_shared_ouput[text_token_mask] = text_shared_ouput.flatten() + final_shared_output[text_token_mask] = text_shared_output.flatten() vision_router_logits, _ = self.vision_experts_gate( vision_hidden_states.to(dtype=torch.float32) ) - vision_shared_ouput, vision_experts_output = self.vision_experts( + vision_shared_output, vision_experts_output = self.vision_experts( hidden_states=vision_hidden_states, router_logits=vision_router_logits ) final_experts_hidden_states[visual_token_mask] = ( vision_experts_output.flatten() ) if self.has_shared_experts: - final_shared_ouput[visual_token_mask] = vision_shared_ouput.flatten() + final_shared_output[visual_token_mask] = vision_shared_output.flatten() - final_hidden_states = (final_shared_ouput, final_experts_hidden_states) + final_hidden_states = (final_shared_output, final_experts_hidden_states) else: # only text modal input text_router_logits, _ = 
self.text_experts_gate( diff --git a/vllm/model_executor/models/fireredasr2.py b/vllm/model_executor/models/fireredasr2.py index f0d3e124c..981c65472 100644 --- a/vllm/model_executor/models/fireredasr2.py +++ b/vllm/model_executor/models/fireredasr2.py @@ -107,7 +107,7 @@ class Conv2dSubsampling(nn.Module): ) self.subsampling = 4 - left_context = right_context = 3 # both exclude currect frame + left_context = right_context = 3 # both exclude current frame self.context = left_context + 1 + right_context # 7 def forward( diff --git a/vllm/model_executor/models/funasr.py b/vllm/model_executor/models/funasr.py index de2e4409e..fd4e2c06d 100644 --- a/vllm/model_executor/models/funasr.py +++ b/vllm/model_executor/models/funasr.py @@ -115,7 +115,7 @@ class EncoderLayerSANM(nn.Module): hidden_states: torch.Tensor, mask: torch.Tensor | None = None, cache=None, - mask_shfit_chunk=None, + mask_shift_chunk=None, mask_att_chunk_encoder=None, ): residual = hidden_states @@ -125,14 +125,14 @@ class EncoderLayerSANM(nn.Module): hidden_states = residual + self.self_attn( hidden_states, mask, - mask_shfit_chunk=mask_shfit_chunk, + mask_shift_chunk=mask_shift_chunk, mask_att_chunk_encoder=mask_att_chunk_encoder, ) else: hidden_states = self.self_attn( hidden_states, mask, - mask_shfit_chunk=mask_shfit_chunk, + mask_shift_chunk=mask_shift_chunk, mask_att_chunk_encoder=mask_att_chunk_encoder, ) @@ -140,7 +140,7 @@ class EncoderLayerSANM(nn.Module): hidden_states = self.norm2(hidden_states) hidden_states = residual + self.feed_forward(hidden_states) - return hidden_states, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder + return hidden_states, mask, cache, mask_shift_chunk, mask_att_chunk_encoder class MultiHeadedAttentionSANM(nn.Module): @@ -183,13 +183,13 @@ class MultiHeadedAttentionSANM(nn.Module): self, inputs: torch.Tensor, mask: torch.Tensor, - mask_shfit_chunk: torch.Tensor = None, + mask_shift_chunk: torch.Tensor = None, ): b, t, d = inputs.size() if mask is not None: 
mask = torch.reshape(mask, (b, -1, 1)) - if mask_shfit_chunk is not None: - mask = mask * mask_shfit_chunk + if mask_shift_chunk is not None: + mask = mask * mask_shift_chunk inputs = inputs * mask x = inputs.transpose(1, 2) @@ -243,11 +243,11 @@ class MultiHeadedAttentionSANM(nn.Module): self, hidden_states: torch.Tensor, mask: torch.Tensor, - mask_shfit_chunk: torch.Tensor = None, + mask_shift_chunk: torch.Tensor = None, mask_att_chunk_encoder: torch.Tensor = None, ): q_h, k_h, v_h, v = self.forward_qkv(hidden_states) - fsmn_memory = self.forward_fsmn(v, mask, mask_shfit_chunk) + fsmn_memory = self.forward_fsmn(v, mask, mask_shift_chunk) q_h = q_h * self.d_k ** (-0.5) scores = torch.matmul(q_h, k_h.transpose(-2, -1)) att_outs = self.forward_attention(v_h, scores, mask, mask_att_chunk_encoder) diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index 6d8b45a7a..b9655a08c 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -646,7 +646,7 @@ class IsaacImageProcessor: return_tensors: str | TensorType | None, **kwargs: Unpack[IsaacImageProcessorKwargs], ) -> BatchFeature: - """Preprocess images into format compatibile with vLLM input processing.""" + """Preprocess images into format compatible with vLLM input processing.""" all_pixel_values: list[torch.Tensor] = [] all_image_grids: list[torch.Tensor] = [] diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 4c43e413f..5e062fa74 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -299,7 +299,7 @@ class KeyeVisionEmbeddings(nn.Module): ) ( batch_size, - squence_len, + sequence_len, channel, height, width, diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py index c90cc2d39..a9e2c2268 100644 --- a/vllm/model_executor/models/longcat_flash.py +++ b/vllm/model_executor/models/longcat_flash.py @@ -238,7 +238,7 @@ class 
LongcatRouter(nn.Module): self, config: FlashConfig, zero_expert_num: int, - rounter_params_dtype: torch.dtype, + router_params_dtype: torch.dtype, prefix: str = "", ): super().__init__() @@ -252,12 +252,12 @@ class LongcatRouter(nn.Module): config.hidden_size, self.n_routed_experts, bias=config.router_bias, - params_dtype=rounter_params_dtype, + params_dtype=router_params_dtype, quant_config=None, prefix=f"{prefix}.classifier", ) self.e_score_correction_bias = nn.Parameter( - torch.zeros((self.n_routed_experts), dtype=rounter_params_dtype) + torch.zeros((self.n_routed_experts), dtype=router_params_dtype) ) def forward(self, hidden_states): @@ -281,14 +281,14 @@ class LongcatMoe(nn.Module): super().__init__() self.hidden_size = hidden_size # Gate always runs at half / full precision for now. - self.rounter_params_dtype = params_dtype + self.router_params_dtype = params_dtype if config.router_dtype == "float32": - self.rounter_params_dtype = torch.float32 + self.router_params_dtype = torch.float32 self.router = LongcatRouter( config=config, zero_expert_num=config.zero_expert_num, - rounter_params_dtype=self.rounter_params_dtype, + router_params_dtype=self.router_params_dtype, prefix=f"{prefix}.gate", ) @@ -309,7 +309,7 @@ class LongcatMoe(nn.Module): prefix=f"{prefix}.experts", enable_eplb=enable_eplb, routed_scaling_factor=config.routed_scaling_factor, - router_logits_dtype=self.rounter_params_dtype, + router_logits_dtype=self.router_params_dtype, ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -329,7 +329,7 @@ class LongcatMoe(nn.Module): hidden_states_padded = hidden_states router_logits_full = self.router( - hidden_states_padded.to(self.rounter_params_dtype) + hidden_states_padded.to(self.router_params_dtype) ) # ZeroExpertFusedMoE handles routing memoization and zero expert computation diff --git a/vllm/model_executor/models/molmo2.py b/vllm/model_executor/models/molmo2.py index b2e91616a..d8f3cf571 100644 --- 
a/vllm/model_executor/models/molmo2.py +++ b/vllm/model_executor/models/molmo2.py @@ -1321,14 +1321,14 @@ def get_image_size(image: ImageInput) -> ImageSize: raise ValueError(f"Unknown image type: {type(image)}") -def exif_tranpose( +def exif_transpose( images: ImageInput | None, ) -> ImageInput | None: if images is None: return None if images is not None and isinstance(images, (list, tuple)): images = [ - exif_tranpose(img) if isinstance(img, Image) else img for img in images + exif_transpose(img) if isinstance(img, Image) else img for img in images ] elif images is not None and isinstance(images, Image): images = ImageOps.exif_transpose(images) @@ -1667,7 +1667,7 @@ class Molmo2ProcessorWrapper: **kwargs: object, ) -> BatchFeature: inputs = [text] - images = exif_tranpose(images) + images = exif_transpose(images) if getattr(self.processor, "image_processor", None) is not None: inputs.append(images) if getattr(self.processor, "video_processor", None) is not None: @@ -2352,7 +2352,7 @@ class Molmo2MultiModalProcessor(BaseMultiModalProcessor[Molmo2ProcessingInfo]): def get_image_replacement_molmo2(item_idx: int) -> list[int]: images = mm_items.get_items("image", ImageProcessorItems) image = images.get(item_idx) - image = exif_tranpose(image) + image = exif_transpose(image) resize_nrows, resize_cols = processor.get_base_grid_size(is_video=False) if use_single_crop_col_tokens is not None: diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index 39ea0ea48..859e34a10 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -349,7 +349,7 @@ class NemotronHMoEDecoderLayer(nn.Module): super().__init__() self.config = config - # Get per-layer config for heterogeneous models if exsist + # Get per-layer config for heterogeneous models if exists get_layer_config = getattr(config, "get_nemotron_h_config_for_layer", None) layer_config = get_layer_config(layer_idx) if get_layer_config 
else config @@ -517,7 +517,7 @@ class NemotronHAttentionDecoderLayer(nn.Module): ) -> None: super().__init__() - # Get per-layer config for heterogeneous models if exsist + # Get per-layer config for heterogeneous models if exists get_layer_config = getattr(config, "get_nemotron_h_config_for_layer", None) layer_config = get_layer_config(layer_idx) if get_layer_config else config diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py index 35132e724..74c9f8c22 100644 --- a/vllm/model_executor/models/paddleocr_vl.py +++ b/vllm/model_executor/models/paddleocr_vl.py @@ -486,7 +486,7 @@ class SiglipVisionEmbeddings(nn.Module): ) ( batch_size, - squence_len, + sequence_len, channel, height, width, diff --git a/vllm/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py index 7f0a6f16a..c3b09ed59 100644 --- a/vllm/model_executor/models/phi4mm_audio.py +++ b/vllm/model_executor/models/phi4mm_audio.py @@ -689,19 +689,19 @@ class ConformerEncoder(TransformerEncoderBase): default False. ext_pw_out_channel: int, optional the number of channel for CNN - before depthwise_seperable_CNN. + before depthwise_separable_CNN. If 0 then use linear. default 0. ext_pw_kernel_size: int, optional - kernel size of N before depthwise_seperable_CNN. + kernel size of N before depthwise_separable_CNN. only work for ext_pw_out_channel > 0. default 1 depthwise_seperable_out_channel: int, optional the number of channel for - depthwise_seperable_CNN. + depthwise_separable_CNN. default 256. depthwise_multiplier: int, optional the number of multiplier for - depthwise_seperable_CNN. + depthwise_separable_CNN. default 1. chunk_se: int, optional 0 for offline SE. @@ -711,7 +711,7 @@ class ConformerEncoder(TransformerEncoderBase): by only the current chunk. default 0. kernel_size: int, optional - the number of kernels for depthwise_seperable_CNN. + the number of kernels for depthwise_separable_CNN. default 3. 
activation: str, optional FeedForward block activation. @@ -721,7 +721,7 @@ class ConformerEncoder(TransformerEncoderBase): activation function used in ConvModule part of the conformer, default "relu". conv_glu_type: str, optional - activation used use glu in depthwise_seperable_CNN, + activation used use glu in depthwise_separable_CNN, default "sigmoid" bias_in_glu: bool, optional if set to True, use additive bias in the weight module diff --git a/vllm/model_executor/models/phi4mm_utils.py b/vllm/model_executor/models/phi4mm_utils.py index e9c13b3ee..0965f2816 100644 --- a/vllm/model_executor/models/phi4mm_utils.py +++ b/vllm/model_executor/models/phi4mm_utils.py @@ -217,8 +217,8 @@ class GLUPointWiseConv(nn.Module): return x -class DepthWiseSeperableConv1d(nn.Module): - """DepthWiseSeperableConv1d module used in Convnet module +class DepthWiseSeparableConv1d(nn.Module): + """DepthWiseSeparableConv1d module used in ConvNet module for the conformer, for more details see: https://arxiv.org/pdf/2005.08100v1.pdf @@ -390,7 +390,7 @@ class ConvModule(nn.Module): else: padding = (kernel_size - 1) // 2 - self.dw_sep_conv_1d = DepthWiseSeperableConv1d( + self.dw_sep_conv_1d = DepthWiseSeparableConv1d( input_dim, depthwise_seperable_out_channel, kernel_size, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index aeacd99eb..a8840022a 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -916,7 +916,7 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): self, max_pixels: int | None = None ) -> ImageSize: # NOTE: Simply processing a huge size with _get_vision_info might not give a - # size that maximizes the number of featrues, i.e., the number of (merged) + # size that maximizes the number of features, i.e., the number of (merged) # patches. This is because the number of patches limits the allowed aspect # ratios. For example, suppose the maximum number of patches is 1280. 
A square # image cannot be broken down into 1280 patches, so feeding a giant square image diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index eee1130cc..8e5bd450e 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -459,14 +459,14 @@ class Step3VLProcessor: image_inputs = {} text_inputs = self.tokenizer(text) else: - splitted_images_data = self._split_images(images) + split_images_data = self._split_images(images) pixel_values_lst = [] patch_pixel_values_lst = [] patch_newline_mask_lst = [] image_repl_str_lst = [] image_repl_ids_lst = [] num_patches = [] - for raw_img, img_patches, patch_newline_mask in splitted_images_data: + for raw_img, img_patches, patch_newline_mask in split_images_data: pixel_values_lst.extend(self._convert_images_to_pixel_values([raw_img])) if len(img_patches) > 0: diff --git a/vllm/model_executor/models/step3p5.py b/vllm/model_executor/models/step3p5.py index fcdd770fe..bb4bf14a9 100644 --- a/vllm/model_executor/models/step3p5.py +++ b/vllm/model_executor/models/step3p5.py @@ -353,7 +353,7 @@ class FusedMoEBlock(nn.Module): if swiglu_limit not in (None, 0): swiglu_limit = float(swiglu_limit) assert swiglu_limit == 7.0, ( - "Swiglu limit in fused moe block only suport 7.0 now." + "Swiglu limit in fused moe block only support 7.0 now." ) activation = "swiglustep" logger.debug( diff --git a/vllm/reasoning/ernie45_reasoning_parser.py b/vllm/reasoning/ernie45_reasoning_parser.py index 6ff86488b..3f04876b6 100644 --- a/vllm/reasoning/ernie45_reasoning_parser.py +++ b/vllm/reasoning/ernie45_reasoning_parser.py @@ -18,7 +18,7 @@ logger = init_logger(__name__) class Ernie45ReasoningParser(BaseThinkingReasoningParser): """ Reasoning parser for Ernie45 thinking model. 
- The Ernie45 thinking model ouput format is + The Ernie45 thinking model output format is abc\n\n\n\ndef\n\n or abc\n\ndef """ @@ -73,7 +73,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser): Extract reasoning content from a delta message. Handles streaming output where previous + delta = current. Uses token IDs for faster processing. - The Ernie45 thinking model ouput format is + The Ernie45 thinking model output format is abc\n\n\n\ndef\n\n or abc\n\ndef - 'abc' goes to reasoning @@ -148,7 +148,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser): ) -> tuple[str | None, str | None]: """ Extract reasoning content from the model output. - The Ernie45 thinking model ouput format is + The Ernie45 thinking model output format is abc\n\n\n\n\ndef\n\n or abc\n\ndef - 'abc' goes to reasoning diff --git a/vllm/renderers/hf.py b/vllm/renderers/hf.py index a2c281b9d..191a39926 100644 --- a/vllm/renderers/hf.py +++ b/vllm/renderers/hf.py @@ -564,7 +564,7 @@ def replace_vision_chunk_video_placeholder( mm_data: "MultiModalDataDict", video_placeholder: str | None, ) -> str | list[int]: - # get video placehoder, replace it with runtime video-chunk prompts + # get video placeholder, replace it with runtime video-chunk prompts if video_placeholder and isinstance(prompt_raw, str): video_prompts = build_video_prompts_from_mm_data(mm_data) diff --git a/vllm/renderers/inputs/preprocess.py b/vllm/renderers/inputs/preprocess.py index d40a16fc4..e972d0755 100644 --- a/vllm/renderers/inputs/preprocess.py +++ b/vllm/renderers/inputs/preprocess.py @@ -1,5 +1,5 @@ """ -Schemas and utilites for preprocessing inputs. +Schemas and utilities for preprocessing inputs. """ # SPDX-License-Identifier: Apache-2.0 diff --git a/vllm/renderers/inputs/tokenize.py b/vllm/renderers/inputs/tokenize.py index 3734fac99..4168e2012 100644 --- a/vllm/renderers/inputs/tokenize.py +++ b/vllm/renderers/inputs/tokenize.py @@ -1,5 +1,5 @@ """ -Schemas and utilites for tokenization inputs. 
+Schemas and utilities for tokenization inputs. """ # SPDX-License-Identifier: Apache-2.0 diff --git a/vllm/tokenizers/mistral.py b/vllm/tokenizers/mistral.py index 9ef006c9f..bf460bb79 100644 --- a/vllm/tokenizers/mistral.py +++ b/vllm/tokenizers/mistral.py @@ -169,7 +169,7 @@ def _prepare_apply_chat_template_tools_and_messages( tool.pop(tool_key) logger.warning_once( f"'{tool_key}' is not supported by mistral-common for tools. " - "It has been poped from the tool definition." + "It has been popped from the tool definition." ) if tool["type"] == "function": function_keys = list(tool["function"].keys()) @@ -178,7 +178,7 @@ def _prepare_apply_chat_template_tools_and_messages( tool["function"].pop(function_key) logger.warning_once( f"'{function_key}' is not supported by mistral-common " - "for function tools. It has been poped from the " + "for function tools. It has been popped from the " "function definition." ) else: diff --git a/vllm/transformers_utils/processors/ovis2_5.py b/vllm/transformers_utils/processors/ovis2_5.py index f1bcefc1a..6b6fdcace 100644 --- a/vllm/transformers_utils/processors/ovis2_5.py +++ b/vllm/transformers_utils/processors/ovis2_5.py @@ -402,7 +402,7 @@ class Ovis2_5Processor(ProcessorMixin): images = [images] elif video is not None: is_video = True - # type of vidoe in dummy_mm_data is np.ndarray + # type of video in dummy_mm_data is np.ndarray if isinstance(video, np.ndarray): images = [] for i in range(video.shape[0]): diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py index 980a86360..511387aac 100644 --- a/vllm/v1/attention/backends/cpu_attn.py +++ b/vllm/v1/attention/backends/cpu_attn.py @@ -174,7 +174,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata] query_start_loc = query_start_loc[: num_decodes + 1] block_table_tensor = block_table_tensor[:num_decodes] - sheduler_metadata = ops.cpu_attn_get_scheduler_metadata( + scheduler_metadata = 
ops.cpu_attn_get_scheduler_metadata( num_reqs=num_reqs, num_heads=self.num_heads, num_kv_heads=self.num_kv_heads, @@ -197,7 +197,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata] seq_lens=seq_lens, block_table=block_table_tensor, slot_mapping=slot_mapping, - scheduler_metadata=sheduler_metadata, + scheduler_metadata=scheduler_metadata, causal=causal, use_sdpa_prefill=self.use_sdpa_prefill, num_decode_tokens=num_decode_tokens, diff --git a/vllm/v1/attention/backends/mamba_attn.py b/vllm/v1/attention/backends/mamba_attn.py index f9105474e..0364d6aee 100644 --- a/vllm/v1/attention/backends/mamba_attn.py +++ b/vllm/v1/attention/backends/mamba_attn.py @@ -383,7 +383,7 @@ class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC): # Return a tensor of shape (#requests, #max blocks) state_indices_tensor = common_attn_metadata.block_table_tensor - # Additional cache-related varaiables: + # Additional cache-related variables: mamba_block_size = self.kv_cache_spec.block_size ( block_idx_last_computed_token, diff --git a/vllm/v1/attention/backends/mla/flashmla_sparse.py b/vllm/v1/attention/backends/mla/flashmla_sparse.py index e04a7688f..c8a78af4a 100644 --- a/vllm/v1/attention/backends/mla/flashmla_sparse.py +++ b/vllm/v1/attention/backends/mla/flashmla_sparse.py @@ -49,14 +49,14 @@ if TYPE_CHECKING: logger = init_logger(__name__) -# For FP8 sparse attention we have two impelementations: +# For FP8 sparse attention we have two implementations: # 1. Mixed batch mode: use the FP8 decode kernel for both prefill and decode this is # done by treating all tokens as single batch. # 2. Separate prefill and decode mode: use the BF16 prefill kernel for prefill # (upconverting the FP8 cache to BF16 then calling the prefill kernel) and using # the FP8 decode kernel for decode. # Currently we use #1 when the number of heads per rank is low (i.e. 
TP) since the BF16 -# prefill kernel requires padding the numer of heads to 128 while the decode does not +# prefill kernel requires padding the number of heads to 128 while the decode does not # so when the per ranke head count is below MIN_HEADS_FOR_BF16_PREFILL we use the mixed # batch mode (#2). MIN_HEADS_FOR_BF16_PREFILL = 32 @@ -126,7 +126,7 @@ class FlashMLASparseBackend(AttentionBackend): cache_dtype_str: str = "auto", ) -> tuple[int, ...]: if cache_dtype_str == "fp8_ds_mla": - # custom storage fromat is 656 bytes + # custom storage format is 656 bytes # see FlashMLA readme.md for details return (num_blocks, block_size, 656) else: diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index 41147ca63..c0269ec68 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -370,7 +370,7 @@ class AiterFlashAttentionMetadata: slot_mapping: torch.Tensor block_table: torch.Tensor - # prefill and deocde split + # prefill and decode split num_decodes: int num_decode_tokens: int num_prefills: int @@ -1099,7 +1099,7 @@ class AiterFlashAttentionImpl(AttentionImpl): extend_tokens_slice = slice( num_decode_tokens, num_decode_tokens + num_extend_tokens ) - extend_querys = query[extend_tokens_slice] + extend_queries = query[extend_tokens_slice] extend_keys = key[extend_tokens_slice] extend_values = value[extend_tokens_slice] extend_outputs = output[extend_tokens_slice] @@ -1110,7 +1110,7 @@ class AiterFlashAttentionImpl(AttentionImpl): v_scale = attn_metadata.v_scale self.extend_forward( attn_metadata=attn_metadata, - query=extend_querys, + query=extend_queries, key=extend_keys, value=extend_values, key_cache=key_cache, diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index c071ae155..f0146514b 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -863,7 +863,7 @@ 
class MambaManager(SingleTypeKVCacheManager): ): # Mamba can't rely on blocks generated by other requests in the current step # To put it in the next step, we return num_gpu_blocks + 1 so - # that kv_cache_manager will think there is no enough blocks to allocte now + # that kv_cache_manager will think there is no enough blocks to allocate now # and don't schedule it in the current step. return self.block_pool.num_gpu_blocks + 1 if self.mamba_cache_mode != "align": diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 9b70e4a9c..d8e002da5 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -1724,11 +1724,11 @@ class DPEngineCoreProc(EngineCoreProc): """ Send notifications to EngineCoreClient, which can then forward the notifications to other engine core processes. It is used for: - 1) In scale up: new core engines to notify exisiting core engines + 1) In scale up: new core engines to notify existing core engines that they are ready; 2) In scale down: removing core engines to notify EngineCoreClient so EngineCoreClient can release their ray placement groups; - 3) Both scale up/down: to notify EngineCoreClient that exisiting + 3) Both scale up/down: to notify EngineCoreClient that existing core engines have already switched to the new parallel setup. """ if vllm_config is None: diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index ad70f839d..fe062bde4 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -194,7 +194,7 @@ class InputProcessor: @staticmethod def assign_request_id(request: EngineCoreRequest): """Replace the externally supplied request ID with an internal request ID - that adds 8 random characters in order to ensure uniquness. + that adds 8 random characters in order to ensure uniqueness. 
""" if request.external_req_id is not None: raise ValueError( diff --git a/vllm/v1/kv_offload/worker/cpu_gpu.py b/vllm/v1/kv_offload/worker/cpu_gpu.py index 5cde5faa4..4ce357437 100644 --- a/vllm/v1/kv_offload/worker/cpu_gpu.py +++ b/vllm/v1/kv_offload/worker/cpu_gpu.py @@ -197,7 +197,7 @@ class SingleDirectionOffloadingHandler(OffloadingHandler): transfer = self._transfers.popleft() transfer_time = ( transfer.start_event.elapsed_time(transfer.end_event) * 1e-3 - ) # elapsed_time is in miliseconds + ) # elapsed_time is in milliseconds result = TransferResult( job_id=transfer.job_id, success=True, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 29a5e46ab..91db40980 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -905,7 +905,7 @@ class GPUModelRunner( Args: scheduler_output: The scheduler output. """ - # Attention free models have zero kv_cache_goups, however models + # Attention free models have zero kv_cache_groups, however models # like Mamba are also attention free but use the kv_cache for # keeping its internal state. This is why we check the number # of kv_cache groups instead of solely checking @@ -1065,7 +1065,7 @@ class GPUModelRunner( # of the request. for example: # fist step: num_computed_tokens = 0, spec_tokens = [], # prev_num_draft_len = 0. - # second step: num_computed_tokens = 100(prompt lenth), + # second step: num_computed_tokens = 100(prompt length), # spec_tokens = [a,b], prev_num_draft_len = 0. # third step: num_computed_tokens = 100 + 2, spec_tokens = [c,d], # prev_num_draft_len = 2. 
@@ -1412,30 +1412,30 @@ class GPUModelRunner( prev_draft_token_indices.extend(range(start, start + draft_len)) indices_match &= prev_index == flattened_index max_flattened_index = max(max_flattened_index, flattened_index) - num_commmon_tokens = len(sample_flattened_indices) + num_common_tokens = len(sample_flattened_indices) total_without_spec = total_num_scheduled_tokens - total_num_spec_tokens - if num_commmon_tokens < total_without_spec: + if num_common_tokens < total_without_spec: # If not all requests are decodes from the last iteration, # We need to copy the input_ids_cpu to the GPU first. self.input_ids.copy_to_gpu(total_num_scheduled_tokens) if self.enable_prompt_embeds: self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens) self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens) - if num_commmon_tokens == 0: + if num_common_tokens == 0: # No requests in common with the previous iteration # So input_ids.cpu will have all the input ids. return - if indices_match and max_flattened_index == (num_commmon_tokens - 1): + if indices_match and max_flattened_index == (num_common_tokens - 1): # Common-case optimization: the batch is unchanged # and no reordering happened. # The indices are both the same permutation of 0..N-1 so # we can copy directly using a single slice. - self.input_ids.gpu[:num_commmon_tokens].copy_( - self.input_batch.prev_sampled_token_ids[:num_commmon_tokens, 0], + self.input_ids.gpu[:num_common_tokens].copy_( + self.input_batch.prev_sampled_token_ids[:num_common_tokens, 0], non_blocking=True, ) if self.enable_prompt_embeds: - self.is_token_ids.gpu[:num_commmon_tokens] = True + self.is_token_ids.gpu[:num_common_tokens] = True return # Upload the index tensors asynchronously so the scatter can be non-blocking. 
sampled_tokens_index_tensor = torch.tensor( @@ -4383,7 +4383,7 @@ class GPUModelRunner( self.model.compile(fullgraph=True, backend=backend) return # for other compilation modes, cudagraph behavior is controlled by - # CudagraphWraper and CudagraphDispatcher of vllm. + # CudagraphWrapper and CudagraphDispatcher of vllm. # wrap the model with full cudagraph wrapper if needed. cudagraph_mode = self.compilation_config.cudagraph_mode @@ -4444,7 +4444,7 @@ class GPUModelRunner( :param weights_path: path to load weights from if weights_iterator is not provided. Use path of original model if neither is provided. :param is_checkpoint_format: set to False if weights have already been processed - into kernel format (repacking, renaming, ect.) + into kernel format (repacking, renaming, etc.) """ # TODO(@kylesayrs): generalize to all runners and loaders # argument validation