diff --git a/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
index 80bb4d846..e694c3878 100644
--- a/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -393,7 +393,7 @@ if __name__ == "__main__":
     with open(results_folder / md_file, "w") as f:
         results = read_markdown(
             "../.buildkite/performance-benchmarks/"
-            + "performance-benchmarks-descriptions.md"
+            "performance-benchmarks-descriptions.md"
         )
         results = results.format(
             latency_tests_markdown_table=latency_md_table,
diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py
index 8bd17ba69..e12601e9e 100644
--- a/csrc/quantization/machete/generate.py
+++ b/csrc/quantization/machete/generate.py
@@ -288,8 +288,8 @@ def generate_sch_sig(schedule_config: ScheduleConfig) -> str:
     )
     cluster_shape = (
         f"{schedule_config.cluster_shape_mnk[0]}"
-        + f"x{schedule_config.cluster_shape_mnk[1]}"
-        + f"x{schedule_config.cluster_shape_mnk[2]}"
+        f"x{schedule_config.cluster_shape_mnk[1]}"
+        f"x{schedule_config.cluster_shape_mnk[2]}"
     )
     kernel_schedule = VLLMKernelScheduleTag[schedule_config.kernel_schedule].split(
         "::"
@@ -301,7 +301,7 @@ def generate_sch_sig(schedule_config: ScheduleConfig) -> str:
     return (
         f"{tile_shape}_{cluster_shape}_{kernel_schedule}"
-        + f"_{epilogue_schedule}_{tile_scheduler}"
+        f"_{epilogue_schedule}_{tile_scheduler}"
     )
diff --git a/examples/offline_inference/automatic_prefix_caching.py b/examples/offline_inference/automatic_prefix_caching.py
index a01a9565a..2d3c28d9d 100644
--- a/examples/offline_inference/automatic_prefix_caching.py
+++ b/examples/offline_inference/automatic_prefix_caching.py
@@ -26,7 +26,7 @@ from vllm import LLM, SamplingParams
 # A prompt containing a large markdown table. The table is randomly generated by GPT-4.
 LONG_PROMPT = (
     "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n"
-    + """
+    """
 | ID  | Name          | Age | Occupation    | Country       | Email                  | Phone Number   | Address                      |
 |-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------|
 | 1   | John Doe      | 29  | Engineer      | USA           | john.doe@example.com   | 555-1234       | 123 Elm St, Springfield, IL  |
diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py
index c8965e050..e048aecff 100644
--- a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py
+++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py
@@ -69,10 +69,10 @@ class StatsCalculator:
         np_arr = np.array(self._stats)
         output_str = (
             f"\nNum requests: {len(self._stats)}"
-            + "\nPrefill node TTFT stats:"
-            + f"\n - Average (ms): {np.mean(np_arr)}"
-            + f"\n - Median (ms): {np.median(np_arr)}"
-            + f"\n - 99th Percentile (ms): {np.percentile(np_arr, 99)}\n"
+            "\nPrefill node TTFT stats:"
+            f"\n - Average (ms): {np.mean(np_arr)}"
+            f"\n - Median (ms): {np.median(np_arr)}"
+            f"\n - 99th Percentile (ms): {np.percentile(np_arr, 99)}\n"
         )
         print(
             "===============================",
diff --git a/pyproject.toml b/pyproject.toml
index e1b5671de..10f184d2b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -72,6 +72,8 @@ select = [
     "UP",
     # flake8-bugbear
     "B",
+    # flake8-implicit-str-concat
+    "ISC",
     # flake8-simplify
     "SIM",
     # isort
diff --git a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
index a0576db02..914348153 100644
--- a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
@@ -193,7 +193,7 @@ TEST_CASES = [
     pytest.param(
         False,
         "<|python_start|>[get_weather(city='LA', metric='C'), "
-        + "register_user(name='Doe', age=9)]",
+        "register_user(name='Doe', age=9)]",
         [
             SIMPLE_FUNCTION_CALL,
             FunctionCall(name="register_user", arguments='{"name": "Doe", "age": 9}'),
diff --git a/tests/tool_parsers/test_deepseekv31_tool_parser.py b/tests/tool_parsers/test_deepseekv31_tool_parser.py
index 69a4cc8b9..43597b1d1 100644
--- a/tests/tool_parsers/test_deepseekv31_tool_parser.py
+++ b/tests/tool_parsers/test_deepseekv31_tool_parser.py
@@ -24,9 +24,9 @@ def parser(deepseekv31_tokenizer):
 def test_extract_tool_calls_with_tool(parser):
     model_output = (
         "normal text"
-        + "<|tool▁calls▁begin|>"
-        + '<|tool▁call▁begin|>foo<|tool▁sep|>{"x":1}<|tool▁call▁end|>'
-        + "<|tool▁calls▁end|>"
+        "<|tool▁calls▁begin|>"
+        '<|tool▁call▁begin|>foo<|tool▁sep|>{"x":1}<|tool▁call▁end|>'
+        "<|tool▁calls▁end|>"
     )
     result = parser.extract_tool_calls(model_output, None)
     assert result.tools_called
@@ -39,11 +39,11 @@ def test_extract_tool_calls_with_tool(parser):
 def test_extract_tool_calls_with_multiple_tools(parser):
     model_output = (
         "some prefix text"
-        + "<|tool▁calls▁begin|>"
-        + '<|tool▁call▁begin|>foo<|tool▁sep|>{"x":1}<|tool▁call▁end|>'
-        + '<|tool▁call▁begin|>bar<|tool▁sep|>{"y":2}<|tool▁call▁end|>'
-        + "<|tool▁calls▁end|>"
-        + " some suffix text"
+        "<|tool▁calls▁begin|>"
+        '<|tool▁call▁begin|>foo<|tool▁sep|>{"x":1}<|tool▁call▁end|>'
+        '<|tool▁call▁begin|>bar<|tool▁sep|>{"y":2}<|tool▁call▁end|>'
+        "<|tool▁calls▁end|>"
+        " some suffix text"
     )
 
     result = parser.extract_tool_calls(model_output, None)
diff --git a/tests/utils.py b/tests/utils.py
index d263c7ef0..efeceba63 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -1302,7 +1302,7 @@ def prep_prompts(batch_size: int, ln_range: tuple[int, int] = (800, 1100)):
         indices.append(idx)
         prompt = (
             "```python\n# We set a number of variables, "
-            + f"x{idx} will be important later\n"
+            f"x{idx} will be important later\n"
         )
         ln = random.randint(*ln_range)
         for k in range(30, ln):
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 60062aa5d..f0a1baf3d 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -134,8 +134,7 @@ class BenchmarkDataset(ABC):
                 content.append(mm_content)
             else:
                 raise TypeError(
-                    "Could not process multimodal content of type: "
-                    + f"{type(mm_content)}"
+                    f"Could not process multimodal content of type: {type(mm_content)}"
                 )
 
         return [{"role": "user", "content": content}]
diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index 62574d807..4c1b04666 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -208,8 +208,8 @@ class TorchCompileWithNoGuardsWrapper:
         if not hasattr(self._compiled_callable, "aot_compile"):
             raise RuntimeError(
                 "aot_compile is not supported by the current configuration. "
-                + "Please make sure torch.compile is enabled with the latest "
-                + f"version of PyTorch (current using torch: {torch.__version__})"
+                "Please make sure torch.compile is enabled with the latest "
+                f"version of PyTorch (current using torch: {torch.__version__})"
             )
 
         return self._compiled_callable.aot_compile((args, kwargs))
diff --git a/vllm/entrypoints/openai/translations/speech_to_text.py b/vllm/entrypoints/openai/translations/speech_to_text.py
index 1e934aab8..9f92bf559 100644
--- a/vllm/entrypoints/openai/translations/speech_to_text.py
+++ b/vllm/entrypoints/openai/translations/speech_to_text.py
@@ -406,8 +406,8 @@ class OpenAISpeechToText(OpenAIServing):
         if request.response_format not in ["text", "json", "verbose_json"]:
             return self.create_error_response(
-                ("Currently only support response_format")
-                + ("`text`, `json` or `verbose_json`")
+                "Currently only support response_format: "
+                "`text`, `json` or `verbose_json`"
             )
 
         if (
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
index f806da9c1..1945a1e43 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
@@ -32,8 +32,8 @@ class AiterInt8ScaledMMLinearKernel(CutlassInt8ScaledMMLinearKernel):
             return (
                 False,
                 "requires setting `VLLM_ROCM_USE_AITER=1` "
-                + "and `VLLM_ROCM_USE_AITER_LINEAR=1`. "
-                + "`VLLM_ROCM_USE_AITER_LINEAR` default is True.",
+                "and `VLLM_ROCM_USE_AITER_LINEAR=1`. "
+                "`VLLM_ROCM_USE_AITER_LINEAR` default is True.",
             )
 
         return True, None
@@ -97,9 +97,9 @@ class AiterInt8ScaledMMLinearKernel(CutlassInt8ScaledMMLinearKernel):
             per_token_scale_a and per_channel_scale_b
         ), (
             "Currently only support per-tensor-per-tensor GEMM "
-            + " and per-token-per-channel GEMM through AITER"
+            " and per-token-per-channel GEMM through AITER"
             " w8a8 scaled gemm. `AiterInt8ScaledMMLinearKernel` "
-            + "does not support AITER block scaled GEMM."
+            "does not support AITER block scaled GEMM."
         )
 
         # gemm_a8w8_CK(a, b, scale_a, scale_b, bias) expects
diff --git a/vllm/reasoning/olmo3_reasoning_parser.py b/vllm/reasoning/olmo3_reasoning_parser.py
index 9126c4d77..3808b475e 100644
--- a/vllm/reasoning/olmo3_reasoning_parser.py
+++ b/vllm/reasoning/olmo3_reasoning_parser.py
@@ -234,7 +234,7 @@ class Olmo3ReasoningParser(ReasoningParser):
         # reasoning template.
         reasoning_expr = (
             rf"^(?:{self.think_start})?(?P<reasoning_content>.*?)"
-            + rf"{self.think_end}(?P<content>.*)$"
+            rf"{self.think_end}(?P<content>.*)$"
         )
         self.reasoning_regex = re.compile(reasoning_expr, re.DOTALL)
diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py
index 99c3ce55b..b912a3d3d 100644
--- a/vllm/v1/attention/backends/mla/flashattn_mla.py
+++ b/vllm/v1/attention/backends/mla/flashattn_mla.py
@@ -216,7 +216,7 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata]
             # Ensure the persistent buffer is large enough
             assert n <= self.scheduler_metadata.shape[0], (
                 f"Scheduler metadata size {n} exceeds buffer size "
-                + f"{self.scheduler_metadata.shape[0]}"
+                f"{self.scheduler_metadata.shape[0]}"
             )
             self.scheduler_metadata[:n] = scheduler_metadata
             # NOTE(woosuk): We should zero out the rest of the scheduler
diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py
index 9918d6ffd..96660dc6f 100644
--- a/vllm/v1/core/single_type_kv_cache_manager.py
+++ b/vllm/v1/core/single_type_kv_cache_manager.py
@@ -646,7 +646,7 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager):
         """
         assert isinstance(kv_cache_spec, ChunkedLocalAttentionSpec), (
             "ChunkedLocalAttentionManager can only be used for "
-            + "chunked local attention groups"
+            "chunked local attention groups"
         )
         assert use_eagle is False, (
             "Hybrid KV cache is not supported for " + "eagle + chunked local attention."
         )
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 2fba48ab0..9f40f41a1 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -618,7 +618,7 @@ class AsyncLLM(EngineClient):
         except Exception as e2:
             s = (
                 f"{e.__class__.__name__}: "
-                + "error during printing an exception of class"
+                "error during printing an exception of class"
                 + e2.__class__.__name__
             )
             logger.info("Request %s failed due to %s.", request_id, s)