diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 831b76b66..a69637bfc 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -649,9 +649,3 @@ ASYNC_REQUEST_FUNCS = {
     "sglang": async_request_openai_completions,
     "llama.cpp": async_request_openai_completions,
 }
-
-OPENAI_COMPATIBLE_BACKENDS = [
-    k
-    for k, v in ASYNC_REQUEST_FUNCS.items()
-    if v in (async_request_openai_completions, async_request_openai_chat_completions)
-]
diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py
index f0d661f9d..5865473e9 100644
--- a/benchmarks/benchmark_utils.py
+++ b/benchmarks/benchmark_utils.py
@@ -1,78 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import argparse
-import json
-import math
-import os
 import time
 from types import TracebackType
-from typing import Any
-
-
-def convert_to_pytorch_benchmark_format(
-    args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any]
-) -> list:
-    """
-    Save the benchmark results in the format used by PyTorch OSS benchmark with
-    on metric per record
-    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
-    """
-    records = []
-    if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
-        return records
-
-    for name, benchmark_values in metrics.items():
-        record = {
-            "benchmark": {
-                "name": "vLLM benchmark",
-                "extra_info": {
-                    "args": vars(args),
-                },
-            },
-            "model": {
-                "name": args.model,
-            },
-            "metric": {
-                "name": name,
-                "benchmark_values": benchmark_values,
-                "extra_info": extra_info,
-            },
-        }
-
-        tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size")
-        # Save tensor_parallel_size parameter if it's part of the metadata
-        if not tp and "tensor_parallel_size" in extra_info:
-            record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = (
-                extra_info["tensor_parallel_size"]
-            )
-
-        records.append(record)
-
-    return records
-
-
-class InfEncoder(json.JSONEncoder):
-    def clear_inf(self, o: Any):
-        if isinstance(o, dict):
-            return {k: self.clear_inf(v) for k, v in o.items()}
-        elif isinstance(o, list):
-            return [self.clear_inf(v) for v in o]
-        elif isinstance(o, float) and math.isinf(o):
-            return "inf"
-        return o
-
-    def iterencode(self, o: Any, *args, **kwargs) -> Any:
-        return super().iterencode(self.clear_inf(o), *args, **kwargs)
-
-
-def write_to_json(filename: str, records: list) -> None:
-    with open(filename, "w") as f:
-        json.dump(
-            records,
-            f,
-            cls=InfEncoder,
-            default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
-        )
 
 
 # Collect time and generate time metrics
diff --git a/benchmarks/cutlass_benchmarks/utils.py b/benchmarks/cutlass_benchmarks/utils.py
index b4f3c6bf9..6cbcf6b68 100644
--- a/benchmarks/cutlass_benchmarks/utils.py
+++ b/benchmarks/cutlass_benchmarks/utils.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Cutlass bench utils
-from collections.abc import Iterable
 
 import torch
 
@@ -86,15 +85,3 @@ def make_rand_sparse_tensors(
 
     # Compressed B, Metadata, Original A, B
     return b_compressed, e, a, b
-
-
-def make_n_rand_sparse_tensors(
-    num_tensors: int, dtype: torch.dtype, m: int, n: int, k: int
-) -> tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
-    ABs = []
-    for _ in range(num_tensors):
-        b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)
-        if b_comp is not None:
-            ABs.append(make_rand_sparse_tensors(dtype, m, n, k))
-    BComps, Es, As, Bs = zip(*ABs)
-    return list(BComps), list(Es), list(As), list(Bs)
diff --git a/benchmarks/disagg_benchmarks/rate_limiter.py b/benchmarks/disagg_benchmarks/rate_limiter.py
deleted file mode 100644
index 87ac8cb6a..000000000
--- a/benchmarks/disagg_benchmarks/rate_limiter.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import asyncio
-import time
-
-
-class RateLimiter:
-    """Token bucket rate limiter implementation"""
-
-    def __init__(self, rate_limit):
-        self.rate_limit = rate_limit  # Requests per second
-        self.num_available_tokens = rate_limit  # Available tokens
-        self.last_refill = time.monotonic()  # Last token refill time
-        self.lock = asyncio.Lock()  # Synchronization lock
-
-    async def acquire(self):
-        """Acquire a token from the rate limiter"""
-        while True:
-            async with self.lock:
-                current_time = time.monotonic()
-                elapsed = current_time - self.last_refill
-
-                # Refill num_available_tokens if more than 1 second has passed
-                if elapsed > 1.0:
-                    self.num_available_tokens = self.rate_limit
-                    self.last_refill = current_time
-
-                # Check if num_available_tokens are available
-                if self.num_available_tokens > 0:
-                    self.num_available_tokens -= 1
-                    return True
-
-                # Calculate wait time if no num_available_tokens available
-                wait_time = 1.0 - elapsed
-            await asyncio.sleep(wait_time)
-
-    async def __aenter__(self):
-        """Enter async context manager - acquire token"""
-        await self.acquire()
-        return self
-
-    async def __aexit__(self, exc_type, exc_value, traceback):
-        """Exit async context manager - no cleanup needed"""
-        pass
diff --git a/benchmarks/disagg_benchmarks/request_queue.py b/benchmarks/disagg_benchmarks/request_queue.py
deleted file mode 100644
index 410bcb956..000000000
--- a/benchmarks/disagg_benchmarks/request_queue.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import asyncio
-from collections import deque
-
-
-class RequestQueue:
-    """Request queue manager with concurrency control"""
-
-    def __init__(self, max_concurrent, max_queue_size):
-        # Maximum concurrent requests
-        self.max_concurrent = max_concurrent
-        self.max_queue_size = max_queue_size  # Maximum queue size
-        # Concurrency control
-        self.semaphore = asyncio.Semaphore(max_concurrent)
-        self.queue = deque()  # Request queue
-        self.queue_size = 0  # Current queue size
-        self.lock = asyncio.Lock()  # Sync queue Lock
-
-    async def enqueue(self, task):
-        """Add a request task to the queue"""
-        async with self.lock:
-            if self.queue_size >= self.max_queue_size:
-                return False
-
-            self.queue.append(task)
-            self.queue_size += 1
-            return True
-
-    async def process(self):
-        """Process queued requests using semaphore for concurrency control"""
-        while True:
-            if self.queue:
-                async with self.semaphore, self.lock:
-                    task = self.queue.popleft()
-                    self.queue_size -= 1
-                await task
-            await asyncio.sleep(0.01)  # Yield control to event loop
diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py
index 76410f2e4..5d7b7b54a 100644
--- a/vllm/model_executor/layers/quantization/ptpc_fp8.py
+++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py
@@ -7,7 +7,6 @@ import torch
 from torch.nn.parameter import Parameter
 
 from vllm import _custom_ops as ops
-from vllm.logger import init_logger
 from vllm.model_executor.kernels.linear import (
     init_fp8_linear_kernel,
 )
@@ -26,10 +25,6 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 )
 from vllm.platforms import current_platform
 
-ACTIVATION_SCHEMES = ["static", "dynamic"]
-
-logger = init_logger(__name__)
-
 
 class PTPCFp8Config(Fp8Config):
     """Config class for Per-Token-Per-Channel Dynamic Quantization Fp8."""
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index c1147725c..23ccfc536 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -255,18 +255,6 @@ def marlin_moe_intermediate_size(w1_packed: torch.Tensor, w2_packed: torch.Tenso
     return w2_packed.size(1) * marlin_tile_size
 
 
-def marlin_make_workspace(
-    output_size_per_partition: int, device: torch.device
-) -> torch.Tensor:
-    max_workspace_size = (
-        output_size_per_partition // GPTQ_MARLIN_MIN_THREAD_N
-    ) * GPTQ_MARLIN_MAX_PARALLEL
-
-    return torch.zeros(
-        max_workspace_size, dtype=torch.int, device=device, requires_grad=False
-    )
-
-
 def marlin_make_workspace_new(
     device: torch.device, max_blocks_per_sm: int = 1
 ) -> torch.Tensor:
@@ -297,12 +285,6 @@ def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor:
     )
 
 
-def marlin_make_empty_zp(device: torch.device) -> torch.Tensor:
-    return torch.nn.Parameter(
-        torch.empty(0, dtype=torch.int, device=device), requires_grad=False
-    )
-
-
 def marlin_sort_g_idx(g_idx: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
     g_idx_sort_indices = torch.argsort(g_idx).to(torch.int)
     return g_idx[g_idx_sort_indices], g_idx_sort_indices
diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py
index 1fb0d5e5d..5b0dfe457 100644
--- a/vllm/model_executor/models/hyperclovax_vision.py
+++ b/vllm/model_executor/models/hyperclovax_vision.py
@@ -49,7 +49,6 @@ from .utils import (
 )
 from .vision import get_vision_encoder_info
 
-EOT = "<|endofturn|>"
 IMAGE_TOKEN: str = "<|dummy3|>"
 VIDEO_TOKEN: str = "<|_unuse_missing_100270|>"
 
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index a55f1975e..39515cab7 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -72,7 +72,6 @@ from vllm.version import __version__ as VLLM_VERSION
 
 logger = init_logger(__name__)
 
-POLLING_TIMEOUT_S = 2.5
 HANDSHAKE_TIMEOUT_MINS = 5
 
 _R = TypeVar("_R")  # Return type for collective_rpc
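
Note for external scripts that imported OPENAI_COMPATIBLE_BACKENDS: the removed constant was derived entirely from ASYNC_REQUEST_FUNCS, which this patch keeps. A minimal sketch of rebuilding it downstream, assuming benchmarks/backend_request_func.py is importable as backend_request_func (that import path is an assumption, not something this patch provides):

    # Rebuild the removed OPENAI_COMPATIBLE_BACKENDS list from the mapping that
    # survives in benchmarks/backend_request_func.py (import path assumed).
    from backend_request_func import (
        ASYNC_REQUEST_FUNCS,
        async_request_openai_chat_completions,
        async_request_openai_completions,
    )

    OPENAI_COMPATIBLE_BACKENDS = [
        name
        for name, func in ASYNC_REQUEST_FUNCS.items()
        if func
        in (async_request_openai_completions, async_request_openai_chat_completions)
    ]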
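
If an out-of-tree benchmark still needs the client-side throttling that the deleted rate_limiter.py provided, the class can be vendored unchanged; its whole public surface is the constructor plus the async context manager. A usage sketch under that assumption (the module name rate_limiter refers to a vendored copy, not an in-tree file):

    # Exercise a vendored copy of the deleted RateLimiter: at most ~5 entries
    # per one-second refill window; excess callers sleep inside acquire().
    import asyncio

    from rate_limiter import RateLimiter


    async def main() -> None:
        limiter = RateLimiter(rate_limit=5)

        async def one_request(i: int) -> None:
            async with limiter:  # __aenter__ awaits acquire()
                print(f"request {i} admitted")

        await asyncio.gather(*(one_request(i) for i in range(12)))


    asyncio.run(main())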
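
Callers of the removed marlin_make_workspace should already be on marlin_make_workspace_new, which the patch retains. Going only by the signature visible in the diff context (a device plus an optional max_blocks_per_sm), a hedged call-site sketch:

    # Allocate a Marlin workspace via the retained helper; the Marlin kernels
    # are CUDA-only, so a CUDA device is assumed to be available here.
    import torch

    from vllm.model_executor.layers.quantization.utils.marlin_utils import (
        marlin_make_workspace_new,
    )

    workspace = marlin_make_workspace_new(torch.device("cuda"))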