[UX] Add --performance-mode {balanced,interactivity,throughput} (#34936)

Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
Michael Goin
2026-02-25 20:28:31 -05:00
committed by GitHub
parent 6831650c40
commit cbf8f7028c
2 changed files with 37 additions and 5 deletions

View File

@@ -14,7 +14,7 @@ from datetime import datetime
from enum import IntEnum
from functools import lru_cache
from pathlib import Path
from typing import TYPE_CHECKING, Any, TypeVar, get_args
from typing import TYPE_CHECKING, Any, Literal, TypeVar, get_args
import torch
from pydantic import ConfigDict, Field, model_validator
@@ -76,6 +76,8 @@ class OptimizationLevel(IntEnum):
"""O3: Currently the same as -O2s."""
PerformanceMode = Literal["balanced", "interactivity", "throughput"]
IS_QUANTIZED = False
IS_DENSE = False
# The optimizations that depend on these properties currently set to False
@@ -312,6 +314,13 @@ class VllmConfig:
performance. -O2 is used by default. See OptimizationLevel for full
description."""
performance_mode: PerformanceMode = "balanced"
"""Performance mode for runtime behavior, 'balanced' is the default.
'interactivity' favors low end-to-end per-request latency at small batch
sizes (fine-grained CUDA graphs, latency-oriented kernels).
'throughput' favors aggregate tokens/sec at high concurrency (larger CUDA
graphs, more aggressive batching, throughput-oriented kernels)."""
weight_transfer_config: WeightTransferConfig | None = None
"""The configurations for weight transfer during RL training."""
@@ -643,6 +652,11 @@ class VllmConfig:
# To give each torch profile run a unique instance name.
self.instance_id = f"{time.time_ns()}"
if self.performance_mode != "balanced":
logger.info_once(
"Performance mode set to '%s'.", self.performance_mode, scope="local"
)
self.try_verify_and_update_config()
if self.model_config is not None:
@@ -1332,9 +1346,15 @@ class VllmConfig:
# sort to make sure the sizes are in ascending order
cudagraph_capture_sizes.sort()
else:
cudagraph_capture_sizes = [
i for i in [1, 2, 4] if i <= max_cudagraph_capture_size
]
if self.performance_mode == "interactivity":
# Fine-grained CUDA graphs at small batch sizes
# for minimal padding overhead
interactivity_max = min(max_cudagraph_capture_size, 32)
cudagraph_capture_sizes = list(range(1, interactivity_max + 1))
else:
cudagraph_capture_sizes = [
i for i in [1, 2, 4] if i <= max_cudagraph_capture_size
]
if max_cudagraph_capture_size >= 8:
# Step size 8 for small batch sizes, up to 256(not included)
cudagraph_capture_sizes += list(
@@ -1345,6 +1365,8 @@ class VllmConfig:
cudagraph_capture_sizes += list(
range(256, max_cudagraph_capture_size + 1, 16)
)
# de-duplicate and sort the sizes
cudagraph_capture_sizes = sorted(set(cudagraph_capture_sizes))
if (
self.parallel_config.tensor_parallel_size > 1