[Misc] Add OpenTelemetry support (#4687)

This PR adds basic support for OpenTelemetry distributed tracing.
It includes changes to enable tracing functionality and improve monitoring capabilities.

I've also added a Markdown guide with screenshots that shows users how to use this feature. You can find it here
This commit is contained in:
Ronen Schaffer
2024-06-18 19:17:03 +03:00
committed by GitHub
parent 13db4369d9
commit 7879f24dcc
15 changed files with 567 additions and 41 deletions

View File

@@ -7,8 +7,9 @@ from typing import List, Optional, Tuple, Union
from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
EngineConfig, LoadConfig, LoRAConfig, ModelConfig,
ParallelConfig, SchedulerConfig, SpeculativeConfig,
TokenizerPoolConfig, VisionLanguageConfig)
ObservabilityConfig, ParallelConfig, SchedulerConfig,
SpeculativeConfig, TokenizerPoolConfig,
VisionLanguageConfig)
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import str_to_int_tuple
@@ -101,6 +102,8 @@ class EngineArgs:
qlora_adapter_name_or_path: Optional[str] = None
otlp_traces_endpoint: Optional[str] = None
def __post_init__(self):
if self.tokenizer is None:
self.tokenizer = self.model
@@ -599,6 +602,13 @@ class EngineArgs:
type=str,
default=None,
help='Name or path of the QLoRA adapter.')
parser.add_argument(
'--otlp-traces-endpoint',
type=str,
default=None,
help='Target URL to which OpenTelemetry traces will be sent.')
return parser
@classmethod
@@ -757,6 +767,9 @@ class EngineArgs:
decoding_config = DecodingConfig(
guided_decoding_backend=self.guided_decoding_backend)
observability_config = ObservabilityConfig(
otlp_traces_endpoint=self.otlp_traces_endpoint)
if (model_config.get_sliding_window() is not None
and scheduler_config.chunked_prefill_enabled
and not scheduler_config.use_v2_block_manager):
@@ -764,16 +777,19 @@ class EngineArgs:
"Chunked prefill is not supported with sliding window. "
"Set --disable-sliding-window to disable sliding window.")
return EngineConfig(model_config=model_config,
cache_config=cache_config,
parallel_config=parallel_config,
scheduler_config=scheduler_config,
device_config=device_config,
lora_config=lora_config,
vision_language_config=vision_language_config,
speculative_config=speculative_config,
load_config=load_config,
decoding_config=decoding_config)
return EngineConfig(
model_config=model_config,
cache_config=cache_config,
parallel_config=parallel_config,
scheduler_config=scheduler_config,
device_config=device_config,
lora_config=lora_config,
vision_language_config=vision_language_config,
speculative_config=speculative_config,
load_config=load_config,
decoding_config=decoding_config,
observability_config=observability_config,
)
@dataclass