[Core] Add span metrics for model_forward, scheduler and sampler time (#7089)

This commit is contained in:
Mahesh Keralapura
2024-08-09 13:55:13 -07:00
committed by GitHub
parent 70d268a399
commit 933790c209
17 changed files with 189 additions and 21 deletions

View File

@@ -23,8 +23,8 @@ except ImportError:
FLASHINFER_WORKSPACE_BUFFER_SIZE = 0
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-ModelConfig, MultiModalConfig, ParallelConfig,
-PromptAdapterConfig, SchedulerConfig)
+ModelConfig, MultiModalConfig, ObservabilityConfig,
+ParallelConfig, PromptAdapterConfig, SchedulerConfig)
from vllm.logger import init_logger
from vllm.multimodal import MultiModalInputs
from vllm.sequence import (ExecuteModelRequest, IntermediateTensors,
@@ -69,6 +69,7 @@ class TP1DraftModelRunner(ModelRunner):
multimodal_config: Optional[MultiModalConfig] = None,
prompt_adapter_config: Optional[PromptAdapterConfig] = None,
return_hidden_states: bool = False,
+observability_config: Optional[ObservabilityConfig] = None,
):
if return_hidden_states:
raise ValueError(
@@ -88,6 +89,7 @@ class TP1DraftModelRunner(ModelRunner):
multimodal_config=multimodal_config,
prompt_adapter_config=prompt_adapter_config,
return_hidden_states=return_hidden_states,
+observability_config=observability_config,
)
self.flashinfer_decode_workspace_buffer = None

View File

@@ -1,8 +1,8 @@
from typing import List, Optional
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-ModelConfig, MultiModalConfig, ParallelConfig,
-PromptAdapterConfig, SchedulerConfig)
+ModelConfig, MultiModalConfig, ObservabilityConfig,
+ParallelConfig, PromptAdapterConfig, SchedulerConfig)
from vllm.sequence import SequenceGroupMetadata
from vllm.worker.model_runner import (ModelInputForGPUWithSamplingMetadata,
ModelRunner)
@@ -32,7 +32,8 @@ class TargetModelRunner(ModelRunner):
is_driver_worker: bool = False,
prompt_adapter_config: Optional[PromptAdapterConfig] = None,
multimodal_config: Optional[MultiModalConfig] = None,
-return_hidden_states: bool = False):
+return_hidden_states: bool = False,
+observability_config: Optional[ObservabilityConfig] = None):
# An internal boolean member variable to indicate if token log
# probabilities are needed or not.
self.disable_logprobs = True
@@ -49,6 +50,7 @@ class TargetModelRunner(ModelRunner):
multimodal_config=multimodal_config,
prompt_adapter_config=prompt_adapter_config,
return_hidden_states=return_hidden_states,
+observability_config=observability_config,
)
def prepare_model_input(