# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa
# type: ignore
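"""End-to-end test that generating with vLLM exports an OpenTelemetry
"llm_request" span whose attributes match the request's sampling parameters,
token usage, and latency metrics."""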
import pytest
import time

from opentelemetry.sdk.environment_variables import OTEL_EXPORTER_OTLP_TRACES_INSECURE

from vllm import LLM, SamplingParams
from vllm.tracing import SpanAttributes

# Import shared fixtures from the tracing conftest
from tests.tracing.conftest import (  # noqa: F401
    FAKE_TRACE_SERVER_ADDRESS,
    FakeTraceService,
    trace_service,
)


def test_traces(
    monkeypatch: pytest.MonkeyPatch,
    trace_service: FakeTraceService,
):
    with monkeypatch.context() as m:
        m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
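        # Near-greedy sampling (very low temperature and top_p) is assumed
        # here to keep the generated token counts stable across runs.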
        sampling_params = SamplingParams(
            temperature=0.01,
            top_p=0.1,
            max_tokens=256,
        )
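        # A small model and modest gpu_memory_utilization keep the test light;
        # any model would exercise the same tracing path (assumed rationale).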
        model = "facebook/opt-125m"
        llm = LLM(
            model=model,
            otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
            gpu_memory_utilization=0.3,
            disable_log_stats=False,
        )
        prompts = ["This is a short prompt"]
        outputs = llm.generate(prompts, sampling_params=sampling_params)
        print(f"test_traces outputs: {outputs}")
        # Wait for the "llm_request" span to be exported.
        # The BatchSpanProcessor batches spans and exports them periodically,
        # so we need to wait specifically for the llm_request span to appear.
        timeout = 15
        deadline = time.time() + timeout
        llm_request_spans = []
        while time.time() < deadline:
            all_spans = trace_service.get_all_spans()
            llm_request_spans = [s for s in all_spans if s["name"] == "llm_request"]
            if llm_request_spans:
                break
            time.sleep(0.5)

        assert len(llm_request_spans) == 1, (
            f"Expected exactly 1 'llm_request' span, but got {len(llm_request_spans)}. "
            f"All span names: {[s['name'] for s in all_spans]}"
        )
        attributes = llm_request_spans[0]["attributes"]
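        # The span's request attributes should echo the request ID and the
        # sampling parameters sent above.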
        # assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
        assert (
            attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE)
            == sampling_params.temperature
        )
        assert (
            attributes.get(SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
        )
        assert (
            attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS)
            == sampling_params.max_tokens
        )
        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
        assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
            outputs[0].prompt_token_ids
        )
        completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
        assert (
            attributes.get(SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS)
            == completion_tokens
        )
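
        # Latency attributes are assumed to be positive for any completed
        # request; only the sign is checked, not the magnitude.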
        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) > 0
        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) > 0
        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) > 0