From dcf8862fd47624ec48a6e3a06ff2bcc53dc4d4a0 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Mon, 9 Mar 2026 11:22:53 +0800 Subject: [PATCH] [Examples][1/n] Resettle basic examples. (#35579) Signed-off-by: wang.yuqi Signed-off-by: wang.yuqi Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../scripts/hardware_ci/run-cpu-test-arm.sh | 2 +- .../hardware_ci/run-cpu-test-ppc64le.sh | 2 +- .../scripts/hardware_ci/run-gh200-test.sh | 2 +- .../scripts/hardware_ci/run-hpu-test.sh | 2 +- .../scripts/hardware_ci/run-xpu-test.sh | 18 +++++------ .buildkite/test-amd.yaml | 30 +++++++++---------- .buildkite/test_areas/kernels.yaml | 2 +- .buildkite/test_areas/misc.yaml | 13 ++++---- .buildkite/test_areas/models_basic.yaml | 2 +- docs/getting_started/installation/cpu.md | 2 +- docs/getting_started/quickstart.md | 4 +-- docs/models/generative_models.md | 4 +-- docs/models/pooling_models.md | 8 ++--- docs/serving/openai_compatible_server.md | 4 +-- .../offline_inference}/README.md | 14 ++++----- .../offline_inference}/basic.py | 0 .../basic => basic/offline_inference}/chat.py | 0 .../offline_inference}/classify.py | 0 .../offline_inference}/embed.py | 7 ++--- .../offline_inference}/generate.py | 0 .../offline_inference}/reward.py | 7 ++--- .../offline_inference}/score.py | 0 .../openai_chat_completion_client.py | 0 .../openai_completion_client.py | 0 tests/plugins_tests/test_platform_plugins.py | 2 +- vllm/utils/print_utils.py | 4 +-- 26 files changed, 64 insertions(+), 65 deletions(-) rename examples/{offline_inference/basic => basic/offline_inference}/README.md (88%) rename examples/{offline_inference/basic => basic/offline_inference}/basic.py (100%) rename examples/{offline_inference/basic => basic/offline_inference}/chat.py (100%) rename examples/{offline_inference/basic => basic/offline_inference}/classify.py (100%) rename examples/{offline_inference/basic => basic/offline_inference}/embed.py (85%) rename examples/{offline_inference/basic => basic/offline_inference}/generate.py (100%) rename examples/{offline_inference/basic => basic/offline_inference}/reward.py (86%) rename examples/{offline_inference/basic => basic/offline_inference}/score.py (100%) rename examples/{ => basic}/online_serving/openai_chat_completion_client.py (100%) rename examples/{ => basic}/online_serving/openai_completion_client.py (100%) diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh index b6274d698..528385d50 100755 --- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh @@ -34,7 +34,7 @@ function cpu_tests() { # offline inference docker exec cpu-test bash -c " set -e - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" + python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" # Run model tests docker exec cpu-test bash -c " diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh index 75ae2765e..e82baed05 100755 --- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh @@ -27,7 +27,7 @@ function cpu_tests() { podman exec -it "$container_id" bash -c " export TORCH_COMPILE_DISABLE=1 set -xve - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> 
"$HOME"/test_basic.log + python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log # Run basic model test podman exec -it "$container_id" bash -c " diff --git a/.buildkite/scripts/hardware_ci/run-gh200-test.sh b/.buildkite/scripts/hardware_ci/run-gh200-test.sh index f69e4b066..06e0f7af8 100644 --- a/.buildkite/scripts/hardware_ci/run-gh200-test.sh +++ b/.buildkite/scripts/hardware_ci/run-gh200-test.sh @@ -25,5 +25,5 @@ remove_docker_container # Run the image and test offline inference docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' - python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B + python3 examples/basic/offline_inference/generate.py --model meta-llama/Llama-3.2-1B ' diff --git a/.buildkite/scripts/hardware_ci/run-hpu-test.sh b/.buildkite/scripts/hardware_ci/run-hpu-test.sh index a0b040170..10df07b20 100644 --- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh @@ -76,7 +76,7 @@ docker run --rm --runtime=habana --name="${container_name}" --network=host \ -e PT_HPU_LAZY_MODE=1 \ "${image_name}" \ /bin/bash -c ' - cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m + cd vllm; timeout 120s python -u examples/basic/offline_inference/generate.py --model facebook/opt-125m ' EXITCODE=$? diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index c1164bf43..be7886354 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -34,15 +34,15 @@ docker run \ set -e echo $ZE_AFFINITY_MASK pip install tblib==3.1.0 - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8 - python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager - python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 - python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel + python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager + python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE + python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray + python3 examples/basic/offline_inference/generate.py --model 
facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp + python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN + python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8 + python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager + python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 + python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel cd tests pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py pytest -v -s v1/engine diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 9323310b4..ad11f3764 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -529,12 +529,12 @@ steps: commands: - pip install tensorizer # for tensorizer test # for basic - - python3 offline_inference/basic/chat.py - - python3 offline_inference/basic/generate.py --model facebook/opt-125m - - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - - python3 offline_inference/basic/classify.py - - python3 offline_inference/basic/embed.py - - python3 offline_inference/basic/score.py + - python3 basic/offline_inference/chat.py + - python3 basic/offline_inference/generate.py --model facebook/opt-125m + - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 basic/offline_inference/classify.py + - python3 basic/offline_inference/embed.py + - python3 basic/offline_inference/score.py # for multi-modal models - python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0 @@ -1169,7 +1169,7 @@ steps: - pytest -v -s tests/models/test_transformers.py # - pytest -v -s tests/models/multimodal/processing/ - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' - - python3 examples/offline_inference/basic/chat.py + - python3 examples/basic/offline_inference/chat.py # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper @@ -2208,12 +2208,12 @@ steps: commands: - pip install tensorizer # for tensorizer test # for basic - - python3 offline_inference/basic/chat.py - - python3 offline_inference/basic/generate.py --model facebook/opt-125m - - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - - python3 offline_inference/basic/classify.py - - python3 offline_inference/basic/embed.py - - python3 offline_inference/basic/score.py + - python3 basic/offline_inference/chat.py + - python3 basic/offline_inference/generate.py --model facebook/opt-125m + - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 basic/offline_inference/classify.py + - python3 basic/offline_inference/embed.py + - python3 basic/offline_inference/score.py # for multi-modal models - python3 offline_inference/audio_language.py --seed 0 - python3 
offline_inference/vision_language.py --seed 0 @@ -2789,7 +2789,7 @@ steps: - pytest -v -s tests/models/test_transformers.py # - pytest -v -s tests/models/multimodal/processing/ - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' - - python3 examples/offline_inference/basic/chat.py + - python3 examples/basic/offline_inference/chat.py # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper @@ -2816,7 +2816,7 @@ steps: - vllm/platforms/cuda.py commands: - rocm-smi - - python3 examples/offline_inference/basic/chat.py + - python3 examples/basic/offline_inference/chat.py # Attention # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 - pytest -v -s tests/kernels/attention/test_attention_selector.py diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml index 566f4f222..9328cad4b 100644 --- a/.buildkite/test_areas/kernels.yaml +++ b/.buildkite/test_areas/kernels.yaml @@ -96,7 +96,7 @@ steps: - vllm/platforms/cuda.py commands: - nvidia-smi - - python3 examples/offline_inference/basic/chat.py + - python3 examples/basic/offline_inference/chat.py # Attention # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 - pytest -v -s tests/kernels/attention/test_attention_selector.py diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml index dd14a1eac..2643322bf 100644 --- a/.buildkite/test_areas/misc.yaml +++ b/.buildkite/test_areas/misc.yaml @@ -67,12 +67,13 @@ steps: - examples/ commands: - pip install tensorizer # for tensorizer test - - python3 offline_inference/basic/chat.py # for basic - - python3 offline_inference/basic/generate.py --model facebook/opt-125m - - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - - python3 offline_inference/basic/classify.py - - python3 offline_inference/basic/embed.py - - python3 offline_inference/basic/score.py + # for basic + - python3 basic/offline_inference/chat.py + - python3 basic/offline_inference/generate.py --model facebook/opt-125m + - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 basic/offline_inference/classify.py + - python3 basic/offline_inference/embed.py + - python3 basic/offline_inference/score.py # for multi-modal models - python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0 diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml index de0f3994d..c1cc9e9a3 100644 --- a/.buildkite/test_areas/models_basic.yaml +++ b/.buildkite/test_areas/models_basic.yaml @@ -65,7 +65,7 @@ steps: - pytest -v -s tests/models/test_transformers.py - pytest -v -s tests/models/multimodal/processing/ - pytest -v -s tests/models/multimodal/test_mapping.py - - python3 examples/offline_inference/basic/chat.py + - python3 examples/basic/offline_inference/chat.py - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index 0a62d440d..7225d1d6c 100644 --- 
a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -259,7 +259,7 @@ ON_CPU=1 SERVING_JSON=serving-tests-cpu-text.json DRY_RUN=1 MODEL_FILTER=meta-ll # On this platform, it is recommended to only bind openMP threads on logical CPU cores 0-7 or 8-15 $ export VLLM_CPU_OMP_THREADS_BIND=0-7 - $ python examples/offline_inference/basic/basic.py + $ python examples/basic/offline_inference/basic.py ``` - When deploying vLLM CPU backend on a multi-socket machine with NUMA and enable tensor parallel or pipeline parallel, each NUMA node is treated as a TP/PP rank. So be aware to set CPU cores of a single rank on the same NUMA node to avoid cross NUMA node memory access. diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index 40b6dab06..dff86b7d9 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -75,7 +75,7 @@ This guide will help you quickly get started with vLLM to perform: ## Offline Batched Inference -With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py) +With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: [examples/basic/offline_inference/basic.py](../../examples/basic/offline_inference/basic.py) The first line of this example imports the classes [LLM][vllm.LLM] and [SamplingParams][vllm.SamplingParams]: @@ -228,7 +228,7 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep print("Completion result:", completion) ``` -A more detailed client example can be found here: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py) +A more detailed client example can be found here: [examples/basic/offline_inference/basic.py](../../examples/basic/offline_inference/basic.py) ### OpenAI Chat Completions API with vLLM diff --git a/docs/models/generative_models.md b/docs/models/generative_models.md index 99914327e..76dba5977 100644 --- a/docs/models/generative_models.md +++ b/docs/models/generative_models.md @@ -59,7 +59,7 @@ for output in outputs: By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the huggingface model repository if it exists. In most cases, this will provide you with the best results by default if [SamplingParams][vllm.SamplingParams] is not specified. However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the [LLM][vllm.LLM] instance. 
-A code example can be found here: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py) +A code example can be found here: [examples/basic/offline_inference/basic.py](../../examples/basic/offline_inference/basic.py) ### `LLM.beam_search` @@ -121,7 +121,7 @@ and automatically applies the model's [chat template](https://huggingface.co/doc print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -A code example can be found here: [examples/offline_inference/basic/chat.py](../../examples/offline_inference/basic/chat.py) +A code example can be found here: [examples/basic/offline_inference/chat.py](../../examples/basic/offline_inference/chat.py) If the model doesn't have a chat template or you want to specify another one, you can explicitly pass a chat template: diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 475493f48..9bc402d23 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -99,7 +99,7 @@ embeds = output.outputs.embedding print(f"Embeddings: {embeds!r} (size={len(embeds)})") ``` -A code example can be found here: [examples/offline_inference/basic/embed.py](../../examples/offline_inference/basic/embed.py) +A code example can be found here: [examples/basic/offline_inference/embed.py](../../examples/basic/offline_inference/embed.py) ### `LLM.classify` @@ -116,7 +116,7 @@ probs = output.outputs.probs print(f"Class Probabilities: {probs!r} (size={len(probs)})") ``` -A code example can be found here: [examples/offline_inference/basic/classify.py](../../examples/offline_inference/basic/classify.py) +A code example can be found here: [examples/basic/offline_inference/classify.py](../../examples/basic/offline_inference/classify.py) ### `LLM.score` @@ -140,7 +140,7 @@ score = output.outputs.score print(f"Score: {score}") ``` -A code example can be found here: [examples/offline_inference/basic/score.py](../../examples/offline_inference/basic/score.py) +A code example can be found here: [examples/basic/offline_inference/score.py](../../examples/basic/offline_inference/score.py) ### `LLM.reward` @@ -156,7 +156,7 @@ data = output.outputs.data print(f"Data: {data!r}") ``` -A code example can be found here: [examples/offline_inference/basic/reward.py](../../examples/offline_inference/basic/reward.py) +A code example can be found here: [examples/basic/offline_inference/reward.py](../../examples/basic/offline_inference/reward.py) ### `LLM.encode` diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index 993214865..b8787c765 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -190,7 +190,7 @@ vllm serve NousResearch/Meta-Llama-3-8B-Instruct --enable-offline-docs Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions); you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. 
-Code example: [examples/online_serving/openai_completion_client.py](../../examples/online_serving/openai_completion_client.py) +Code example: [examples/basic/online_serving/openai_completion_client.py](../../examples/basic/online_serving/openai_completion_client.py) #### Extra parameters @@ -221,7 +221,7 @@ see our [Multimodal Inputs](../features/multimodal_inputs.md) guide for more inf - *Note: `image_url.detail` parameter is not supported.* -Code example: [examples/online_serving/openai_chat_completion_client.py](../../examples/online_serving/openai_chat_completion_client.py) +Code example: [examples/basic/online_serving/openai_chat_completion_client.py](../../examples/basic/online_serving/openai_chat_completion_client.py) #### Extra parameters diff --git a/examples/offline_inference/basic/README.md b/examples/basic/offline_inference/README.md similarity index 88% rename from examples/offline_inference/basic/README.md rename to examples/basic/offline_inference/README.md index 3eedeb725..026c7ec99 100644 --- a/examples/offline_inference/basic/README.md +++ b/examples/basic/offline_inference/README.md @@ -1,4 +1,4 @@ -# Basic +# Offline Inference The `LLM` class provides the primary Python interface for doing offline inference, which is interacting with a model without using a separate model inference server. @@ -7,31 +7,31 @@ The `LLM` class provides the primary Python interface for doing offline inferenc The first script in this example shows the most basic usage of vLLM. If you are new to Python and vLLM, you should start here. ```bash -python examples/offline_inference/basic/basic.py +python examples/basic/offline_inference/basic.py ``` The rest of the scripts include an [argument parser](https://docs.python.org/3/library/argparse.html), which you can use to pass any arguments that are compatible with [`LLM`](https://docs.vllm.ai/en/latest/api/offline_inference/llm.html). Try running the script with `--help` for a list of all available arguments. ```bash -python examples/offline_inference/basic/classify.py +python examples/basic/offline_inference/classify.py ``` ```bash -python examples/offline_inference/basic/embed.py +python examples/basic/offline_inference/embed.py ``` ```bash -python examples/offline_inference/basic/score.py +python examples/basic/offline_inference/score.py ``` The chat and generate scripts also accept the [sampling parameters](https://docs.vllm.ai/en/latest/api/inference_params.html#sampling-parameters): `max_tokens`, `temperature`, `top_p` and `top_k`. 
```bash -python examples/offline_inference/basic/chat.py +python examples/basic/offline_inference/chat.py ``` ```bash -python examples/offline_inference/basic/generate.py +python examples/basic/offline_inference/generate.py ``` ## Features diff --git a/examples/offline_inference/basic/basic.py b/examples/basic/offline_inference/basic.py similarity index 100% rename from examples/offline_inference/basic/basic.py rename to examples/basic/offline_inference/basic.py diff --git a/examples/offline_inference/basic/chat.py b/examples/basic/offline_inference/chat.py similarity index 100% rename from examples/offline_inference/basic/chat.py rename to examples/basic/offline_inference/chat.py diff --git a/examples/offline_inference/basic/classify.py b/examples/basic/offline_inference/classify.py similarity index 100% rename from examples/offline_inference/basic/classify.py rename to examples/basic/offline_inference/classify.py diff --git a/examples/offline_inference/basic/embed.py b/examples/basic/offline_inference/embed.py similarity index 85% rename from examples/offline_inference/basic/embed.py rename to examples/basic/offline_inference/embed.py index eeb7137ff..626c070c1 100644 --- a/examples/offline_inference/basic/embed.py +++ b/examples/basic/offline_inference/embed.py @@ -5,6 +5,7 @@ from argparse import Namespace from vllm import LLM, EngineArgs from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.print_utils import print_embeddings def parse_args(): @@ -39,10 +40,8 @@ def main(args: Namespace): print("\nGenerated Outputs:\n" + "-" * 60) for prompt, output in zip(prompts, outputs): embeds = output.outputs.embedding - embeds_trimmed = ( - (str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds - ) - print(f"Prompt: {prompt!r} \nEmbeddings: {embeds_trimmed} (size={len(embeds)})") + print(f"Prompt: {prompt!r}") + print_embeddings(embeds) print("-" * 60) diff --git a/examples/offline_inference/basic/generate.py b/examples/basic/offline_inference/generate.py similarity index 100% rename from examples/offline_inference/basic/generate.py rename to examples/basic/offline_inference/generate.py diff --git a/examples/offline_inference/basic/reward.py b/examples/basic/offline_inference/reward.py similarity index 86% rename from examples/offline_inference/basic/reward.py rename to examples/basic/offline_inference/reward.py index e95085686..b6aece26a 100644 --- a/examples/offline_inference/basic/reward.py +++ b/examples/basic/offline_inference/reward.py @@ -5,6 +5,7 @@ from argparse import Namespace from vllm import LLM, EngineArgs from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.print_utils import print_embeddings def parse_args(): @@ -41,10 +42,8 @@ def main(args: Namespace): print("\nGenerated Outputs:\n" + "-" * 60) for prompt, output in zip(prompts, outputs): rewards = output.outputs.data - rewards_trimmed = ( - (str(rewards[:16])[:-1] + ", ...]") if len(rewards) > 16 else rewards - ) - print(f"Prompt: {prompt!r} \nReward: {rewards_trimmed} (size={len(rewards)})") + print(f"Prompt: {prompt!r}") + print_embeddings(rewards, prefix="Reward") print("-" * 60) diff --git a/examples/offline_inference/basic/score.py b/examples/basic/offline_inference/score.py similarity index 100% rename from examples/offline_inference/basic/score.py rename to examples/basic/offline_inference/score.py diff --git a/examples/online_serving/openai_chat_completion_client.py b/examples/basic/online_serving/openai_chat_completion_client.py similarity index 100% rename 
from examples/online_serving/openai_chat_completion_client.py rename to examples/basic/online_serving/openai_chat_completion_client.py diff --git a/examples/online_serving/openai_completion_client.py b/examples/basic/online_serving/openai_completion_client.py similarity index 100% rename from examples/online_serving/openai_completion_client.py rename to examples/basic/online_serving/openai_completion_client.py diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py index c5ee5cafd..6d32c4c6d 100644 --- a/tests/plugins_tests/test_platform_plugins.py +++ b/tests/plugins_tests/test_platform_plugins.py @@ -17,7 +17,7 @@ def test_platform_plugins(): example_file = os.path.join( os.path.dirname(os.path.dirname(os.path.dirname(current_file))), "examples", - "offline_inference/basic/basic.py", + "basic/offline_inference/basic.py", ) runpy.run_path(example_file) diff --git a/vllm/utils/print_utils.py b/vllm/utils/print_utils.py index 8f8af6032..b6ae83be6 100644 --- a/vllm/utils/print_utils.py +++ b/vllm/utils/print_utils.py @@ -2,6 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -def print_embeddings(embeds: list[float]): +def print_embeddings(embeds: list[float], prefix: str = "Embeddings"): embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds - print(f"Embeddings: {embeds_trimmed} (size={len(embeds)})") + print(f"{prefix}: {embeds_trimmed} (size={len(embeds)})")
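
Illustrative usage of the updated helper (not part of the patch; the values below are arbitrary placeholders). The final hunk gives `print_embeddings` in `vllm/utils/print_utils.py` an optional `prefix` argument so the relocated `embed.py` and `reward.py` examples can share one printing helper; a minimal sketch of the resulting behavior:

```python
# Sketch only: demonstrates the new `prefix` parameter added by this patch.
from vllm.utils.print_utils import print_embeddings

embeds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]  # placeholder values

# embed.py keeps the default prefix:
print_embeddings(embeds)
# -> Embeddings: [0.1, 0.2, 0.3, 0.4, ...] (size=6)

# reward.py passes prefix="Reward" to reuse the same helper:
print_embeddings(embeds, prefix="Reward")
# -> Reward: [0.1, 0.2, 0.3, 0.4, ...] (size=6)
```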