From dcf8862fd47624ec48a6e3a06ff2bcc53dc4d4a0 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Mon, 9 Mar 2026 11:22:53 +0800 Subject: [PATCH] [Examples][1/n] Resettle basic examples. (#35579) Signed-off-by: wang.yuqi Signed-off-by: wang.yuqi Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../scripts/hardware_ci/run-cpu-test-arm.sh | 2 +- .../hardware_ci/run-cpu-test-ppc64le.sh | 2 +- .../scripts/hardware_ci/run-gh200-test.sh | 2 +- .../scripts/hardware_ci/run-hpu-test.sh | 2 +- .../scripts/hardware_ci/run-xpu-test.sh | 18 +++++------ .buildkite/test-amd.yaml | 30 +++++++++---------- .buildkite/test_areas/kernels.yaml | 2 +- .buildkite/test_areas/misc.yaml | 13 ++++---- .buildkite/test_areas/models_basic.yaml | 2 +- docs/getting_started/installation/cpu.md | 2 +- docs/getting_started/quickstart.md | 4 +-- docs/models/generative_models.md | 4 +-- docs/models/pooling_models.md | 8 ++--- docs/serving/openai_compatible_server.md | 4 +-- .../offline_inference}/README.md | 14 ++++----- .../offline_inference}/basic.py | 0 .../basic => basic/offline_inference}/chat.py | 0 .../offline_inference}/classify.py | 0 .../offline_inference}/embed.py | 7 ++--- .../offline_inference}/generate.py | 0 .../offline_inference}/reward.py | 7 ++--- .../offline_inference}/score.py | 0 .../openai_chat_completion_client.py | 0 .../openai_completion_client.py | 0 tests/plugins_tests/test_platform_plugins.py | 2 +- vllm/utils/print_utils.py | 4 +-- 26 files changed, 64 insertions(+), 65 deletions(-) rename examples/{offline_inference/basic => basic/offline_inference}/README.md (88%) rename examples/{offline_inference/basic => basic/offline_inference}/basic.py (100%) rename examples/{offline_inference/basic => basic/offline_inference}/chat.py (100%) rename examples/{offline_inference/basic => basic/offline_inference}/classify.py (100%) rename examples/{offline_inference/basic => basic/offline_inference}/embed.py (85%) rename examples/{offline_inference/basic => basic/offline_inference}/generate.py (100%) rename examples/{offline_inference/basic => basic/offline_inference}/reward.py (86%) rename examples/{offline_inference/basic => basic/offline_inference}/score.py (100%) rename examples/{ => basic}/online_serving/openai_chat_completion_client.py (100%) rename examples/{ => basic}/online_serving/openai_completion_client.py (100%) diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh index b6274d698..528385d50 100755 --- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh @@ -34,7 +34,7 @@ function cpu_tests() { # offline inference docker exec cpu-test bash -c " set -e - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" + python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" # Run model tests docker exec cpu-test bash -c " diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh index 75ae2765e..e82baed05 100755 --- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh @@ -27,7 +27,7 @@ function cpu_tests() { podman exec -it "$container_id" bash -c " export TORCH_COMPILE_DISABLE=1 set -xve - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> 
"$HOME"/test_basic.log + python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log # Run basic model test podman exec -it "$container_id" bash -c " diff --git a/.buildkite/scripts/hardware_ci/run-gh200-test.sh b/.buildkite/scripts/hardware_ci/run-gh200-test.sh index f69e4b066..06e0f7af8 100644 --- a/.buildkite/scripts/hardware_ci/run-gh200-test.sh +++ b/.buildkite/scripts/hardware_ci/run-gh200-test.sh @@ -25,5 +25,5 @@ remove_docker_container # Run the image and test offline inference docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' - python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B + python3 examples/basic/offline_inference/generate.py --model meta-llama/Llama-3.2-1B ' diff --git a/.buildkite/scripts/hardware_ci/run-hpu-test.sh b/.buildkite/scripts/hardware_ci/run-hpu-test.sh index a0b040170..10df07b20 100644 --- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh @@ -76,7 +76,7 @@ docker run --rm --runtime=habana --name="${container_name}" --network=host \ -e PT_HPU_LAZY_MODE=1 \ "${image_name}" \ /bin/bash -c ' - cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m + cd vllm; timeout 120s python -u examples/basic/offline_inference/generate.py --model facebook/opt-125m ' EXITCODE=$? diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index c1164bf43..be7886354 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -34,15 +34,15 @@ docker run \ set -e echo $ZE_AFFINITY_MASK pip install tblib==3.1.0 - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8 - python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager - python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 - python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel + python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager + python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE + python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray + python3 examples/basic/offline_inference/generate.py --model 
facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp + python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN + python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8 + python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager + python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 + python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel cd tests pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py pytest -v -s v1/engine diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 9323310b4..ad11f3764 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -529,12 +529,12 @@ steps: commands: - pip install tensorizer # for tensorizer test # for basic - - python3 offline_inference/basic/chat.py - - python3 offline_inference/basic/generate.py --model facebook/opt-125m - - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - - python3 offline_inference/basic/classify.py - - python3 offline_inference/basic/embed.py - - python3 offline_inference/basic/score.py + - python3 basic/offline_inference/chat.py + - python3 basic/offline_inference/generate.py --model facebook/opt-125m + - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 basic/offline_inference/classify.py + - python3 basic/offline_inference/embed.py + - python3 basic/offline_inference/score.py # for multi-modal models - python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0 @@ -1169,7 +1169,7 @@ steps: - pytest -v -s tests/models/test_transformers.py # - pytest -v -s tests/models/multimodal/processing/ - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' - - python3 examples/offline_inference/basic/chat.py + - python3 examples/basic/offline_inference/chat.py # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper @@ -2208,12 +2208,12 @@ steps: commands: - pip install tensorizer # for tensorizer test # for basic - - python3 offline_inference/basic/chat.py - - python3 offline_inference/basic/generate.py --model facebook/opt-125m - - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - - python3 offline_inference/basic/classify.py - - python3 offline_inference/basic/embed.py - - python3 offline_inference/basic/score.py + - python3 basic/offline_inference/chat.py + - python3 basic/offline_inference/generate.py --model facebook/opt-125m + - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 basic/offline_inference/classify.py + - python3 basic/offline_inference/embed.py + - python3 basic/offline_inference/score.py # for multi-modal models - python3 offline_inference/audio_language.py --seed 0 - python3 
offline_inference/vision_language.py --seed 0 @@ -2789,7 +2789,7 @@ steps: - pytest -v -s tests/models/test_transformers.py # - pytest -v -s tests/models/multimodal/processing/ - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' - - python3 examples/offline_inference/basic/chat.py + - python3 examples/basic/offline_inference/chat.py # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper @@ -2816,7 +2816,7 @@ steps: - vllm/platforms/cuda.py commands: - rocm-smi - - python3 examples/offline_inference/basic/chat.py + - python3 examples/basic/offline_inference/chat.py # Attention # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 - pytest -v -s tests/kernels/attention/test_attention_selector.py diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml index 566f4f222..9328cad4b 100644 --- a/.buildkite/test_areas/kernels.yaml +++ b/.buildkite/test_areas/kernels.yaml @@ -96,7 +96,7 @@ steps: - vllm/platforms/cuda.py commands: - nvidia-smi - - python3 examples/offline_inference/basic/chat.py + - python3 examples/basic/offline_inference/chat.py # Attention # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 - pytest -v -s tests/kernels/attention/test_attention_selector.py diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml index dd14a1eac..2643322bf 100644 --- a/.buildkite/test_areas/misc.yaml +++ b/.buildkite/test_areas/misc.yaml @@ -67,12 +67,13 @@ steps: - examples/ commands: - pip install tensorizer # for tensorizer test - - python3 offline_inference/basic/chat.py # for basic - - python3 offline_inference/basic/generate.py --model facebook/opt-125m - - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - - python3 offline_inference/basic/classify.py - - python3 offline_inference/basic/embed.py - - python3 offline_inference/basic/score.py + # for basic + - python3 basic/offline_inference/chat.py + - python3 basic/offline_inference/generate.py --model facebook/opt-125m + - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 basic/offline_inference/classify.py + - python3 basic/offline_inference/embed.py + - python3 basic/offline_inference/score.py # for multi-modal models - python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0 diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml index de0f3994d..c1cc9e9a3 100644 --- a/.buildkite/test_areas/models_basic.yaml +++ b/.buildkite/test_areas/models_basic.yaml @@ -65,7 +65,7 @@ steps: - pytest -v -s tests/models/test_transformers.py - pytest -v -s tests/models/multimodal/processing/ - pytest -v -s tests/models/multimodal/test_mapping.py - - python3 examples/offline_inference/basic/chat.py + - python3 examples/basic/offline_inference/chat.py - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index 0a62d440d..7225d1d6c 100644 --- 
a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -259,7 +259,7 @@ ON_CPU=1 SERVING_JSON=serving-tests-cpu-text.json DRY_RUN=1 MODEL_FILTER=meta-ll # On this platform, it is recommended to only bind openMP threads on logical CPU cores 0-7 or 8-15 $ export VLLM_CPU_OMP_THREADS_BIND=0-7 - $ python examples/offline_inference/basic/basic.py + $ python examples/basic/offline_inference/basic.py ``` - When deploying vLLM CPU backend on a multi-socket machine with NUMA and enable tensor parallel or pipeline parallel, each NUMA node is treated as a TP/PP rank. So be aware to set CPU cores of a single rank on the same NUMA node to avoid cross NUMA node memory access. diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index 40b6dab06..dff86b7d9 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -75,7 +75,7 @@ This guide will help you quickly get started with vLLM to perform: ## Offline Batched Inference -With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py) +With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: [examples/basic/offline_inference/basic.py](../../examples/basic/offline_inference/basic.py) The first line of this example imports the classes [LLM][vllm.LLM] and [SamplingParams][vllm.SamplingParams]: @@ -228,7 +228,7 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep print("Completion result:", completion) ``` -A more detailed client example can be found here: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py) +A more detailed client example can be found here: [examples/basic/offline_inference/basic.py](../../examples/basic/offline_inference/basic.py) ### OpenAI Chat Completions API with vLLM diff --git a/docs/models/generative_models.md b/docs/models/generative_models.md index 99914327e..76dba5977 100644 --- a/docs/models/generative_models.md +++ b/docs/models/generative_models.md @@ -59,7 +59,7 @@ for output in outputs: By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the huggingface model repository if it exists. In most cases, this will provide you with the best results by default if [SamplingParams][vllm.SamplingParams] is not specified. However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the [LLM][vllm.LLM] instance. 
-A code example can be found here: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py) +A code example can be found here: [examples/basic/offline_inference/basic.py](../../examples/basic/offline_inference/basic.py) ### `LLM.beam_search` @@ -121,7 +121,7 @@ and automatically applies the model's [chat template](https://huggingface.co/doc print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -A code example can be found here: [examples/offline_inference/basic/chat.py](../../examples/offline_inference/basic/chat.py) +A code example can be found here: [examples/basic/offline_inference/chat.py](../../examples/basic/offline_inference/chat.py) If the model doesn't have a chat template or you want to specify another one, you can explicitly pass a chat template: diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 475493f48..9bc402d23 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -99,7 +99,7 @@ embeds = output.outputs.embedding print(f"Embeddings: {embeds!r} (size={len(embeds)})") ``` -A code example can be found here: [examples/offline_inference/basic/embed.py](../../examples/offline_inference/basic/embed.py) +A code example can be found here: [examples/basic/offline_inference/embed.py](../../examples/basic/offline_inference/embed.py) ### `LLM.classify` @@ -116,7 +116,7 @@ probs = output.outputs.probs print(f"Class Probabilities: {probs!r} (size={len(probs)})") ``` -A code example can be found here: [examples/offline_inference/basic/classify.py](../../examples/offline_inference/basic/classify.py) +A code example can be found here: [examples/basic/offline_inference/classify.py](../../examples/basic/offline_inference/classify.py) ### `LLM.score` @@ -140,7 +140,7 @@ score = output.outputs.score print(f"Score: {score}") ``` -A code example can be found here: [examples/offline_inference/basic/score.py](../../examples/offline_inference/basic/score.py) +A code example can be found here: [examples/basic/offline_inference/score.py](../../examples/basic/offline_inference/score.py) ### `LLM.reward` @@ -156,7 +156,7 @@ data = output.outputs.data print(f"Data: {data!r}") ``` -A code example can be found here: [examples/offline_inference/basic/reward.py](../../examples/offline_inference/basic/reward.py) +A code example can be found here: [examples/basic/offline_inference/reward.py](../../examples/basic/offline_inference/reward.py) ### `LLM.encode` diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index 993214865..b8787c765 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -190,7 +190,7 @@ vllm serve NousResearch/Meta-Llama-3-8B-Instruct --enable-offline-docs Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions); you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. 
-Code example: [examples/online_serving/openai_completion_client.py](../../examples/online_serving/openai_completion_client.py) +Code example: [examples/basic/online_serving/openai_completion_client.py](../../examples/basic/online_serving/openai_completion_client.py) #### Extra parameters @@ -221,7 +221,7 @@ see our [Multimodal Inputs](../features/multimodal_inputs.md) guide for more inf - *Note: `image_url.detail` parameter is not supported.* -Code example: [examples/online_serving/openai_chat_completion_client.py](../../examples/online_serving/openai_chat_completion_client.py) +Code example: [examples/basic/online_serving/openai_chat_completion_client.py](../../examples/basic/online_serving/openai_chat_completion_client.py) #### Extra parameters diff --git a/examples/offline_inference/basic/README.md b/examples/basic/offline_inference/README.md similarity index 88% rename from examples/offline_inference/basic/README.md rename to examples/basic/offline_inference/README.md index 3eedeb725..026c7ec99 100644 --- a/examples/offline_inference/basic/README.md +++ b/examples/basic/offline_inference/README.md @@ -1,4 +1,4 @@ -# Basic +# Offline Inference The `LLM` class provides the primary Python interface for doing offline inference, which is interacting with a model without using a separate model inference server. @@ -7,31 +7,31 @@ The `LLM` class provides the primary Python interface for doing offline inferenc The first script in this example shows the most basic usage of vLLM. If you are new to Python and vLLM, you should start here. ```bash -python examples/offline_inference/basic/basic.py +python examples/basic/offline_inference/basic.py ``` The rest of the scripts include an [argument parser](https://docs.python.org/3/library/argparse.html), which you can use to pass any arguments that are compatible with [`LLM`](https://docs.vllm.ai/en/latest/api/offline_inference/llm.html). Try running the script with `--help` for a list of all available arguments. ```bash -python examples/offline_inference/basic/classify.py +python examples/basic/offline_inference/classify.py ``` ```bash -python examples/offline_inference/basic/embed.py +python examples/basic/offline_inference/embed.py ``` ```bash -python examples/offline_inference/basic/score.py +python examples/basic/offline_inference/score.py ``` The chat and generate scripts also accept the [sampling parameters](https://docs.vllm.ai/en/latest/api/inference_params.html#sampling-parameters): `max_tokens`, `temperature`, `top_p` and `top_k`. 
```bash -python examples/offline_inference/basic/chat.py +python examples/basic/offline_inference/chat.py ``` ```bash -python examples/offline_inference/basic/generate.py +python examples/basic/offline_inference/generate.py ``` ## Features diff --git a/examples/offline_inference/basic/basic.py b/examples/basic/offline_inference/basic.py similarity index 100% rename from examples/offline_inference/basic/basic.py rename to examples/basic/offline_inference/basic.py diff --git a/examples/offline_inference/basic/chat.py b/examples/basic/offline_inference/chat.py similarity index 100% rename from examples/offline_inference/basic/chat.py rename to examples/basic/offline_inference/chat.py diff --git a/examples/offline_inference/basic/classify.py b/examples/basic/offline_inference/classify.py similarity index 100% rename from examples/offline_inference/basic/classify.py rename to examples/basic/offline_inference/classify.py diff --git a/examples/offline_inference/basic/embed.py b/examples/basic/offline_inference/embed.py similarity index 85% rename from examples/offline_inference/basic/embed.py rename to examples/basic/offline_inference/embed.py index eeb7137ff..626c070c1 100644 --- a/examples/offline_inference/basic/embed.py +++ b/examples/basic/offline_inference/embed.py @@ -5,6 +5,7 @@ from argparse import Namespace from vllm import LLM, EngineArgs from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.print_utils import print_embeddings def parse_args(): @@ -39,10 +40,8 @@ def main(args: Namespace): print("\nGenerated Outputs:\n" + "-" * 60) for prompt, output in zip(prompts, outputs): embeds = output.outputs.embedding - embeds_trimmed = ( - (str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds - ) - print(f"Prompt: {prompt!r} \nEmbeddings: {embeds_trimmed} (size={len(embeds)})") + print(f"Prompt: {prompt!r}") + print_embeddings(embeds) print("-" * 60) diff --git a/examples/offline_inference/basic/generate.py b/examples/basic/offline_inference/generate.py similarity index 100% rename from examples/offline_inference/basic/generate.py rename to examples/basic/offline_inference/generate.py diff --git a/examples/offline_inference/basic/reward.py b/examples/basic/offline_inference/reward.py similarity index 86% rename from examples/offline_inference/basic/reward.py rename to examples/basic/offline_inference/reward.py index e95085686..b6aece26a 100644 --- a/examples/offline_inference/basic/reward.py +++ b/examples/basic/offline_inference/reward.py @@ -5,6 +5,7 @@ from argparse import Namespace from vllm import LLM, EngineArgs from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.print_utils import print_embeddings def parse_args(): @@ -41,10 +42,8 @@ def main(args: Namespace): print("\nGenerated Outputs:\n" + "-" * 60) for prompt, output in zip(prompts, outputs): rewards = output.outputs.data - rewards_trimmed = ( - (str(rewards[:16])[:-1] + ", ...]") if len(rewards) > 16 else rewards - ) - print(f"Prompt: {prompt!r} \nReward: {rewards_trimmed} (size={len(rewards)})") + print(f"Prompt: {prompt!r}") + print_embeddings(rewards, prefix="Reward") print("-" * 60) diff --git a/examples/offline_inference/basic/score.py b/examples/basic/offline_inference/score.py similarity index 100% rename from examples/offline_inference/basic/score.py rename to examples/basic/offline_inference/score.py diff --git a/examples/online_serving/openai_chat_completion_client.py b/examples/basic/online_serving/openai_chat_completion_client.py similarity index 100% rename 
from examples/online_serving/openai_chat_completion_client.py rename to examples/basic/online_serving/openai_chat_completion_client.py diff --git a/examples/online_serving/openai_completion_client.py b/examples/basic/online_serving/openai_completion_client.py similarity index 100% rename from examples/online_serving/openai_completion_client.py rename to examples/basic/online_serving/openai_completion_client.py diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py index c5ee5cafd..6d32c4c6d 100644 --- a/tests/plugins_tests/test_platform_plugins.py +++ b/tests/plugins_tests/test_platform_plugins.py @@ -17,7 +17,7 @@ def test_platform_plugins(): example_file = os.path.join( os.path.dirname(os.path.dirname(os.path.dirname(current_file))), "examples", - "offline_inference/basic/basic.py", + "basic/offline_inference/basic.py", ) runpy.run_path(example_file) diff --git a/vllm/utils/print_utils.py b/vllm/utils/print_utils.py index 8f8af6032..b6ae83be6 100644 --- a/vllm/utils/print_utils.py +++ b/vllm/utils/print_utils.py @@ -2,6 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -def print_embeddings(embeds: list[float]): +def print_embeddings(embeds: list[float], prefix: str = "Embeddings"): embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds - print(f"Embeddings: {embeds_trimmed} (size={len(embeds)})") + print(f"{prefix}: {embeds_trimmed} (size={len(embeds)})")
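
Illustrative usage of the updated helper (not part of the patch; the values below are arbitrary placeholders). The final hunk gives `print_embeddings` in `vllm/utils/print_utils.py` an optional `prefix` argument so the relocated `embed.py` and `reward.py` examples can share one printing helper; a minimal sketch of the resulting behavior:

```python
# Sketch only: demonstrates the new `prefix` parameter added by this patch.
from vllm.utils.print_utils import print_embeddings

embeds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]  # placeholder values

# embed.py keeps the default prefix:
print_embeddings(embeds)
# -> Embeddings: [0.1, 0.2, 0.3, 0.4, ...] (size=6)

# reward.py passes prefix="Reward" to reuse the same helper:
print_embeddings(embeds, prefix="Reward")
# -> Reward: [0.1, 0.2, 0.3, 0.4, ...] (size=6)
```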