[Examples][1/n] Resettle basic examples. (#35579)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io> Signed-off-by: wang.yuqi <noooop@126.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-09 11:22:53 +08:00
parent 43aa389231
commit dcf8862fd4
26 changed files with 64 additions and 65 deletions
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
@@ -34,7 +34,7 @@ function cpu_tests() {
  # offline inference
  docker exec cpu-test bash -c "
    set -e
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m"

  # Run model tests
  docker exec cpu-test bash -c "
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -27,7 +27,7 @@ function cpu_tests() {
  podman exec -it "$container_id" bash -c "
    export TORCH_COMPILE_DISABLE=1
    set -xve
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log

  # Run basic model test
  podman exec -it "$container_id" bash -c "
--- a/.buildkite/scripts/hardware_ci/run-gh200-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-gh200-test.sh
@@ -25,5 +25,5 @@ remove_docker_container

 # Run the image and test offline inference
 docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-    python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
+    python3 examples/basic/offline_inference/generate.py --model meta-llama/Llama-3.2-1B
 '
--- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh
@@ -76,7 +76,7 @@ docker run --rm --runtime=habana --name="${container_name}" --network=host \
  -e PT_HPU_LAZY_MODE=1 \
  "${image_name}" \
  /bin/bash -c '
-  cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
+  cd vllm; timeout 120s python -u examples/basic/offline_inference/generate.py --model facebook/opt-125m
 '

 EXITCODE=$?
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -34,15 +34,15 @@ docker run \
    set -e
    echo $ZE_AFFINITY_MASK
    pip install tblib==3.1.0
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
-    python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4  --block-size 64 --enforce-eager
-    python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2
-    python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
+    python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4  --block-size 64 --enforce-eager
+    python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2
+    python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
    cd tests
    pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
    pytest -v -s v1/engine
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -529,12 +529,12 @@ steps:
  commands:
    - pip install tensorizer # for tensorizer test
    # for basic
-    - python3 offline_inference/basic/chat.py
-    - python3 offline_inference/basic/generate.py --model facebook/opt-125m
-    - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-    - python3 offline_inference/basic/classify.py
-    - python3 offline_inference/basic/embed.py
-    - python3 offline_inference/basic/score.py
+    - python3 basic/offline_inference/chat.py
+    - python3 basic/offline_inference/generate.py --model facebook/opt-125m
+    - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+    - python3 basic/offline_inference/classify.py
+    - python3 basic/offline_inference/embed.py
+    - python3 basic/offline_inference/score.py
    # for multi-modal models
    - python3 offline_inference/audio_language.py --seed 0
    - python3 offline_inference/vision_language.py --seed 0
@@ -1169,7 +1169,7 @@ steps:
    - pytest -v -s tests/models/test_transformers.py
    # - pytest -v -s tests/models/multimodal/processing/
    - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
-    - python3 examples/offline_inference/basic/chat.py
+    - python3 examples/basic/offline_inference/chat.py
    # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
    # Whisper needs spawn method to avoid deadlock
    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
@@ -2208,12 +2208,12 @@ steps:
  commands:
    - pip install tensorizer # for tensorizer test
    # for basic
-    - python3 offline_inference/basic/chat.py
-    - python3 offline_inference/basic/generate.py --model facebook/opt-125m
-    - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-    - python3 offline_inference/basic/classify.py
-    - python3 offline_inference/basic/embed.py
-    - python3 offline_inference/basic/score.py
+    - python3 basic/offline_inference/chat.py
+    - python3 basic/offline_inference/generate.py --model facebook/opt-125m
+    - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+    - python3 basic/offline_inference/classify.py
+    - python3 basic/offline_inference/embed.py
+    - python3 basic/offline_inference/score.py
    # for multi-modal models
    - python3 offline_inference/audio_language.py --seed 0
    - python3 offline_inference/vision_language.py --seed 0
@@ -2789,7 +2789,7 @@ steps:
    - pytest -v -s tests/models/test_transformers.py
    # - pytest -v -s tests/models/multimodal/processing/
    - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
-    - python3 examples/offline_inference/basic/chat.py
+    - python3 examples/basic/offline_inference/chat.py
    # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
    # Whisper needs spawn method to avoid deadlock
    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
@@ -2816,7 +2816,7 @@ steps:
  - vllm/platforms/cuda.py
  commands:
    - rocm-smi
-    - python3 examples/offline_inference/basic/chat.py
+    - python3 examples/basic/offline_inference/chat.py
    # Attention
    # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
    - pytest -v -s tests/kernels/attention/test_attention_selector.py 
--- a/.buildkite/test_areas/kernels.yaml
+++ b/.buildkite/test_areas/kernels.yaml
@@ -96,7 +96,7 @@ steps:
  - vllm/platforms/cuda.py
  commands:
    - nvidia-smi
-    - python3 examples/offline_inference/basic/chat.py
+    - python3 examples/basic/offline_inference/chat.py
    # Attention
    # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
    - pytest -v -s tests/kernels/attention/test_attention_selector.py
--- a/.buildkite/test_areas/misc.yaml
+++ b/.buildkite/test_areas/misc.yaml
@@ -67,12 +67,13 @@ steps:
  - examples/
  commands:
    - pip install tensorizer # for tensorizer test
-    - python3 offline_inference/basic/chat.py # for basic
-    - python3 offline_inference/basic/generate.py --model facebook/opt-125m
-    - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-    - python3 offline_inference/basic/classify.py
-    - python3 offline_inference/basic/embed.py
-    - python3 offline_inference/basic/score.py
+     # for basic
+    - python3 basic/offline_inference/chat.py
+    - python3 basic/offline_inference/generate.py --model facebook/opt-125m
+    - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+    - python3 basic/offline_inference/classify.py
+    - python3 basic/offline_inference/embed.py
+    - python3 basic/offline_inference/score.py
    # for multi-modal models
    - python3 offline_inference/audio_language.py --seed 0
    - python3 offline_inference/vision_language.py --seed 0
--- a/.buildkite/test_areas/models_basic.yaml
+++ b/.buildkite/test_areas/models_basic.yaml
@@ -65,7 +65,7 @@ steps:
    - pytest -v -s tests/models/test_transformers.py
    - pytest -v -s tests/models/multimodal/processing/
    - pytest -v -s tests/models/multimodal/test_mapping.py
-    - python3 examples/offline_inference/basic/chat.py
+    - python3 examples/basic/offline_inference/chat.py
    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
    # Whisper needs spawn method to avoid deadlock
    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
--- a/docs/getting_started/installation/cpu.md
+++ b/docs/getting_started/installation/cpu.md
@@ -259,7 +259,7 @@ ON_CPU=1 SERVING_JSON=serving-tests-cpu-text.json DRY_RUN=1 MODEL_FILTER=meta-ll

    # On this platform, it is recommended to only bind openMP threads on logical CPU cores 0-7 or 8-15
    $ export VLLM_CPU_OMP_THREADS_BIND=0-7
-    $ python examples/offline_inference/basic/basic.py
+    $ python examples/basic/offline_inference/basic.py
    ```

 - When deploying vLLM CPU backend on a multi-socket machine with NUMA and enable tensor parallel or pipeline parallel, each NUMA node is treated as a TP/PP rank. So be aware to set CPU cores of a single rank on the same NUMA node to avoid cross NUMA node memory access.
--- a/docs/getting_started/quickstart.md
+++ b/docs/getting_started/quickstart.md
@@ -75,7 +75,7 @@ This guide will help you quickly get started with vLLM to perform:

 ## Offline Batched Inference

-With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py)
+With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: [examples/basic/offline_inference/basic.py](../../examples/basic/offline_inference/basic.py)

 The first line of this example imports the classes [LLM][vllm.LLM] and [SamplingParams][vllm.SamplingParams]:

@@ -228,7 +228,7 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep
    print("Completion result:", completion)
    ```

-A more detailed client example can be found here: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py)
+A more detailed client example can be found here: [examples/basic/offline_inference/basic.py](../../examples/basic/offline_inference/basic.py)

 ### OpenAI Chat Completions API with vLLM

--- a/docs/models/generative_models.md
+++ b/docs/models/generative_models.md
@@ -59,7 +59,7 @@ for output in outputs:
    By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the huggingface model repository if it exists. In most cases, this will provide you with the best results by default if [SamplingParams][vllm.SamplingParams] is not specified.

    However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the [LLM][vllm.LLM] instance.
-A code example can be found here: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py)
+A code example can be found here: [examples/basic/offline_inference/basic.py](../../examples/basic/offline_inference/basic.py)

 ### `LLM.beam_search`

@@ -121,7 +121,7 @@ and automatically applies the model's [chat template](https://huggingface.co/doc
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    ```

-A code example can be found here: [examples/offline_inference/basic/chat.py](../../examples/offline_inference/basic/chat.py)
+A code example can be found here: [examples/basic/offline_inference/chat.py](../../examples/basic/offline_inference/chat.py)

 If the model doesn't have a chat template or you want to specify another one,
 you can explicitly pass a chat template:
--- a/docs/models/pooling_models.md
+++ b/docs/models/pooling_models.md
@@ -99,7 +99,7 @@ embeds = output.outputs.embedding
 print(f"Embeddings: {embeds!r} (size={len(embeds)})")
 ```

-A code example can be found here: [examples/offline_inference/basic/embed.py](../../examples/offline_inference/basic/embed.py)
+A code example can be found here: [examples/basic/offline_inference/embed.py](../../examples/basic/offline_inference/embed.py)

 ### `LLM.classify`

@@ -116,7 +116,7 @@ probs = output.outputs.probs
 print(f"Class Probabilities: {probs!r} (size={len(probs)})")
 ```

-A code example can be found here: [examples/offline_inference/basic/classify.py](../../examples/offline_inference/basic/classify.py)
+A code example can be found here: [examples/basic/offline_inference/classify.py](../../examples/basic/offline_inference/classify.py)

 ### `LLM.score`

@@ -140,7 +140,7 @@ score = output.outputs.score
 print(f"Score: {score}")
 ```

-A code example can be found here: [examples/offline_inference/basic/score.py](../../examples/offline_inference/basic/score.py)
+A code example can be found here: [examples/basic/offline_inference/score.py](../../examples/basic/offline_inference/score.py)

 ### `LLM.reward`

@@ -156,7 +156,7 @@ data = output.outputs.data
 print(f"Data: {data!r}")
 ```

-A code example can be found here: [examples/offline_inference/basic/reward.py](../../examples/offline_inference/basic/reward.py)
+A code example can be found here: [examples/basic/offline_inference/reward.py](../../examples/basic/offline_inference/reward.py)

 ### `LLM.encode`

--- a/docs/serving/openai_compatible_server.md
+++ b/docs/serving/openai_compatible_server.md
@@ -190,7 +190,7 @@ vllm serve NousResearch/Meta-Llama-3-8B-Instruct --enable-offline-docs
 Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions);
 you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.

-Code example: [examples/online_serving/openai_completion_client.py](../../examples/online_serving/openai_completion_client.py)
+Code example: [examples/basic/online_serving/openai_completion_client.py](../../examples/basic/online_serving/openai_completion_client.py)

 #### Extra parameters

@@ -221,7 +221,7 @@ see our [Multimodal Inputs](../features/multimodal_inputs.md) guide for more inf

 - *Note: `image_url.detail` parameter is not supported.*

-Code example: [examples/online_serving/openai_chat_completion_client.py](../../examples/online_serving/openai_chat_completion_client.py)
+Code example: [examples/basic/online_serving/openai_chat_completion_client.py](../../examples/basic/online_serving/openai_chat_completion_client.py)

 #### Extra parameters

--- a/examples/basic/offline_inference/README.md
+++ b/examples/basic/offline_inference/README.md
@@ -1,4 +1,4 @@
-# Basic
+# Offline Inference

 The `LLM` class provides the primary Python interface for doing offline inference, which is interacting with a model without using a separate model inference server.

@@ -7,31 +7,31 @@ The `LLM` class provides the primary Python interface for doing offline inferenc
 The first script in this example shows the most basic usage of vLLM. If you are new to Python and vLLM, you should start here.

 ```bash
-python examples/offline_inference/basic/basic.py
+python examples/basic/offline_inference/basic.py
 ```

 The rest of the scripts include an [argument parser](https://docs.python.org/3/library/argparse.html), which you can use to pass any arguments that are compatible with [`LLM`](https://docs.vllm.ai/en/latest/api/offline_inference/llm.html). Try running the script with `--help` for a list of all available arguments.

 ```bash
-python examples/offline_inference/basic/classify.py
+python examples/basic/offline_inference/classify.py
 ```

 ```bash
-python examples/offline_inference/basic/embed.py
+python examples/basic/offline_inference/embed.py
 ```

 ```bash
-python examples/offline_inference/basic/score.py
+python examples/basic/offline_inference/score.py
 ```

 The chat and generate scripts also accept the [sampling parameters](https://docs.vllm.ai/en/latest/api/inference_params.html#sampling-parameters): `max_tokens`, `temperature`, `top_p` and `top_k`.

 ```bash
-python examples/offline_inference/basic/chat.py
+python examples/basic/offline_inference/chat.py
 ```

 ```bash
-python examples/offline_inference/basic/generate.py
+python examples/basic/offline_inference/generate.py
 ```

 ## Features
--- a/examples/basic/offline_inference/basic.py
+++ b/examples/basic/offline_inference/basic.py
--- a/examples/basic/offline_inference/chat.py
+++ b/examples/basic/offline_inference/chat.py
--- a/examples/basic/offline_inference/classify.py
+++ b/examples/basic/offline_inference/classify.py
--- a/examples/basic/offline_inference/embed.py
+++ b/examples/basic/offline_inference/embed.py
@@ -5,6 +5,7 @@ from argparse import Namespace

 from vllm import LLM, EngineArgs
 from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.print_utils import print_embeddings


 def parse_args():
@@ -39,10 +40,8 @@ def main(args: Namespace):
    print("\nGenerated Outputs:\n" + "-" * 60)
    for prompt, output in zip(prompts, outputs):
        embeds = output.outputs.embedding
-        embeds_trimmed = (
-            (str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds
-        )
-        print(f"Prompt: {prompt!r} \nEmbeddings: {embeds_trimmed} (size={len(embeds)})")
+        print(f"Prompt: {prompt!r}")
+        print_embeddings(embeds)
        print("-" * 60)


--- a/examples/basic/offline_inference/generate.py
+++ b/examples/basic/offline_inference/generate.py
--- a/examples/basic/offline_inference/reward.py
+++ b/examples/basic/offline_inference/reward.py
@@ -5,6 +5,7 @@ from argparse import Namespace

 from vllm import LLM, EngineArgs
 from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.print_utils import print_embeddings


 def parse_args():
@@ -41,10 +42,8 @@ def main(args: Namespace):
    print("\nGenerated Outputs:\n" + "-" * 60)
    for prompt, output in zip(prompts, outputs):
        rewards = output.outputs.data
-        rewards_trimmed = (
-            (str(rewards[:16])[:-1] + ", ...]") if len(rewards) > 16 else rewards
-        )
-        print(f"Prompt: {prompt!r} \nReward: {rewards_trimmed} (size={len(rewards)})")
+        print(f"Prompt: {prompt!r}")
+        print_embeddings(rewards, prefix="Reward")
        print("-" * 60)


--- a/examples/basic/offline_inference/score.py
+++ b/examples/basic/offline_inference/score.py
--- a/examples/basic/online_serving/openai_chat_completion_client.py
+++ b/examples/basic/online_serving/openai_chat_completion_client.py
--- a/examples/basic/online_serving/openai_completion_client.py
+++ b/examples/basic/online_serving/openai_completion_client.py
--- a/tests/plugins_tests/test_platform_plugins.py
+++ b/tests/plugins_tests/test_platform_plugins.py
@@ -17,7 +17,7 @@ def test_platform_plugins():
    example_file = os.path.join(
        os.path.dirname(os.path.dirname(os.path.dirname(current_file))),
        "examples",
-        "offline_inference/basic/basic.py",
+        "basic/offline_inference/basic.py",
    )
    runpy.run_path(example_file)

--- a/vllm/utils/print_utils.py
+++ b/vllm/utils/print_utils.py
@@ -2,6 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project


-def print_embeddings(embeds: list[float]):
+def print_embeddings(embeds: list[float], prefix: str = "Embeddings"):
    embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds
-    print(f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
+    print(f"{prefix}: {embeds_trimmed} (size={len(embeds)})")