[Examples][1/n] Resettle basic examples. (#35579)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
wang.yuqi
2026-03-09 11:22:53 +08:00
committed by GitHub
parent 43aa389231
commit dcf8862fd4
26 changed files with 64 additions and 65 deletions

View File

@@ -34,7 +34,7 @@ function cpu_tests() {
# offline inference # offline inference
docker exec cpu-test bash -c " docker exec cpu-test bash -c "
set -e set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m"
# Run model tests # Run model tests
docker exec cpu-test bash -c " docker exec cpu-test bash -c "

View File

@@ -27,7 +27,7 @@ function cpu_tests() {
podman exec -it "$container_id" bash -c " podman exec -it "$container_id" bash -c "
export TORCH_COMPILE_DISABLE=1 export TORCH_COMPILE_DISABLE=1
set -xve set -xve
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
# Run basic model test # Run basic model test
podman exec -it "$container_id" bash -c " podman exec -it "$container_id" bash -c "

View File

@@ -25,5 +25,5 @@ remove_docker_container
# Run the image and test offline inference # Run the image and test offline inference
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B python3 examples/basic/offline_inference/generate.py --model meta-llama/Llama-3.2-1B
' '

View File

@@ -76,7 +76,7 @@ docker run --rm --runtime=habana --name="${container_name}" --network=host \
-e PT_HPU_LAZY_MODE=1 \ -e PT_HPU_LAZY_MODE=1 \
"${image_name}" \ "${image_name}" \
/bin/bash -c ' /bin/bash -c '
cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m cd vllm; timeout 120s python -u examples/basic/offline_inference/generate.py --model facebook/opt-125m
' '
EXITCODE=$? EXITCODE=$?

View File

@@ -34,15 +34,15 @@ docker run \
set -e set -e
echo $ZE_AFFINITY_MASK echo $ZE_AFFINITY_MASK
pip install tblib==3.1.0 pip install tblib==3.1.0
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8 python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
cd tests cd tests
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
pytest -v -s v1/engine pytest -v -s v1/engine

View File

@@ -529,12 +529,12 @@ steps:
commands: commands:
- pip install tensorizer # for tensorizer test - pip install tensorizer # for tensorizer test
# for basic # for basic
- python3 offline_inference/basic/chat.py - python3 basic/offline_inference/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m - python3 basic/offline_inference/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/classify.py - python3 basic/offline_inference/classify.py
- python3 offline_inference/basic/embed.py - python3 basic/offline_inference/embed.py
- python3 offline_inference/basic/score.py - python3 basic/offline_inference/score.py
# for multi-modal models # for multi-modal models
- python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0
@@ -1169,7 +1169,7 @@ steps:
- pytest -v -s tests/models/test_transformers.py - pytest -v -s tests/models/test_transformers.py
# - pytest -v -s tests/models/multimodal/processing/ # - pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
- python3 examples/offline_inference/basic/chat.py - python3 examples/basic/offline_inference/chat.py
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock # Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
@@ -2208,12 +2208,12 @@ steps:
commands: commands:
- pip install tensorizer # for tensorizer test - pip install tensorizer # for tensorizer test
# for basic # for basic
- python3 offline_inference/basic/chat.py - python3 basic/offline_inference/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m - python3 basic/offline_inference/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/classify.py - python3 basic/offline_inference/classify.py
- python3 offline_inference/basic/embed.py - python3 basic/offline_inference/embed.py
- python3 offline_inference/basic/score.py - python3 basic/offline_inference/score.py
# for multi-modal models # for multi-modal models
- python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0
@@ -2789,7 +2789,7 @@ steps:
- pytest -v -s tests/models/test_transformers.py - pytest -v -s tests/models/test_transformers.py
# - pytest -v -s tests/models/multimodal/processing/ # - pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
- python3 examples/offline_inference/basic/chat.py - python3 examples/basic/offline_inference/chat.py
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock # Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
@@ -2816,7 +2816,7 @@ steps:
- vllm/platforms/cuda.py - vllm/platforms/cuda.py
commands: commands:
- rocm-smi - rocm-smi
- python3 examples/offline_inference/basic/chat.py - python3 examples/basic/offline_inference/chat.py
# Attention # Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_attention_selector.py - pytest -v -s tests/kernels/attention/test_attention_selector.py

View File

@@ -96,7 +96,7 @@ steps:
- vllm/platforms/cuda.py - vllm/platforms/cuda.py
commands: commands:
- nvidia-smi - nvidia-smi
- python3 examples/offline_inference/basic/chat.py - python3 examples/basic/offline_inference/chat.py
# Attention # Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_attention_selector.py - pytest -v -s tests/kernels/attention/test_attention_selector.py

View File

@@ -67,12 +67,13 @@ steps:
- examples/ - examples/
commands: commands:
- pip install tensorizer # for tensorizer test - pip install tensorizer # for tensorizer test
- python3 offline_inference/basic/chat.py # for basic # for basic
- python3 offline_inference/basic/generate.py --model facebook/opt-125m - python3 basic/offline_inference/chat.py
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - python3 basic/offline_inference/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/classify.py - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/embed.py - python3 basic/offline_inference/classify.py
- python3 offline_inference/basic/score.py - python3 basic/offline_inference/embed.py
- python3 basic/offline_inference/score.py
# for multi-modal models # for multi-modal models
- python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0

View File

@@ -65,7 +65,7 @@ steps:
- pytest -v -s tests/models/test_transformers.py - pytest -v -s tests/models/test_transformers.py
- pytest -v -s tests/models/multimodal/processing/ - pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py - pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py - python3 examples/basic/offline_inference/chat.py
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock # Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

View File

@@ -259,7 +259,7 @@ ON_CPU=1 SERVING_JSON=serving-tests-cpu-text.json DRY_RUN=1 MODEL_FILTER=meta-ll
# On this platform, it is recommended to only bind openMP threads on logical CPU cores 0-7 or 8-15 # On this platform, it is recommended to only bind openMP threads on logical CPU cores 0-7 or 8-15
$ export VLLM_CPU_OMP_THREADS_BIND=0-7 $ export VLLM_CPU_OMP_THREADS_BIND=0-7
$ python examples/offline_inference/basic/basic.py $ python examples/basic/offline_inference/basic.py
``` ```
- When deploying vLLM CPU backend on a multi-socket machine with NUMA and enable tensor parallel or pipeline parallel, each NUMA node is treated as a TP/PP rank. So be aware to set CPU cores of a single rank on the same NUMA node to avoid cross NUMA node memory access. - When deploying vLLM CPU backend on a multi-socket machine with NUMA and enable tensor parallel or pipeline parallel, each NUMA node is treated as a TP/PP rank. So be aware to set CPU cores of a single rank on the same NUMA node to avoid cross NUMA node memory access.

View File

@@ -75,7 +75,7 @@ This guide will help you quickly get started with vLLM to perform:
## Offline Batched Inference ## Offline Batched Inference
With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py) With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: [examples/basic/offline_inference/basic.py](../../examples/basic/offline_inference/basic.py)
The first line of this example imports the classes [LLM][vllm.LLM] and [SamplingParams][vllm.SamplingParams]: The first line of this example imports the classes [LLM][vllm.LLM] and [SamplingParams][vllm.SamplingParams]:
@@ -228,7 +228,7 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep
print("Completion result:", completion) print("Completion result:", completion)
``` ```
A more detailed client example can be found here: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py) A more detailed client example can be found here: [examples/basic/offline_inference/basic.py](../../examples/basic/offline_inference/basic.py)
### OpenAI Chat Completions API with vLLM ### OpenAI Chat Completions API with vLLM

View File

@@ -59,7 +59,7 @@ for output in outputs:
By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the huggingface model repository if it exists. In most cases, this will provide you with the best results by default if [SamplingParams][vllm.SamplingParams] is not specified. By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the huggingface model repository if it exists. In most cases, this will provide you with the best results by default if [SamplingParams][vllm.SamplingParams] is not specified.
However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the [LLM][vllm.LLM] instance. However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the [LLM][vllm.LLM] instance.
A code example can be found here: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py) A code example can be found here: [examples/basic/offline_inference/basic.py](../../examples/basic/offline_inference/basic.py)
### `LLM.beam_search` ### `LLM.beam_search`
@@ -121,7 +121,7 @@ and automatically applies the model's [chat template](https://huggingface.co/doc
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
``` ```
A code example can be found here: [examples/offline_inference/basic/chat.py](../../examples/offline_inference/basic/chat.py) A code example can be found here: [examples/basic/offline_inference/chat.py](../../examples/basic/offline_inference/chat.py)
If the model doesn't have a chat template or you want to specify another one, If the model doesn't have a chat template or you want to specify another one,
you can explicitly pass a chat template: you can explicitly pass a chat template:

View File

@@ -99,7 +99,7 @@ embeds = output.outputs.embedding
print(f"Embeddings: {embeds!r} (size={len(embeds)})") print(f"Embeddings: {embeds!r} (size={len(embeds)})")
``` ```
A code example can be found here: [examples/offline_inference/basic/embed.py](../../examples/offline_inference/basic/embed.py) A code example can be found here: [examples/basic/offline_inference/embed.py](../../examples/basic/offline_inference/embed.py)
### `LLM.classify` ### `LLM.classify`
@@ -116,7 +116,7 @@ probs = output.outputs.probs
print(f"Class Probabilities: {probs!r} (size={len(probs)})") print(f"Class Probabilities: {probs!r} (size={len(probs)})")
``` ```
A code example can be found here: [examples/offline_inference/basic/classify.py](../../examples/offline_inference/basic/classify.py) A code example can be found here: [examples/basic/offline_inference/classify.py](../../examples/basic/offline_inference/classify.py)
### `LLM.score` ### `LLM.score`
@@ -140,7 +140,7 @@ score = output.outputs.score
print(f"Score: {score}") print(f"Score: {score}")
``` ```
A code example can be found here: [examples/offline_inference/basic/score.py](../../examples/offline_inference/basic/score.py) A code example can be found here: [examples/basic/offline_inference/score.py](../../examples/basic/offline_inference/score.py)
### `LLM.reward` ### `LLM.reward`
@@ -156,7 +156,7 @@ data = output.outputs.data
print(f"Data: {data!r}") print(f"Data: {data!r}")
``` ```
A code example can be found here: [examples/offline_inference/basic/reward.py](../../examples/offline_inference/basic/reward.py) A code example can be found here: [examples/basic/offline_inference/reward.py](../../examples/basic/offline_inference/reward.py)
### `LLM.encode` ### `LLM.encode`

View File

@@ -190,7 +190,7 @@ vllm serve NousResearch/Meta-Llama-3-8B-Instruct --enable-offline-docs
Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions); Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions);
you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
Code example: [examples/online_serving/openai_completion_client.py](../../examples/online_serving/openai_completion_client.py) Code example: [examples/basic/online_serving/openai_completion_client.py](../../examples/basic/online_serving/openai_completion_client.py)
#### Extra parameters #### Extra parameters
@@ -221,7 +221,7 @@ see our [Multimodal Inputs](../features/multimodal_inputs.md) guide for more inf
- *Note: `image_url.detail` parameter is not supported.* - *Note: `image_url.detail` parameter is not supported.*
Code example: [examples/online_serving/openai_chat_completion_client.py](../../examples/online_serving/openai_chat_completion_client.py) Code example: [examples/basic/online_serving/openai_chat_completion_client.py](../../examples/basic/online_serving/openai_chat_completion_client.py)
#### Extra parameters #### Extra parameters

View File

@@ -1,4 +1,4 @@
# Basic # Offline Inference
The `LLM` class provides the primary Python interface for doing offline inference, which is interacting with a model without using a separate model inference server. The `LLM` class provides the primary Python interface for doing offline inference, which is interacting with a model without using a separate model inference server.
@@ -7,31 +7,31 @@ The `LLM` class provides the primary Python interface for doing offline inferenc
The first script in this example shows the most basic usage of vLLM. If you are new to Python and vLLM, you should start here. The first script in this example shows the most basic usage of vLLM. If you are new to Python and vLLM, you should start here.
```bash ```bash
python examples/offline_inference/basic/basic.py python examples/basic/offline_inference/basic.py
``` ```
The rest of the scripts include an [argument parser](https://docs.python.org/3/library/argparse.html), which you can use to pass any arguments that are compatible with [`LLM`](https://docs.vllm.ai/en/latest/api/offline_inference/llm.html). Try running the script with `--help` for a list of all available arguments. The rest of the scripts include an [argument parser](https://docs.python.org/3/library/argparse.html), which you can use to pass any arguments that are compatible with [`LLM`](https://docs.vllm.ai/en/latest/api/offline_inference/llm.html). Try running the script with `--help` for a list of all available arguments.
```bash ```bash
python examples/offline_inference/basic/classify.py python examples/basic/offline_inference/classify.py
``` ```
```bash ```bash
python examples/offline_inference/basic/embed.py python examples/basic/offline_inference/embed.py
``` ```
```bash ```bash
python examples/offline_inference/basic/score.py python examples/basic/offline_inference/score.py
``` ```
The chat and generate scripts also accept the [sampling parameters](https://docs.vllm.ai/en/latest/api/inference_params.html#sampling-parameters): `max_tokens`, `temperature`, `top_p` and `top_k`. The chat and generate scripts also accept the [sampling parameters](https://docs.vllm.ai/en/latest/api/inference_params.html#sampling-parameters): `max_tokens`, `temperature`, `top_p` and `top_k`.
```bash ```bash
python examples/offline_inference/basic/chat.py python examples/basic/offline_inference/chat.py
``` ```
```bash ```bash
python examples/offline_inference/basic/generate.py python examples/basic/offline_inference/generate.py
``` ```
## Features ## Features

View File

@@ -5,6 +5,7 @@ from argparse import Namespace
from vllm import LLM, EngineArgs from vllm import LLM, EngineArgs
from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.print_utils import print_embeddings
def parse_args(): def parse_args():
@@ -39,10 +40,8 @@ def main(args: Namespace):
print("\nGenerated Outputs:\n" + "-" * 60) print("\nGenerated Outputs:\n" + "-" * 60)
for prompt, output in zip(prompts, outputs): for prompt, output in zip(prompts, outputs):
embeds = output.outputs.embedding embeds = output.outputs.embedding
embeds_trimmed = ( print(f"Prompt: {prompt!r}")
(str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds print_embeddings(embeds)
)
print(f"Prompt: {prompt!r} \nEmbeddings: {embeds_trimmed} (size={len(embeds)})")
print("-" * 60) print("-" * 60)

View File

@@ -5,6 +5,7 @@ from argparse import Namespace
from vllm import LLM, EngineArgs from vllm import LLM, EngineArgs
from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.print_utils import print_embeddings
def parse_args(): def parse_args():
@@ -41,10 +42,8 @@ def main(args: Namespace):
print("\nGenerated Outputs:\n" + "-" * 60) print("\nGenerated Outputs:\n" + "-" * 60)
for prompt, output in zip(prompts, outputs): for prompt, output in zip(prompts, outputs):
rewards = output.outputs.data rewards = output.outputs.data
rewards_trimmed = ( print(f"Prompt: {prompt!r}")
(str(rewards[:16])[:-1] + ", ...]") if len(rewards) > 16 else rewards print_embeddings(rewards, prefix="Reward")
)
print(f"Prompt: {prompt!r} \nReward: {rewards_trimmed} (size={len(rewards)})")
print("-" * 60) print("-" * 60)

View File

@@ -17,7 +17,7 @@ def test_platform_plugins():
example_file = os.path.join( example_file = os.path.join(
os.path.dirname(os.path.dirname(os.path.dirname(current_file))), os.path.dirname(os.path.dirname(os.path.dirname(current_file))),
"examples", "examples",
"offline_inference/basic/basic.py", "basic/offline_inference/basic.py",
) )
runpy.run_path(example_file) runpy.run_path(example_file)

View File

@@ -2,6 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
def print_embeddings(embeds: list[float]): def print_embeddings(embeds: list[float], prefix: str = "Embeddings"):
embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds
print(f"Embeddings: {embeds_trimmed} (size={len(embeds)})") print(f"{prefix}: {embeds_trimmed} (size={len(embeds)})")