[Examples][1/n] Resettle basic examples. (#35579)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io> Signed-off-by: wang.yuqi <noooop@126.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -34,7 +34,7 @@ function cpu_tests() {
|
||||
# offline inference
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m"
|
||||
|
||||
# Run model tests
|
||||
docker exec cpu-test bash -c "
|
||||
|
||||
@@ -27,7 +27,7 @@ function cpu_tests() {
|
||||
podman exec -it "$container_id" bash -c "
|
||||
export TORCH_COMPILE_DISABLE=1
|
||||
set -xve
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
|
||||
|
||||
# Run basic model test
|
||||
podman exec -it "$container_id" bash -c "
|
||||
|
||||
@@ -25,5 +25,5 @@ remove_docker_container
|
||||
|
||||
# Run the image and test offline inference
|
||||
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
|
||||
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
|
||||
python3 examples/basic/offline_inference/generate.py --model meta-llama/Llama-3.2-1B
|
||||
'
|
||||
|
||||
@@ -76,7 +76,7 @@ docker run --rm --runtime=habana --name="${container_name}" --network=host \
|
||||
-e PT_HPU_LAZY_MODE=1 \
|
||||
"${image_name}" \
|
||||
/bin/bash -c '
|
||||
cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
cd vllm; timeout 120s python -u examples/basic/offline_inference/generate.py --model facebook/opt-125m
|
||||
'
|
||||
|
||||
EXITCODE=$?
|
||||
|
||||
@@ -34,15 +34,15 @@ docker run \
|
||||
set -e
|
||||
echo $ZE_AFFINITY_MASK
|
||||
pip install tblib==3.1.0
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
|
||||
python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
|
||||
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
|
||||
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
|
||||
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
|
||||
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
|
||||
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
|
||||
cd tests
|
||||
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
|
||||
pytest -v -s v1/engine
|
||||
|
||||
@@ -529,12 +529,12 @@ steps:
|
||||
commands:
|
||||
- pip install tensorizer # for tensorizer test
|
||||
# for basic
|
||||
- python3 offline_inference/basic/chat.py
|
||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 offline_inference/basic/classify.py
|
||||
- python3 offline_inference/basic/embed.py
|
||||
- python3 offline_inference/basic/score.py
|
||||
- python3 basic/offline_inference/chat.py
|
||||
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
|
||||
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 basic/offline_inference/classify.py
|
||||
- python3 basic/offline_inference/embed.py
|
||||
- python3 basic/offline_inference/score.py
|
||||
# for multi-modal models
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
@@ -1169,7 +1169,7 @@ steps:
|
||||
- pytest -v -s tests/models/test_transformers.py
|
||||
# - pytest -v -s tests/models/multimodal/processing/
|
||||
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
- python3 examples/basic/offline_inference/chat.py
|
||||
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||
# Whisper needs spawn method to avoid deadlock
|
||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||
@@ -2208,12 +2208,12 @@ steps:
|
||||
commands:
|
||||
- pip install tensorizer # for tensorizer test
|
||||
# for basic
|
||||
- python3 offline_inference/basic/chat.py
|
||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 offline_inference/basic/classify.py
|
||||
- python3 offline_inference/basic/embed.py
|
||||
- python3 offline_inference/basic/score.py
|
||||
- python3 basic/offline_inference/chat.py
|
||||
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
|
||||
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 basic/offline_inference/classify.py
|
||||
- python3 basic/offline_inference/embed.py
|
||||
- python3 basic/offline_inference/score.py
|
||||
# for multi-modal models
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
@@ -2789,7 +2789,7 @@ steps:
|
||||
- pytest -v -s tests/models/test_transformers.py
|
||||
# - pytest -v -s tests/models/multimodal/processing/
|
||||
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
- python3 examples/basic/offline_inference/chat.py
|
||||
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||
# Whisper needs spawn method to avoid deadlock
|
||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||
@@ -2816,7 +2816,7 @@ steps:
|
||||
- vllm/platforms/cuda.py
|
||||
commands:
|
||||
- rocm-smi
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
- python3 examples/basic/offline_inference/chat.py
|
||||
# Attention
|
||||
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
||||
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
||||
|
||||
@@ -96,7 +96,7 @@ steps:
|
||||
- vllm/platforms/cuda.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
- python3 examples/basic/offline_inference/chat.py
|
||||
# Attention
|
||||
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
||||
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
||||
|
||||
@@ -67,12 +67,13 @@ steps:
|
||||
- examples/
|
||||
commands:
|
||||
- pip install tensorizer # for tensorizer test
|
||||
- python3 offline_inference/basic/chat.py # for basic
|
||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 offline_inference/basic/classify.py
|
||||
- python3 offline_inference/basic/embed.py
|
||||
- python3 offline_inference/basic/score.py
|
||||
# for basic
|
||||
- python3 basic/offline_inference/chat.py
|
||||
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
|
||||
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 basic/offline_inference/classify.py
|
||||
- python3 basic/offline_inference/embed.py
|
||||
- python3 basic/offline_inference/score.py
|
||||
# for multi-modal models
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
|
||||
@@ -65,7 +65,7 @@ steps:
|
||||
- pytest -v -s tests/models/test_transformers.py
|
||||
- pytest -v -s tests/models/multimodal/processing/
|
||||
- pytest -v -s tests/models/multimodal/test_mapping.py
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
- python3 examples/basic/offline_inference/chat.py
|
||||
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||
# Whisper needs spawn method to avoid deadlock
|
||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||
|
||||
@@ -259,7 +259,7 @@ ON_CPU=1 SERVING_JSON=serving-tests-cpu-text.json DRY_RUN=1 MODEL_FILTER=meta-ll
|
||||
|
||||
# On this platform, it is recommended to bind OpenMP threads only to logical CPU cores 0-7 or 8-15
|
||||
$ export VLLM_CPU_OMP_THREADS_BIND=0-7
|
||||
$ python examples/offline_inference/basic/basic.py
|
||||
$ python examples/basic/offline_inference/basic.py
|
||||
```
|
||||
|
||||
- When deploying the vLLM CPU backend on a multi-socket machine with NUMA and enabling tensor parallelism or pipeline parallelism, each NUMA node is treated as a TP/PP rank. Be sure to keep the CPU cores of a single rank on the same NUMA node to avoid cross-NUMA-node memory access.
|
||||
|
||||
@@ -75,7 +75,7 @@ This guide will help you quickly get started with vLLM to perform:
|
||||
|
||||
## Offline Batched Inference
|
||||
|
||||
With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py)
|
||||
With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: [examples/basic/offline_inference/basic.py](../../examples/basic/offline_inference/basic.py)
|
||||
|
||||
The first line of this example imports the classes [LLM][vllm.LLM] and [SamplingParams][vllm.SamplingParams]:
|
||||
|
||||
@@ -228,7 +228,7 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep
|
||||
print("Completion result:", completion)
|
||||
```
|
||||
|
||||
A more detailed client example can be found here: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py)
|
||||
A more detailed client example can be found here: [examples/basic/offline_inference/basic.py](../../examples/basic/offline_inference/basic.py)
|
||||
|
||||
### OpenAI Chat Completions API with vLLM
|
||||
|
||||
|
||||
@@ -59,7 +59,7 @@ for output in outputs:
|
||||
By default, vLLM will use the sampling parameters recommended by the model creator by applying the `generation_config.json` from the Hugging Face model repository if it exists. In most cases, this will provide you with the best results by default if [SamplingParams][vllm.SamplingParams] is not specified.
|
||||
|
||||
However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the [LLM][vllm.LLM] instance.
|
||||
A code example can be found here: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py)
|
||||
A code example can be found here: [examples/basic/offline_inference/basic.py](../../examples/basic/offline_inference/basic.py)
|
||||
|
||||
### `LLM.beam_search`
|
||||
|
||||
@@ -121,7 +121,7 @@ and automatically applies the model's [chat template](https://huggingface.co/doc
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
```
|
||||
|
||||
A code example can be found here: [examples/offline_inference/basic/chat.py](../../examples/offline_inference/basic/chat.py)
|
||||
A code example can be found here: [examples/basic/offline_inference/chat.py](../../examples/basic/offline_inference/chat.py)
|
||||
|
||||
If the model doesn't have a chat template or you want to specify another one,
|
||||
you can explicitly pass a chat template:
|
||||
|
||||
@@ -99,7 +99,7 @@ embeds = output.outputs.embedding
|
||||
print(f"Embeddings: {embeds!r} (size={len(embeds)})")
|
||||
```
|
||||
|
||||
A code example can be found here: [examples/offline_inference/basic/embed.py](../../examples/offline_inference/basic/embed.py)
|
||||
A code example can be found here: [examples/basic/offline_inference/embed.py](../../examples/basic/offline_inference/embed.py)
|
||||
|
||||
### `LLM.classify`
|
||||
|
||||
@@ -116,7 +116,7 @@ probs = output.outputs.probs
|
||||
print(f"Class Probabilities: {probs!r} (size={len(probs)})")
|
||||
```
|
||||
|
||||
A code example can be found here: [examples/offline_inference/basic/classify.py](../../examples/offline_inference/basic/classify.py)
|
||||
A code example can be found here: [examples/basic/offline_inference/classify.py](../../examples/basic/offline_inference/classify.py)
|
||||
|
||||
### `LLM.score`
|
||||
|
||||
@@ -140,7 +140,7 @@ score = output.outputs.score
|
||||
print(f"Score: {score}")
|
||||
```
|
||||
|
||||
A code example can be found here: [examples/offline_inference/basic/score.py](../../examples/offline_inference/basic/score.py)
|
||||
A code example can be found here: [examples/basic/offline_inference/score.py](../../examples/basic/offline_inference/score.py)
|
||||
|
||||
### `LLM.reward`
|
||||
|
||||
@@ -156,7 +156,7 @@ data = output.outputs.data
|
||||
print(f"Data: {data!r}")
|
||||
```
|
||||
|
||||
A code example can be found here: [examples/offline_inference/basic/reward.py](../../examples/offline_inference/basic/reward.py)
|
||||
A code example can be found here: [examples/basic/offline_inference/reward.py](../../examples/basic/offline_inference/reward.py)
|
||||
|
||||
### `LLM.encode`
|
||||
|
||||
|
||||
@@ -190,7 +190,7 @@ vllm serve NousResearch/Meta-Llama-3-8B-Instruct --enable-offline-docs
|
||||
Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions);
|
||||
you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
|
||||
|
||||
Code example: [examples/online_serving/openai_completion_client.py](../../examples/online_serving/openai_completion_client.py)
|
||||
Code example: [examples/basic/online_serving/openai_completion_client.py](../../examples/basic/online_serving/openai_completion_client.py)
|
||||
|
||||
#### Extra parameters
|
||||
|
||||
@@ -221,7 +221,7 @@ see our [Multimodal Inputs](../features/multimodal_inputs.md) guide for more inf
|
||||
|
||||
- *Note: `image_url.detail` parameter is not supported.*
|
||||
|
||||
Code example: [examples/online_serving/openai_chat_completion_client.py](../../examples/online_serving/openai_chat_completion_client.py)
|
||||
Code example: [examples/basic/online_serving/openai_chat_completion_client.py](../../examples/basic/online_serving/openai_chat_completion_client.py)
|
||||
|
||||
#### Extra parameters
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# Basic
|
||||
# Offline Inference
|
||||
|
||||
The `LLM` class provides the primary Python interface for doing offline inference, which is interacting with a model without using a separate model inference server.
|
||||
|
||||
@@ -7,31 +7,31 @@ The `LLM` class provides the primary Python interface for doing offline inferenc
|
||||
The first script in this example shows the most basic usage of vLLM. If you are new to Python and vLLM, you should start here.
|
||||
|
||||
```bash
|
||||
python examples/offline_inference/basic/basic.py
|
||||
python examples/basic/offline_inference/basic.py
|
||||
```
|
||||
|
||||
The rest of the scripts include an [argument parser](https://docs.python.org/3/library/argparse.html), which you can use to pass any arguments that are compatible with [`LLM`](https://docs.vllm.ai/en/latest/api/offline_inference/llm.html). Try running the script with `--help` for a list of all available arguments.
|
||||
|
||||
```bash
|
||||
python examples/offline_inference/basic/classify.py
|
||||
python examples/basic/offline_inference/classify.py
|
||||
```
|
||||
|
||||
```bash
|
||||
python examples/offline_inference/basic/embed.py
|
||||
python examples/basic/offline_inference/embed.py
|
||||
```
|
||||
|
||||
```bash
|
||||
python examples/offline_inference/basic/score.py
|
||||
python examples/basic/offline_inference/score.py
|
||||
```
|
||||
|
||||
The chat and generate scripts also accept the [sampling parameters](https://docs.vllm.ai/en/latest/api/inference_params.html#sampling-parameters): `max_tokens`, `temperature`, `top_p` and `top_k`.
|
||||
|
||||
```bash
|
||||
python examples/offline_inference/basic/chat.py
|
||||
python examples/basic/offline_inference/chat.py
|
||||
```
|
||||
|
||||
```bash
|
||||
python examples/offline_inference/basic/generate.py
|
||||
python examples/basic/offline_inference/generate.py
|
||||
```
|
||||
|
||||
## Features
|
||||
@@ -5,6 +5,7 @@ from argparse import Namespace
|
||||
|
||||
from vllm import LLM, EngineArgs
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.utils.print_utils import print_embeddings
|
||||
|
||||
|
||||
def parse_args():
|
||||
@@ -39,10 +40,8 @@ def main(args: Namespace):
|
||||
print("\nGenerated Outputs:\n" + "-" * 60)
|
||||
for prompt, output in zip(prompts, outputs):
|
||||
embeds = output.outputs.embedding
|
||||
embeds_trimmed = (
|
||||
(str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds
|
||||
)
|
||||
print(f"Prompt: {prompt!r} \nEmbeddings: {embeds_trimmed} (size={len(embeds)})")
|
||||
print(f"Prompt: {prompt!r}")
|
||||
print_embeddings(embeds)
|
||||
print("-" * 60)
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ from argparse import Namespace
|
||||
|
||||
from vllm import LLM, EngineArgs
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.utils.print_utils import print_embeddings
|
||||
|
||||
|
||||
def parse_args():
|
||||
@@ -41,10 +42,8 @@ def main(args: Namespace):
|
||||
print("\nGenerated Outputs:\n" + "-" * 60)
|
||||
for prompt, output in zip(prompts, outputs):
|
||||
rewards = output.outputs.data
|
||||
rewards_trimmed = (
|
||||
(str(rewards[:16])[:-1] + ", ...]") if len(rewards) > 16 else rewards
|
||||
)
|
||||
print(f"Prompt: {prompt!r} \nReward: {rewards_trimmed} (size={len(rewards)})")
|
||||
print(f"Prompt: {prompt!r}")
|
||||
print_embeddings(rewards, prefix="Reward")
|
||||
print("-" * 60)
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@ def test_platform_plugins():
|
||||
example_file = os.path.join(
|
||||
os.path.dirname(os.path.dirname(os.path.dirname(current_file))),
|
||||
"examples",
|
||||
"offline_inference/basic/basic.py",
|
||||
"basic/offline_inference/basic.py",
|
||||
)
|
||||
runpy.run_path(example_file)
|
||||
|
||||
|
||||
@@ -2,6 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
def print_embeddings(embeds: list[float]):
|
||||
def print_embeddings(embeds: list[float], prefix: str = "Embeddings"):
|
||||
embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds
|
||||
print(f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
|
||||
print(f"{prefix}: {embeds_trimmed} (size={len(embeds)})")
|
||||
|
||||
Reference in New Issue
Block a user