[Examples][1/n] Resettle basic examples. (#35579)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io> Signed-off-by: wang.yuqi <noooop@126.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -34,7 +34,7 @@ function cpu_tests() {
|
|||||||
# offline inference
|
# offline inference
|
||||||
docker exec cpu-test bash -c "
|
docker exec cpu-test bash -c "
|
||||||
set -e
|
set -e
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m"
|
||||||
|
|
||||||
# Run model tests
|
# Run model tests
|
||||||
docker exec cpu-test bash -c "
|
docker exec cpu-test bash -c "
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ function cpu_tests() {
|
|||||||
podman exec -it "$container_id" bash -c "
|
podman exec -it "$container_id" bash -c "
|
||||||
export TORCH_COMPILE_DISABLE=1
|
export TORCH_COMPILE_DISABLE=1
|
||||||
set -xve
|
set -xve
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
|
||||||
|
|
||||||
# Run basic model test
|
# Run basic model test
|
||||||
podman exec -it "$container_id" bash -c "
|
podman exec -it "$container_id" bash -c "
|
||||||
|
|||||||
@@ -25,5 +25,5 @@ remove_docker_container
|
|||||||
|
|
||||||
# Run the image and test offline inference
|
# Run the image and test offline inference
|
||||||
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
|
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
|
||||||
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
|
python3 examples/basic/offline_inference/generate.py --model meta-llama/Llama-3.2-1B
|
||||||
'
|
'
|
||||||
|
|||||||
@@ -76,7 +76,7 @@ docker run --rm --runtime=habana --name="${container_name}" --network=host \
|
|||||||
-e PT_HPU_LAZY_MODE=1 \
|
-e PT_HPU_LAZY_MODE=1 \
|
||||||
"${image_name}" \
|
"${image_name}" \
|
||||||
/bin/bash -c '
|
/bin/bash -c '
|
||||||
cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
cd vllm; timeout 120s python -u examples/basic/offline_inference/generate.py --model facebook/opt-125m
|
||||||
'
|
'
|
||||||
|
|
||||||
EXITCODE=$?
|
EXITCODE=$?
|
||||||
|
|||||||
@@ -34,15 +34,15 @@ docker run \
|
|||||||
set -e
|
set -e
|
||||||
echo $ZE_AFFINITY_MASK
|
echo $ZE_AFFINITY_MASK
|
||||||
pip install tblib==3.1.0
|
pip install tblib==3.1.0
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
|
||||||
python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
|
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
|
||||||
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
|
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
|
||||||
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
|
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
|
||||||
cd tests
|
cd tests
|
||||||
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
|
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
|
||||||
pytest -v -s v1/engine
|
pytest -v -s v1/engine
|
||||||
|
|||||||
@@ -529,12 +529,12 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pip install tensorizer # for tensorizer test
|
- pip install tensorizer # for tensorizer test
|
||||||
# for basic
|
# for basic
|
||||||
- python3 offline_inference/basic/chat.py
|
- python3 basic/offline_inference/chat.py
|
||||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
|
||||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||||
- python3 offline_inference/basic/classify.py
|
- python3 basic/offline_inference/classify.py
|
||||||
- python3 offline_inference/basic/embed.py
|
- python3 basic/offline_inference/embed.py
|
||||||
- python3 offline_inference/basic/score.py
|
- python3 basic/offline_inference/score.py
|
||||||
# for multi-modal models
|
# for multi-modal models
|
||||||
- python3 offline_inference/audio_language.py --seed 0
|
- python3 offline_inference/audio_language.py --seed 0
|
||||||
- python3 offline_inference/vision_language.py --seed 0
|
- python3 offline_inference/vision_language.py --seed 0
|
||||||
@@ -1169,7 +1169,7 @@ steps:
|
|||||||
- pytest -v -s tests/models/test_transformers.py
|
- pytest -v -s tests/models/test_transformers.py
|
||||||
# - pytest -v -s tests/models/multimodal/processing/
|
# - pytest -v -s tests/models/multimodal/processing/
|
||||||
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
|
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/basic/offline_inference/chat.py
|
||||||
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||||
# Whisper needs spawn method to avoid deadlock
|
# Whisper needs spawn method to avoid deadlock
|
||||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||||
@@ -2208,12 +2208,12 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pip install tensorizer # for tensorizer test
|
- pip install tensorizer # for tensorizer test
|
||||||
# for basic
|
# for basic
|
||||||
- python3 offline_inference/basic/chat.py
|
- python3 basic/offline_inference/chat.py
|
||||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
|
||||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||||
- python3 offline_inference/basic/classify.py
|
- python3 basic/offline_inference/classify.py
|
||||||
- python3 offline_inference/basic/embed.py
|
- python3 basic/offline_inference/embed.py
|
||||||
- python3 offline_inference/basic/score.py
|
- python3 basic/offline_inference/score.py
|
||||||
# for multi-modal models
|
# for multi-modal models
|
||||||
- python3 offline_inference/audio_language.py --seed 0
|
- python3 offline_inference/audio_language.py --seed 0
|
||||||
- python3 offline_inference/vision_language.py --seed 0
|
- python3 offline_inference/vision_language.py --seed 0
|
||||||
@@ -2789,7 +2789,7 @@ steps:
|
|||||||
- pytest -v -s tests/models/test_transformers.py
|
- pytest -v -s tests/models/test_transformers.py
|
||||||
# - pytest -v -s tests/models/multimodal/processing/
|
# - pytest -v -s tests/models/multimodal/processing/
|
||||||
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
|
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/basic/offline_inference/chat.py
|
||||||
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||||
# Whisper needs spawn method to avoid deadlock
|
# Whisper needs spawn method to avoid deadlock
|
||||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||||
@@ -2816,7 +2816,7 @@ steps:
|
|||||||
- vllm/platforms/cuda.py
|
- vllm/platforms/cuda.py
|
||||||
commands:
|
commands:
|
||||||
- rocm-smi
|
- rocm-smi
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/basic/offline_inference/chat.py
|
||||||
# Attention
|
# Attention
|
||||||
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
||||||
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
||||||
|
|||||||
@@ -96,7 +96,7 @@ steps:
|
|||||||
- vllm/platforms/cuda.py
|
- vllm/platforms/cuda.py
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/basic/offline_inference/chat.py
|
||||||
# Attention
|
# Attention
|
||||||
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
||||||
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
||||||
|
|||||||
@@ -67,12 +67,13 @@ steps:
|
|||||||
- examples/
|
- examples/
|
||||||
commands:
|
commands:
|
||||||
- pip install tensorizer # for tensorizer test
|
- pip install tensorizer # for tensorizer test
|
||||||
- python3 offline_inference/basic/chat.py # for basic
|
# for basic
|
||||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
- python3 basic/offline_inference/chat.py
|
||||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
|
||||||
- python3 offline_inference/basic/classify.py
|
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||||
- python3 offline_inference/basic/embed.py
|
- python3 basic/offline_inference/classify.py
|
||||||
- python3 offline_inference/basic/score.py
|
- python3 basic/offline_inference/embed.py
|
||||||
|
- python3 basic/offline_inference/score.py
|
||||||
# for multi-modal models
|
# for multi-modal models
|
||||||
- python3 offline_inference/audio_language.py --seed 0
|
- python3 offline_inference/audio_language.py --seed 0
|
||||||
- python3 offline_inference/vision_language.py --seed 0
|
- python3 offline_inference/vision_language.py --seed 0
|
||||||
|
|||||||
@@ -65,7 +65,7 @@ steps:
|
|||||||
- pytest -v -s tests/models/test_transformers.py
|
- pytest -v -s tests/models/test_transformers.py
|
||||||
- pytest -v -s tests/models/multimodal/processing/
|
- pytest -v -s tests/models/multimodal/processing/
|
||||||
- pytest -v -s tests/models/multimodal/test_mapping.py
|
- pytest -v -s tests/models/multimodal/test_mapping.py
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/basic/offline_inference/chat.py
|
||||||
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||||
# Whisper needs spawn method to avoid deadlock
|
# Whisper needs spawn method to avoid deadlock
|
||||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||||
|
|||||||
@@ -259,7 +259,7 @@ ON_CPU=1 SERVING_JSON=serving-tests-cpu-text.json DRY_RUN=1 MODEL_FILTER=meta-ll
|
|||||||
|
|
||||||
# On this platform, it is recommended to only bind openMP threads on logical CPU cores 0-7 or 8-15
|
# On this platform, it is recommended to only bind openMP threads on logical CPU cores 0-7 or 8-15
|
||||||
$ export VLLM_CPU_OMP_THREADS_BIND=0-7
|
$ export VLLM_CPU_OMP_THREADS_BIND=0-7
|
||||||
$ python examples/offline_inference/basic/basic.py
|
$ python examples/basic/offline_inference/basic.py
|
||||||
```
|
```
|
||||||
|
|
||||||
- When deploying the vLLM CPU backend on a multi-socket machine with NUMA and enabling tensor parallelism or pipeline parallelism, each NUMA node is treated as a TP/PP rank. Make sure that all CPU cores assigned to a single rank are on the same NUMA node to avoid cross-NUMA-node memory access.
|
- When deploying the vLLM CPU backend on a multi-socket machine with NUMA and enabling tensor parallelism or pipeline parallelism, each NUMA node is treated as a TP/PP rank. Make sure that all CPU cores assigned to a single rank are on the same NUMA node to avoid cross-NUMA-node memory access.
|
||||||
|
|||||||
@@ -75,7 +75,7 @@ This guide will help you quickly get started with vLLM to perform:
|
|||||||
|
|
||||||
## Offline Batched Inference
|
## Offline Batched Inference
|
||||||
|
|
||||||
With vLLM installed, you can start generating text for a list of input prompts (i.e., offline batch inference). See the example script: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py)
|
With vLLM installed, you can start generating text for a list of input prompts (i.e., offline batch inference). See the example script: [examples/basic/offline_inference/basic.py](../../examples/basic/offline_inference/basic.py)
|
||||||
|
|
||||||
The first line of this example imports the classes [LLM][vllm.LLM] and [SamplingParams][vllm.SamplingParams]:
|
The first line of this example imports the classes [LLM][vllm.LLM] and [SamplingParams][vllm.SamplingParams]:
|
||||||
|
|
||||||
@@ -228,7 +228,7 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep
|
|||||||
print("Completion result:", completion)
|
print("Completion result:", completion)
|
||||||
```
|
```
|
||||||
|
|
||||||
A more detailed client example can be found here: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py)
|
A more detailed client example can be found here: [examples/basic/online_serving/openai_completion_client.py](../../examples/basic/online_serving/openai_completion_client.py)
|
||||||
|
|
||||||
### OpenAI Chat Completions API with vLLM
|
### OpenAI Chat Completions API with vLLM
|
||||||
|
|
||||||
|
|||||||
@@ -59,7 +59,7 @@ for output in outputs:
|
|||||||
By default, vLLM will use the sampling parameters recommended by the model creator by applying the `generation_config.json` from the Hugging Face model repository, if it exists. In most cases, this will provide you with the best results by default if [SamplingParams][vllm.SamplingParams] is not specified.
|
By default, vLLM will use the sampling parameters recommended by the model creator by applying the `generation_config.json` from the Hugging Face model repository, if it exists. In most cases, this will provide you with the best results by default if [SamplingParams][vllm.SamplingParams] is not specified.
|
||||||
|
|
||||||
However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the [LLM][vllm.LLM] instance.
|
However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the [LLM][vllm.LLM] instance.
|
||||||
A code example can be found here: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py)
|
A code example can be found here: [examples/basic/offline_inference/basic.py](../../examples/basic/offline_inference/basic.py)
|
||||||
|
|
||||||
### `LLM.beam_search`
|
### `LLM.beam_search`
|
||||||
|
|
||||||
@@ -121,7 +121,7 @@ and automatically applies the model's [chat template](https://huggingface.co/doc
|
|||||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||||
```
|
```
|
||||||
|
|
||||||
A code example can be found here: [examples/offline_inference/basic/chat.py](../../examples/offline_inference/basic/chat.py)
|
A code example can be found here: [examples/basic/offline_inference/chat.py](../../examples/basic/offline_inference/chat.py)
|
||||||
|
|
||||||
If the model doesn't have a chat template or you want to specify another one,
|
If the model doesn't have a chat template or you want to specify another one,
|
||||||
you can explicitly pass a chat template:
|
you can explicitly pass a chat template:
|
||||||
|
|||||||
@@ -99,7 +99,7 @@ embeds = output.outputs.embedding
|
|||||||
print(f"Embeddings: {embeds!r} (size={len(embeds)})")
|
print(f"Embeddings: {embeds!r} (size={len(embeds)})")
|
||||||
```
|
```
|
||||||
|
|
||||||
A code example can be found here: [examples/offline_inference/basic/embed.py](../../examples/offline_inference/basic/embed.py)
|
A code example can be found here: [examples/basic/offline_inference/embed.py](../../examples/basic/offline_inference/embed.py)
|
||||||
|
|
||||||
### `LLM.classify`
|
### `LLM.classify`
|
||||||
|
|
||||||
@@ -116,7 +116,7 @@ probs = output.outputs.probs
|
|||||||
print(f"Class Probabilities: {probs!r} (size={len(probs)})")
|
print(f"Class Probabilities: {probs!r} (size={len(probs)})")
|
||||||
```
|
```
|
||||||
|
|
||||||
A code example can be found here: [examples/offline_inference/basic/classify.py](../../examples/offline_inference/basic/classify.py)
|
A code example can be found here: [examples/basic/offline_inference/classify.py](../../examples/basic/offline_inference/classify.py)
|
||||||
|
|
||||||
### `LLM.score`
|
### `LLM.score`
|
||||||
|
|
||||||
@@ -140,7 +140,7 @@ score = output.outputs.score
|
|||||||
print(f"Score: {score}")
|
print(f"Score: {score}")
|
||||||
```
|
```
|
||||||
|
|
||||||
A code example can be found here: [examples/offline_inference/basic/score.py](../../examples/offline_inference/basic/score.py)
|
A code example can be found here: [examples/basic/offline_inference/score.py](../../examples/basic/offline_inference/score.py)
|
||||||
|
|
||||||
### `LLM.reward`
|
### `LLM.reward`
|
||||||
|
|
||||||
@@ -156,7 +156,7 @@ data = output.outputs.data
|
|||||||
print(f"Data: {data!r}")
|
print(f"Data: {data!r}")
|
||||||
```
|
```
|
||||||
|
|
||||||
A code example can be found here: [examples/offline_inference/basic/reward.py](../../examples/offline_inference/basic/reward.py)
|
A code example can be found here: [examples/basic/offline_inference/reward.py](../../examples/basic/offline_inference/reward.py)
|
||||||
|
|
||||||
### `LLM.encode`
|
### `LLM.encode`
|
||||||
|
|
||||||
|
|||||||
@@ -190,7 +190,7 @@ vllm serve NousResearch/Meta-Llama-3-8B-Instruct --enable-offline-docs
|
|||||||
Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions);
|
Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions);
|
||||||
you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
|
you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
|
||||||
|
|
||||||
Code example: [examples/online_serving/openai_completion_client.py](../../examples/online_serving/openai_completion_client.py)
|
Code example: [examples/basic/online_serving/openai_completion_client.py](../../examples/basic/online_serving/openai_completion_client.py)
|
||||||
|
|
||||||
#### Extra parameters
|
#### Extra parameters
|
||||||
|
|
||||||
@@ -221,7 +221,7 @@ see our [Multimodal Inputs](../features/multimodal_inputs.md) guide for more inf
|
|||||||
|
|
||||||
- *Note: `image_url.detail` parameter is not supported.*
|
- *Note: `image_url.detail` parameter is not supported.*
|
||||||
|
|
||||||
Code example: [examples/online_serving/openai_chat_completion_client.py](../../examples/online_serving/openai_chat_completion_client.py)
|
Code example: [examples/basic/online_serving/openai_chat_completion_client.py](../../examples/basic/online_serving/openai_chat_completion_client.py)
|
||||||
|
|
||||||
#### Extra parameters
|
#### Extra parameters
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
# Basic
|
# Offline Inference
|
||||||
|
|
||||||
The `LLM` class provides the primary Python interface for doing offline inference, which is interacting with a model without using a separate model inference server.
|
The `LLM` class provides the primary Python interface for doing offline inference, which is interacting with a model without using a separate model inference server.
|
||||||
|
|
||||||
@@ -7,31 +7,31 @@ The `LLM` class provides the primary Python interface for doing offline inferenc
|
|||||||
The first script in this example shows the most basic usage of vLLM. If you are new to Python and vLLM, you should start here.
|
The first script in this example shows the most basic usage of vLLM. If you are new to Python and vLLM, you should start here.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python examples/offline_inference/basic/basic.py
|
python examples/basic/offline_inference/basic.py
|
||||||
```
|
```
|
||||||
|
|
||||||
The rest of the scripts include an [argument parser](https://docs.python.org/3/library/argparse.html), which you can use to pass any arguments that are compatible with [`LLM`](https://docs.vllm.ai/en/latest/api/offline_inference/llm.html). Try running the script with `--help` for a list of all available arguments.
|
The rest of the scripts include an [argument parser](https://docs.python.org/3/library/argparse.html), which you can use to pass any arguments that are compatible with [`LLM`](https://docs.vllm.ai/en/latest/api/offline_inference/llm.html). Try running the script with `--help` for a list of all available arguments.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python examples/offline_inference/basic/classify.py
|
python examples/basic/offline_inference/classify.py
|
||||||
```
|
```
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python examples/offline_inference/basic/embed.py
|
python examples/basic/offline_inference/embed.py
|
||||||
```
|
```
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python examples/offline_inference/basic/score.py
|
python examples/basic/offline_inference/score.py
|
||||||
```
|
```
|
||||||
|
|
||||||
The chat and generate scripts also accept the [sampling parameters](https://docs.vllm.ai/en/latest/api/inference_params.html#sampling-parameters): `max_tokens`, `temperature`, `top_p` and `top_k`.
|
The chat and generate scripts also accept the [sampling parameters](https://docs.vllm.ai/en/latest/api/inference_params.html#sampling-parameters): `max_tokens`, `temperature`, `top_p` and `top_k`.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python examples/offline_inference/basic/chat.py
|
python examples/basic/offline_inference/chat.py
|
||||||
```
|
```
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python examples/offline_inference/basic/generate.py
|
python examples/basic/offline_inference/generate.py
|
||||||
```
|
```
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
@@ -5,6 +5,7 @@ from argparse import Namespace
|
|||||||
|
|
||||||
from vllm import LLM, EngineArgs
|
from vllm import LLM, EngineArgs
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||||
|
from vllm.utils.print_utils import print_embeddings
|
||||||
|
|
||||||
|
|
||||||
def parse_args():
|
def parse_args():
|
||||||
@@ -39,10 +40,8 @@ def main(args: Namespace):
|
|||||||
print("\nGenerated Outputs:\n" + "-" * 60)
|
print("\nGenerated Outputs:\n" + "-" * 60)
|
||||||
for prompt, output in zip(prompts, outputs):
|
for prompt, output in zip(prompts, outputs):
|
||||||
embeds = output.outputs.embedding
|
embeds = output.outputs.embedding
|
||||||
embeds_trimmed = (
|
print(f"Prompt: {prompt!r}")
|
||||||
(str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds
|
print_embeddings(embeds)
|
||||||
)
|
|
||||||
print(f"Prompt: {prompt!r} \nEmbeddings: {embeds_trimmed} (size={len(embeds)})")
|
|
||||||
print("-" * 60)
|
print("-" * 60)
|
||||||
|
|
||||||
|
|
||||||
@@ -5,6 +5,7 @@ from argparse import Namespace
|
|||||||
|
|
||||||
from vllm import LLM, EngineArgs
|
from vllm import LLM, EngineArgs
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||||
|
from vllm.utils.print_utils import print_embeddings
|
||||||
|
|
||||||
|
|
||||||
def parse_args():
|
def parse_args():
|
||||||
@@ -41,10 +42,8 @@ def main(args: Namespace):
|
|||||||
print("\nGenerated Outputs:\n" + "-" * 60)
|
print("\nGenerated Outputs:\n" + "-" * 60)
|
||||||
for prompt, output in zip(prompts, outputs):
|
for prompt, output in zip(prompts, outputs):
|
||||||
rewards = output.outputs.data
|
rewards = output.outputs.data
|
||||||
rewards_trimmed = (
|
print(f"Prompt: {prompt!r}")
|
||||||
(str(rewards[:16])[:-1] + ", ...]") if len(rewards) > 16 else rewards
|
print_embeddings(rewards, prefix="Reward")
|
||||||
)
|
|
||||||
print(f"Prompt: {prompt!r} \nReward: {rewards_trimmed} (size={len(rewards)})")
|
|
||||||
print("-" * 60)
|
print("-" * 60)
|
||||||
|
|
||||||
|
|
||||||
@@ -17,7 +17,7 @@ def test_platform_plugins():
|
|||||||
example_file = os.path.join(
|
example_file = os.path.join(
|
||||||
os.path.dirname(os.path.dirname(os.path.dirname(current_file))),
|
os.path.dirname(os.path.dirname(os.path.dirname(current_file))),
|
||||||
"examples",
|
"examples",
|
||||||
"offline_inference/basic/basic.py",
|
"basic/offline_inference/basic.py",
|
||||||
)
|
)
|
||||||
runpy.run_path(example_file)
|
runpy.run_path(example_file)
|
||||||
|
|
||||||
|
|||||||
@@ -2,6 +2,6 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
|
||||||
def print_embeddings(embeds: list[float]):
|
def print_embeddings(embeds: list[float], prefix: str = "Embeddings"):
|
||||||
embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds
|
embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds
|
||||||
print(f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
|
print(f"{prefix}: {embeds_trimmed} (size={len(embeds)})")
|
||||||
|
|||||||
Reference in New Issue
Block a user