[doc] Fold long code blocks to improve readability (#19926)
Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com>
This commit is contained in:
@@ -15,22 +15,24 @@ vllm serve NousResearch/Meta-Llama-3-8B-Instruct \
|
||||
|
||||
To call the server, in your preferred text editor, create a script that uses an HTTP client. Include any messages that you want to send to the model. Then run that script. Below is an example script using the [official OpenAI Python client](https://github.com/openai/openai-python).
|
||||
|
||||
```python
|
||||
from openai import OpenAI
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:8000/v1",
|
||||
api_key="token-abc123",
|
||||
)
|
||||
??? Code
|
||||
|
||||
completion = client.chat.completions.create(
|
||||
model="NousResearch/Meta-Llama-3-8B-Instruct",
|
||||
messages=[
|
||||
{"role": "user", "content": "Hello!"}
|
||||
]
|
||||
)
|
||||
```python
|
||||
from openai import OpenAI
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:8000/v1",
|
||||
api_key="token-abc123",
|
||||
)
|
||||
|
||||
print(completion.choices[0].message)
|
||||
```
|
||||
completion = client.chat.completions.create(
|
||||
model="NousResearch/Meta-Llama-3-8B-Instruct",
|
||||
messages=[
|
||||
{"role": "user", "content": "Hello!"}
|
||||
]
|
||||
)
|
||||
|
||||
print(completion.choices[0].message)
|
||||
```
|
||||
|
||||
!!! tip
|
||||
vLLM supports some parameters that are not supported by OpenAI, `top_k` for example.
|
||||
@@ -147,27 +149,29 @@ with `--enable-request-id-headers`.
|
||||
> rather than within the vLLM layer for this reason.
|
||||
> See [this PR](https://github.com/vllm-project/vllm/pull/11529) for more details.
|
||||
|
||||
```python
|
||||
completion = client.chat.completions.create(
|
||||
model="NousResearch/Meta-Llama-3-8B-Instruct",
|
||||
messages=[
|
||||
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
|
||||
],
|
||||
extra_headers={
|
||||
"x-request-id": "sentiment-classification-00001",
|
||||
}
|
||||
)
|
||||
print(completion._request_id)
|
||||
??? Code
|
||||
|
||||
completion = client.completions.create(
|
||||
model="NousResearch/Meta-Llama-3-8B-Instruct",
|
||||
prompt="A robot may not injure a human being",
|
||||
extra_headers={
|
||||
"x-request-id": "completion-test",
|
||||
}
|
||||
)
|
||||
print(completion._request_id)
|
||||
```
|
||||
```python
|
||||
completion = client.chat.completions.create(
|
||||
model="NousResearch/Meta-Llama-3-8B-Instruct",
|
||||
messages=[
|
||||
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
|
||||
],
|
||||
extra_headers={
|
||||
"x-request-id": "sentiment-classification-00001",
|
||||
}
|
||||
)
|
||||
print(completion._request_id)
|
||||
|
||||
completion = client.completions.create(
|
||||
model="NousResearch/Meta-Llama-3-8B-Instruct",
|
||||
prompt="A robot may not injure a human being",
|
||||
extra_headers={
|
||||
"x-request-id": "completion-test",
|
||||
}
|
||||
)
|
||||
print(completion._request_id)
|
||||
```
|
||||
|
||||
## API Reference
|
||||
|
||||
@@ -184,15 +188,19 @@ Code example: <gh-file:examples/online_serving/openai_completion_client.py>
|
||||
|
||||
The following [sampling parameters][sampling-params] are supported.
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:completion-sampling-params"
|
||||
```
|
||||
??? Code
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:completion-sampling-params"
|
||||
```
|
||||
|
||||
The following extra parameters are supported:
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:completion-extra-params"
|
||||
```
|
||||
??? Code
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:completion-extra-params"
|
||||
```
|
||||
|
||||
[](){ #chat-api }
|
||||
|
||||
@@ -212,15 +220,19 @@ Code example: <gh-file:examples/online_serving/openai_chat_completion_client.py>
|
||||
|
||||
The following [sampling parameters][sampling-params] are supported.
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-sampling-params"
|
||||
```
|
||||
??? Code
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-sampling-params"
|
||||
```
|
||||
|
||||
The following extra parameters are supported:
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-extra-params"
|
||||
```
|
||||
??? Code
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-extra-params"
|
||||
```
|
||||
|
||||
[](){ #embeddings-api }
|
||||
|
||||
@@ -259,29 +271,31 @@ and passing a list of `messages` in the request. Refer to the examples below for
|
||||
|
||||
Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library:
|
||||
|
||||
```python
|
||||
import requests
|
||||
??? Code
|
||||
|
||||
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||
```python
|
||||
import requests
|
||||
|
||||
response = requests.post(
|
||||
"http://localhost:8000/v1/embeddings",
|
||||
json={
|
||||
"model": "TIGER-Lab/VLM2Vec-Full",
|
||||
"messages": [{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": image_url}},
|
||||
{"type": "text", "text": "Represent the given image."},
|
||||
],
|
||||
}],
|
||||
"encoding_format": "float",
|
||||
},
|
||||
)
|
||||
response.raise_for_status()
|
||||
response_json = response.json()
|
||||
print("Embedding output:", response_json["data"][0]["embedding"])
|
||||
```
|
||||
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||
|
||||
response = requests.post(
|
||||
"http://localhost:8000/v1/embeddings",
|
||||
json={
|
||||
"model": "TIGER-Lab/VLM2Vec-Full",
|
||||
"messages": [{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": image_url}},
|
||||
{"type": "text", "text": "Represent the given image."},
|
||||
],
|
||||
}],
|
||||
"encoding_format": "float",
|
||||
},
|
||||
)
|
||||
response.raise_for_status()
|
||||
response_json = response.json()
|
||||
print("Embedding output:", response_json["data"][0]["embedding"])
|
||||
```
|
||||
|
||||
=== "DSE-Qwen2-MRL"
|
||||
|
||||
@@ -316,15 +330,19 @@ The following [pooling parameters][pooling-params] are supported.
|
||||
|
||||
The following extra parameters are supported by default:
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:embedding-extra-params"
|
||||
```
|
||||
??? Code
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:embedding-extra-params"
|
||||
```
|
||||
|
||||
For chat-like input (i.e. if `messages` is passed), these extra parameters are supported instead:
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:chat-embedding-extra-params"
|
||||
```
|
||||
??? Code
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:chat-embedding-extra-params"
|
||||
```
|
||||
|
||||
[](){ #transcriptions-api }
|
||||
|
||||
@@ -343,15 +361,19 @@ Code example: <gh-file:examples/online_serving/openai_transcription_client.py>
|
||||
|
||||
The following [sampling parameters][sampling-params] are supported.
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-sampling-params"
|
||||
```
|
||||
??? Code
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-sampling-params"
|
||||
```
|
||||
|
||||
The following extra parameters are supported:
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
|
||||
```
|
||||
??? Code
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
|
||||
```
|
||||
|
||||
[](){ #tokenizer-api }
|
||||
|
||||
@@ -387,8 +409,6 @@ Code example: <gh-file:examples/online_serving/openai_classification_client.py>
|
||||
|
||||
You can classify multiple texts by passing an array of strings:
|
||||
|
||||
Request:
|
||||
|
||||
```bash
|
||||
curl -v "http://127.0.0.1:8000/classify" \
|
||||
-H "Content-Type: application/json" \
|
||||
@@ -401,47 +421,45 @@ curl -v "http://127.0.0.1:8000/classify" \
|
||||
}'
|
||||
```
|
||||
|
||||
Response:
|
||||
??? Response
|
||||
|
||||
```bash
|
||||
{
|
||||
"id": "classify-7c87cac407b749a6935d8c7ce2a8fba2",
|
||||
"object": "list",
|
||||
"created": 1745383065,
|
||||
"model": "jason9693/Qwen2.5-1.5B-apeach",
|
||||
"data": [
|
||||
```bash
|
||||
{
|
||||
"index": 0,
|
||||
"label": "Default",
|
||||
"probs": [
|
||||
0.565970778465271,
|
||||
0.4340292513370514
|
||||
"id": "classify-7c87cac407b749a6935d8c7ce2a8fba2",
|
||||
"object": "list",
|
||||
"created": 1745383065,
|
||||
"model": "jason9693/Qwen2.5-1.5B-apeach",
|
||||
"data": [
|
||||
{
|
||||
"index": 0,
|
||||
"label": "Default",
|
||||
"probs": [
|
||||
0.565970778465271,
|
||||
0.4340292513370514
|
||||
],
|
||||
"num_classes": 2
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"label": "Spoiled",
|
||||
"probs": [
|
||||
0.26448777318000793,
|
||||
0.7355121970176697
|
||||
],
|
||||
"num_classes": 2
|
||||
}
|
||||
],
|
||||
"num_classes": 2
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"label": "Spoiled",
|
||||
"probs": [
|
||||
0.26448777318000793,
|
||||
0.7355121970176697
|
||||
],
|
||||
"num_classes": 2
|
||||
"usage": {
|
||||
"prompt_tokens": 20,
|
||||
"total_tokens": 20,
|
||||
"completion_tokens": 0,
|
||||
"prompt_tokens_details": null
|
||||
}
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": 20,
|
||||
"total_tokens": 20,
|
||||
"completion_tokens": 0,
|
||||
"prompt_tokens_details": null
|
||||
}
|
||||
}
|
||||
```
|
||||
```
|
||||
|
||||
You can also pass a string directly to the `input` field:
|
||||
|
||||
Request:
|
||||
|
||||
```bash
|
||||
curl -v "http://127.0.0.1:8000/classify" \
|
||||
-H "Content-Type: application/json" \
|
||||
@@ -451,33 +469,33 @@ curl -v "http://127.0.0.1:8000/classify" \
|
||||
}'
|
||||
```
|
||||
|
||||
Response:
|
||||
??? Response
|
||||
|
||||
```bash
|
||||
{
|
||||
"id": "classify-9bf17f2847b046c7b2d5495f4b4f9682",
|
||||
"object": "list",
|
||||
"created": 1745383213,
|
||||
"model": "jason9693/Qwen2.5-1.5B-apeach",
|
||||
"data": [
|
||||
```bash
|
||||
{
|
||||
"index": 0,
|
||||
"label": "Default",
|
||||
"probs": [
|
||||
0.565970778465271,
|
||||
0.4340292513370514
|
||||
"id": "classify-9bf17f2847b046c7b2d5495f4b4f9682",
|
||||
"object": "list",
|
||||
"created": 1745383213,
|
||||
"model": "jason9693/Qwen2.5-1.5B-apeach",
|
||||
"data": [
|
||||
{
|
||||
"index": 0,
|
||||
"label": "Default",
|
||||
"probs": [
|
||||
0.565970778465271,
|
||||
0.4340292513370514
|
||||
],
|
||||
"num_classes": 2
|
||||
}
|
||||
],
|
||||
"num_classes": 2
|
||||
"usage": {
|
||||
"prompt_tokens": 10,
|
||||
"total_tokens": 10,
|
||||
"completion_tokens": 0,
|
||||
"prompt_tokens_details": null
|
||||
}
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": 10,
|
||||
"total_tokens": 10,
|
||||
"completion_tokens": 0,
|
||||
"prompt_tokens_details": null
|
||||
}
|
||||
}
|
||||
```
|
||||
```
|
||||
|
||||
#### Extra parameters
|
||||
|
||||
@@ -508,8 +526,6 @@ Code example: <gh-file:examples/online_serving/openai_cross_encoder_score.py>
|
||||
|
||||
You can pass a string to both `text_1` and `text_2`, forming a single sentence pair.
|
||||
|
||||
Request:
|
||||
|
||||
```bash
|
||||
curl -X 'POST' \
|
||||
'http://127.0.0.1:8000/score' \
|
||||
@@ -523,24 +539,24 @@ curl -X 'POST' \
|
||||
}'
|
||||
```
|
||||
|
||||
Response:
|
||||
??? Response
|
||||
|
||||
```bash
|
||||
{
|
||||
"id": "score-request-id",
|
||||
"object": "list",
|
||||
"created": 693447,
|
||||
"model": "BAAI/bge-reranker-v2-m3",
|
||||
"data": [
|
||||
```bash
|
||||
{
|
||||
"index": 0,
|
||||
"object": "score",
|
||||
"score": 1
|
||||
"id": "score-request-id",
|
||||
"object": "list",
|
||||
"created": 693447,
|
||||
"model": "BAAI/bge-reranker-v2-m3",
|
||||
"data": [
|
||||
{
|
||||
"index": 0,
|
||||
"object": "score",
|
||||
"score": 1
|
||||
}
|
||||
],
|
||||
"usage": {}
|
||||
}
|
||||
],
|
||||
"usage": {}
|
||||
}
|
||||
```
|
||||
```
|
||||
|
||||
#### Batch inference
|
||||
|
||||
@@ -548,95 +564,95 @@ You can pass a string to `text_1` and a list to `text_2`, forming multiple sente
|
||||
where each pair is built from `text_1` and a string in `text_2`.
|
||||
The total number of pairs is `len(text_2)`.
|
||||
|
||||
Request:
|
||||
??? Request
|
||||
|
||||
```bash
|
||||
curl -X 'POST' \
|
||||
'http://127.0.0.1:8000/score' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"model": "BAAI/bge-reranker-v2-m3",
|
||||
"text_1": "What is the capital of France?",
|
||||
"text_2": [
|
||||
"The capital of Brazil is Brasilia.",
|
||||
"The capital of France is Paris."
|
||||
]
|
||||
}'
|
||||
```
|
||||
```bash
|
||||
curl -X 'POST' \
|
||||
'http://127.0.0.1:8000/score' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"model": "BAAI/bge-reranker-v2-m3",
|
||||
"text_1": "What is the capital of France?",
|
||||
"text_2": [
|
||||
"The capital of Brazil is Brasilia.",
|
||||
"The capital of France is Paris."
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
Response:
|
||||
??? Response
|
||||
|
||||
```bash
|
||||
{
|
||||
"id": "score-request-id",
|
||||
"object": "list",
|
||||
"created": 693570,
|
||||
"model": "BAAI/bge-reranker-v2-m3",
|
||||
"data": [
|
||||
```bash
|
||||
{
|
||||
"index": 0,
|
||||
"object": "score",
|
||||
"score": 0.001094818115234375
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"object": "score",
|
||||
"score": 1
|
||||
"id": "score-request-id",
|
||||
"object": "list",
|
||||
"created": 693570,
|
||||
"model": "BAAI/bge-reranker-v2-m3",
|
||||
"data": [
|
||||
{
|
||||
"index": 0,
|
||||
"object": "score",
|
||||
"score": 0.001094818115234375
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"object": "score",
|
||||
"score": 1
|
||||
}
|
||||
],
|
||||
"usage": {}
|
||||
}
|
||||
],
|
||||
"usage": {}
|
||||
}
|
||||
```
|
||||
```
|
||||
|
||||
You can pass a list to both `text_1` and `text_2`, forming multiple sentence pairs
|
||||
where each pair is built from a string in `text_1` and the corresponding string in `text_2` (similar to `zip()`).
|
||||
The total number of pairs is `len(text_2)`.
|
||||
|
||||
Request:
|
||||
??? Request
|
||||
|
||||
```bash
|
||||
curl -X 'POST' \
|
||||
'http://127.0.0.1:8000/score' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"model": "BAAI/bge-reranker-v2-m3",
|
||||
"encoding_format": "float",
|
||||
"text_1": [
|
||||
"What is the capital of Brazil?",
|
||||
"What is the capital of France?"
|
||||
],
|
||||
"text_2": [
|
||||
"The capital of Brazil is Brasilia.",
|
||||
"The capital of France is Paris."
|
||||
]
|
||||
}'
|
||||
```
|
||||
```bash
|
||||
curl -X 'POST' \
|
||||
'http://127.0.0.1:8000/score' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"model": "BAAI/bge-reranker-v2-m3",
|
||||
"encoding_format": "float",
|
||||
"text_1": [
|
||||
"What is the capital of Brazil?",
|
||||
"What is the capital of France?"
|
||||
],
|
||||
"text_2": [
|
||||
"The capital of Brazil is Brasilia.",
|
||||
"The capital of France is Paris."
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
Response:
|
||||
??? Response
|
||||
|
||||
```bash
|
||||
{
|
||||
"id": "score-request-id",
|
||||
"object": "list",
|
||||
"created": 693447,
|
||||
"model": "BAAI/bge-reranker-v2-m3",
|
||||
"data": [
|
||||
```bash
|
||||
{
|
||||
"index": 0,
|
||||
"object": "score",
|
||||
"score": 1
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"object": "score",
|
||||
"score": 1
|
||||
"id": "score-request-id",
|
||||
"object": "list",
|
||||
"created": 693447,
|
||||
"model": "BAAI/bge-reranker-v2-m3",
|
||||
"data": [
|
||||
{
|
||||
"index": 0,
|
||||
"object": "score",
|
||||
"score": 1
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"object": "score",
|
||||
"score": 1
|
||||
}
|
||||
],
|
||||
"usage": {}
|
||||
}
|
||||
],
|
||||
"usage": {}
|
||||
}
|
||||
```
|
||||
```
|
||||
|
||||
#### Extra parameters
|
||||
|
||||
@@ -675,51 +691,51 @@ Code example: <gh-file:examples/online_serving/jinaai_rerank_client.py>
|
||||
Note that the `top_n` request parameter is optional and will default to the length of the `documents` field.
|
||||
Result documents will be sorted by relevance, and the `index` property can be used to determine original order.
|
||||
|
||||
Request:
|
||||
??? Request
|
||||
|
||||
```bash
|
||||
curl -X 'POST' \
|
||||
'http://127.0.0.1:8000/v1/rerank' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"model": "BAAI/bge-reranker-base",
|
||||
"query": "What is the capital of France?",
|
||||
"documents": [
|
||||
"The capital of Brazil is Brasilia.",
|
||||
"The capital of France is Paris.",
|
||||
"Horses and cows are both animals"
|
||||
]
|
||||
}'
|
||||
```
|
||||
```bash
|
||||
curl -X 'POST' \
|
||||
'http://127.0.0.1:8000/v1/rerank' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"model": "BAAI/bge-reranker-base",
|
||||
"query": "What is the capital of France?",
|
||||
"documents": [
|
||||
"The capital of Brazil is Brasilia.",
|
||||
"The capital of France is Paris.",
|
||||
"Horses and cows are both animals"
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
Response:
|
||||
??? Response
|
||||
|
||||
```bash
|
||||
{
|
||||
"id": "rerank-fae51b2b664d4ed38f5969b612edff77",
|
||||
"model": "BAAI/bge-reranker-base",
|
||||
"usage": {
|
||||
"total_tokens": 56
|
||||
},
|
||||
"results": [
|
||||
```bash
|
||||
{
|
||||
"index": 1,
|
||||
"document": {
|
||||
"text": "The capital of France is Paris."
|
||||
"id": "rerank-fae51b2b664d4ed38f5969b612edff77",
|
||||
"model": "BAAI/bge-reranker-base",
|
||||
"usage": {
|
||||
"total_tokens": 56
|
||||
},
|
||||
"relevance_score": 0.99853515625
|
||||
},
|
||||
{
|
||||
"index": 0,
|
||||
"document": {
|
||||
"text": "The capital of Brazil is Brasilia."
|
||||
},
|
||||
"relevance_score": 0.0005860328674316406
|
||||
"results": [
|
||||
{
|
||||
"index": 1,
|
||||
"document": {
|
||||
"text": "The capital of France is Paris."
|
||||
},
|
||||
"relevance_score": 0.99853515625
|
||||
},
|
||||
{
|
||||
"index": 0,
|
||||
"document": {
|
||||
"text": "The capital of Brazil is Brasilia."
|
||||
},
|
||||
"relevance_score": 0.0005860328674316406
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
```
|
||||
|
||||
#### Extra parameters
|
||||
|
||||
|
||||
Reference in New Issue
Block a user