[Deprecation][2/N] Replace --task with --runner and --convert (#21470)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Cyrus Leung
2025-07-28 10:42:40 +08:00
committed by GitHub
parent 8f605ee309
commit 86ae693f20
94 changed files with 1117 additions and 1083 deletions

View File

@@ -12,7 +12,9 @@ def parse_args():
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(
model="jason9693/Qwen2.5-1.5B-apeach", task="classify", enforce_eager=True
model="jason9693/Qwen2.5-1.5B-apeach",
runner="pooling",
enforce_eager=True,
)
return parser.parse_args()
@@ -27,7 +29,7 @@ def main(args: Namespace):
]
# Create an LLM.
# You should pass task="classify" for classification models
# You should pass runner="pooling" for classification models
llm = LLM(**vars(args))
# Generate logits. The output is a list of ClassificationRequestOutputs.

View File

@@ -13,7 +13,7 @@ def parse_args():
# Set example specific arguments
parser.set_defaults(
model="intfloat/e5-mistral-7b-instruct",
task="embed",
runner="pooling",
enforce_eager=True,
max_model_len=1024,
)
@@ -30,7 +30,7 @@ def main(args: Namespace):
]
# Create an LLM.
# You should pass task="embed" for embedding models
# You should pass runner="pooling" for embedding models
llm = LLM(**vars(args))
# Generate embedding. The output is a list of EmbeddingRequestOutputs.

View File

@@ -12,7 +12,9 @@ def parse_args():
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(
model="BAAI/bge-reranker-v2-m3", task="score", enforce_eager=True
model="BAAI/bge-reranker-v2-m3",
runner="pooling",
enforce_eager=True,
)
return parser.parse_args()
@@ -26,7 +28,7 @@ def main(args: Namespace):
]
# Create an LLM.
# You should pass task="score" for cross-encoder models
# You should pass runner="pooling" for cross-encoder models
llm = LLM(**vars(args))
# Generate scores. The output is a list of ScoringRequestOutputs.

View File

@@ -12,7 +12,9 @@ def parse_args():
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(
model="jinaai/jina-embeddings-v3", task="embed", trust_remote_code=True
model="jinaai/jina-embeddings-v3",
runner="pooling",
trust_remote_code=True,
)
return parser.parse_args()
@@ -29,7 +31,7 @@ def main(args: Namespace):
]
# Create an LLM.
# You should pass task="embed" for embedding models
# You should pass runner="pooling" for embedding models
llm = LLM(**vars(args))
# Generate embedding. The output is a list of EmbeddingRequestOutputs.

View File

@@ -12,7 +12,9 @@ def parse_args():
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(
model="jinaai/jina-embeddings-v3", task="embed", trust_remote_code=True
model="jinaai/jina-embeddings-v3",
runner="pooling",
trust_remote_code=True,
)
return parser.parse_args()
@@ -29,7 +31,7 @@ def main(args: Namespace):
]
# Create an LLM.
# You should pass task="embed" for embedding models
# You should pass runner="pooling" for embedding models
llm = LLM(**vars(args))
# Generate embedding. The output is a list of EmbeddingRequestOutputs.

View File

@@ -17,7 +17,7 @@ model_name = "Qwen/Qwen3-Reranker-0.6B"
# Models converted offline using this method are not only more efficient
# and compatible with the vLLM score API, but also need fewer init
# parameters, for example:
# llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")
# llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", runner="pooling")
# If you want to load the official original version, the init parameters are
# as follows.
@@ -27,7 +27,7 @@ def get_llm() -> LLM:
"""Initializes and returns the LLM model for Qwen3-Reranker."""
return LLM(
model=model_name,
task="score",
runner="pooling",
hf_overrides={
"architectures": ["Qwen3ForSequenceClassification"],
"classifier_from_token": ["no", "yes"],

View File

@@ -70,7 +70,7 @@ def run_e5_v(query: Query) -> ModelRequestData:
engine_args = EngineArgs(
model="royokong/e5-v",
task="embed",
runner="pooling",
max_model_len=4096,
limit_mm_per_prompt={"image": 1},
)
@@ -102,7 +102,7 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
engine_args = EngineArgs(
model="TIGER-Lab/VLM2Vec-Full",
task="embed",
runner="pooling",
max_model_len=4096,
trust_remote_code=True,
mm_processor_kwargs={"num_crops": 4},
@@ -122,7 +122,7 @@ def run_jinavl_reranker(query: Query) -> ModelRequestData:
engine_args = EngineArgs(
model="jinaai/jina-reranker-m0",
task="score",
runner="pooling",
max_model_len=32768,
trust_remote_code=True,
mm_processor_kwargs={

View File

@@ -9,7 +9,7 @@ Launch the vLLM server with the following command:
vllm serve llava-hf/llava-1.5-7b-hf
(multi-image inference with Phi-3.5-vision-instruct)
vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
vllm serve microsoft/Phi-3.5-vision-instruct --runner generate \
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}'
(audio inference with Ultravox)

View File

@@ -92,7 +92,7 @@ def dse_qwen2_vl(inp: dict):
def parse_args():
parser = argparse.ArgumentParser(
"Script to call a specified VLM through the API. Make sure to serve "
"the model with --task embed before running this."
"the model with `--runner pooling` before running this."
)
parser.add_argument(
"--model",

View File

@@ -3,7 +3,7 @@
"""
Example online usage of Score API.
Run `vllm serve <model> --task score` to start up the server in vLLM.
Run `vllm serve <model> --runner pooling` to start up the server in vLLM.
"""
import argparse

View File

@@ -3,7 +3,7 @@
"""
Example online usage of Score API.
Run `vllm serve <model> --task score` to start up the server in vLLM.
Run `vllm serve <model> --runner pooling` to start up the server in vLLM.
"""
import argparse

View File

@@ -3,7 +3,7 @@
"""
Example online usage of Pooling API.
Run `vllm serve <model> --task <embed|classify|reward|score>`
Run `vllm serve <model> --runner pooling`
to start up the server in vLLM.
"""

View File

@@ -10,7 +10,7 @@ This script demonstrates how to:
Run the vLLM server first:
vllm serve meta-llama/Llama-3.2-1B-Instruct \
--task generate \
--runner generate \
--max-model-len 4096 \
--enable-prompt-embeds