[Deprecation][2/N] Replace --task with --runner and --convert (#21470)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -12,7 +12,9 @@ def parse_args():
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
# Set example specific arguments
|
||||
parser.set_defaults(
|
||||
model="jason9693/Qwen2.5-1.5B-apeach", task="classify", enforce_eager=True
|
||||
model="jason9693/Qwen2.5-1.5B-apeach",
|
||||
runner="pooling",
|
||||
enforce_eager=True,
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
@@ -27,7 +29,7 @@ def main(args: Namespace):
|
||||
]
|
||||
|
||||
# Create an LLM.
|
||||
# You should pass task="classify" for classification models
|
||||
# You should pass runner="pooling" for classification models
|
||||
llm = LLM(**vars(args))
|
||||
|
||||
# Generate logits. The output is a list of ClassificationRequestOutputs.
|
||||
|
||||
@@ -13,7 +13,7 @@ def parse_args():
|
||||
# Set example specific arguments
|
||||
parser.set_defaults(
|
||||
model="intfloat/e5-mistral-7b-instruct",
|
||||
task="embed",
|
||||
runner="pooling",
|
||||
enforce_eager=True,
|
||||
max_model_len=1024,
|
||||
)
|
||||
@@ -30,7 +30,7 @@ def main(args: Namespace):
|
||||
]
|
||||
|
||||
# Create an LLM.
|
||||
# You should pass task="embed" for embedding models
|
||||
# You should pass runner="pooling" for embedding models
|
||||
llm = LLM(**vars(args))
|
||||
|
||||
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
|
||||
|
||||
@@ -12,7 +12,9 @@ def parse_args():
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
# Set example specific arguments
|
||||
parser.set_defaults(
|
||||
model="BAAI/bge-reranker-v2-m3", task="score", enforce_eager=True
|
||||
model="BAAI/bge-reranker-v2-m3",
|
||||
runner="pooling",
|
||||
enforce_eager=True,
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
@@ -26,7 +28,7 @@ def main(args: Namespace):
|
||||
]
|
||||
|
||||
# Create an LLM.
|
||||
# You should pass task="score" for cross-encoder models
|
||||
# You should pass runner="pooling" for cross-encoder models
|
||||
llm = LLM(**vars(args))
|
||||
|
||||
# Generate scores. The output is a list of ScoringRequestOutputs.
|
||||
|
||||
@@ -12,7 +12,9 @@ def parse_args():
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
# Set example specific arguments
|
||||
parser.set_defaults(
|
||||
model="jinaai/jina-embeddings-v3", task="embed", trust_remote_code=True
|
||||
model="jinaai/jina-embeddings-v3",
|
||||
runner="pooling",
|
||||
trust_remote_code=True,
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
@@ -29,7 +31,7 @@ def main(args: Namespace):
|
||||
]
|
||||
|
||||
# Create an LLM.
|
||||
# You should pass task="embed" for embedding models
|
||||
# You should pass runner="pooling" for embedding models
|
||||
llm = LLM(**vars(args))
|
||||
|
||||
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
|
||||
|
||||
@@ -12,7 +12,9 @@ def parse_args():
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
# Set example specific arguments
|
||||
parser.set_defaults(
|
||||
model="jinaai/jina-embeddings-v3", task="embed", trust_remote_code=True
|
||||
model="jinaai/jina-embeddings-v3",
|
||||
runner="pooling",
|
||||
trust_remote_code=True,
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
@@ -29,7 +31,7 @@ def main(args: Namespace):
|
||||
]
|
||||
|
||||
# Create an LLM.
|
||||
# You should pass task="embed" for embedding models
|
||||
# You should pass runner="pooling" for embedding models
|
||||
llm = LLM(**vars(args))
|
||||
|
||||
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
|
||||
|
||||
@@ -17,7 +17,7 @@ model_name = "Qwen/Qwen3-Reranker-0.6B"
|
||||
# Models converted offline using this method can not only be more efficient
|
||||
# and support the vllm score API, but also make the init parameters more
|
||||
# concise, for example.
|
||||
# llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")
|
||||
# llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", runner="pooling")
|
||||
|
||||
# If you want to load the official original version, the init parameters are
|
||||
# as follows.
|
||||
@@ -27,7 +27,7 @@ def get_llm() -> LLM:
|
||||
"""Initializes and returns the LLM model for Qwen3-Reranker."""
|
||||
return LLM(
|
||||
model=model_name,
|
||||
task="score",
|
||||
runner="pooling",
|
||||
hf_overrides={
|
||||
"architectures": ["Qwen3ForSequenceClassification"],
|
||||
"classifier_from_token": ["no", "yes"],
|
||||
|
||||
@@ -70,7 +70,7 @@ def run_e5_v(query: Query) -> ModelRequestData:
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model="royokong/e5-v",
|
||||
task="embed",
|
||||
runner="pooling",
|
||||
max_model_len=4096,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)
|
||||
@@ -102,7 +102,7 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model="TIGER-Lab/VLM2Vec-Full",
|
||||
task="embed",
|
||||
runner="pooling",
|
||||
max_model_len=4096,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs={"num_crops": 4},
|
||||
@@ -122,7 +122,7 @@ def run_jinavl_reranker(query: Query) -> ModelRequestData:
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model="jinaai/jina-reranker-m0",
|
||||
task="score",
|
||||
runner="pooling",
|
||||
max_model_len=32768,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs={
|
||||
|
||||
@@ -9,7 +9,7 @@ Launch the vLLM server with the following command:
|
||||
vllm serve llava-hf/llava-1.5-7b-hf
|
||||
|
||||
(multi-image inference with Phi-3.5-vision-instruct)
|
||||
vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
|
||||
vllm serve microsoft/Phi-3.5-vision-instruct --runner generate \
|
||||
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}'
|
||||
|
||||
(audio inference with Ultravox)
|
||||
|
||||
@@ -92,7 +92,7 @@ def dse_qwen2_vl(inp: dict):
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(
|
||||
"Script to call a specified VLM through the API. Make sure to serve "
|
||||
"the model with --task embed before running this."
|
||||
"the model with `--runner pooling` before running this."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
"""
|
||||
Example online usage of Score API.
|
||||
|
||||
Run `vllm serve <model> --task score` to start up the server in vLLM.
|
||||
Run `vllm serve <model> --runner pooling` to start up the server in vLLM.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
"""
|
||||
Example online usage of Score API.
|
||||
|
||||
Run `vllm serve <model> --task score` to start up the server in vLLM.
|
||||
Run `vllm serve <model> --runner pooling` to start up the server in vLLM.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
"""
|
||||
Example online usage of Pooling API.
|
||||
|
||||
Run `vllm serve <model> --task <embed|classify|reward|score>`
|
||||
Run `vllm serve <model> --runner pooling`
|
||||
to start up the server in vLLM.
|
||||
"""
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ This script demonstrates how to:
|
||||
|
||||
Run the vLLM server first:
|
||||
vllm serve meta-llama/Llama-3.2-1B-Instruct \
|
||||
--task generate \
|
||||
--runner generate \
|
||||
--max-model-len 4096 \
|
||||
--enable-prompt-embeds
|
||||
|
||||
|
||||
Reference in New Issue
Block a user