[Deprecation][2/N] Replace --task with --runner and --convert (#21470)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-28 10:42:40 +08:00
parent 8f605ee309
commit 86ae693f20
94 changed files with 1117 additions and 1083 deletions
--- a/examples/offline_inference/basic/classify.py
+++ b/examples/offline_inference/basic/classify.py
@@ -12,7 +12,9 @@ def parse_args():
    parser = EngineArgs.add_cli_args(parser)
    # Set example specific arguments
    parser.set_defaults(
-        model="jason9693/Qwen2.5-1.5B-apeach", task="classify", enforce_eager=True
+        model="jason9693/Qwen2.5-1.5B-apeach",
+        runner="pooling",
+        enforce_eager=True,
    )
    return parser.parse_args()

@@ -27,7 +29,7 @@ def main(args: Namespace):
    ]

    # Create an LLM.
-    # You should pass task="classify" for classification models
+    # You should pass runner="pooling" for classification models
    llm = LLM(**vars(args))

    # Generate logits. The output is a list of ClassificationRequestOutputs.
--- a/examples/offline_inference/basic/embed.py
+++ b/examples/offline_inference/basic/embed.py
@@ -13,7 +13,7 @@ def parse_args():
    # Set example specific arguments
    parser.set_defaults(
        model="intfloat/e5-mistral-7b-instruct",
-        task="embed",
+        runner="pooling",
        enforce_eager=True,
        max_model_len=1024,
    )
@@ -30,7 +30,7 @@ def main(args: Namespace):
    ]

    # Create an LLM.
-    # You should pass task="embed" for embedding models
+    # You should pass runner="pooling" for embedding models
    llm = LLM(**vars(args))

    # Generate embedding. The output is a list of EmbeddingRequestOutputs.
--- a/examples/offline_inference/basic/score.py
+++ b/examples/offline_inference/basic/score.py
@@ -12,7 +12,9 @@ def parse_args():
    parser = EngineArgs.add_cli_args(parser)
    # Set example specific arguments
    parser.set_defaults(
-        model="BAAI/bge-reranker-v2-m3", task="score", enforce_eager=True
+        model="BAAI/bge-reranker-v2-m3",
+        runner="pooling",
+        enforce_eager=True,
    )
    return parser.parse_args()

@@ -26,7 +28,7 @@ def main(args: Namespace):
    ]

    # Create an LLM.
-    # You should pass task="score" for cross-encoder models
+    # You should pass runner="pooling" for cross-encoder models
    llm = LLM(**vars(args))

    # Generate scores. The output is a list of ScoringRequestOutputs.
--- a/examples/offline_inference/embed_jina_embeddings_v3.py
+++ b/examples/offline_inference/embed_jina_embeddings_v3.py
@@ -12,7 +12,9 @@ def parse_args():
    parser = EngineArgs.add_cli_args(parser)
    # Set example specific arguments
    parser.set_defaults(
-        model="jinaai/jina-embeddings-v3", task="embed", trust_remote_code=True
+        model="jinaai/jina-embeddings-v3",
+        runner="pooling",
+        trust_remote_code=True,
    )
    return parser.parse_args()

@@ -29,7 +31,7 @@ def main(args: Namespace):
    ]

    # Create an LLM.
-    # You should pass task="embed" for embedding models
+    # You should pass runner="pooling" for embedding models
    llm = LLM(**vars(args))

    # Generate embedding. The output is a list of EmbeddingRequestOutputs.
--- a/examples/offline_inference/embed_matryoshka_fy.py
+++ b/examples/offline_inference/embed_matryoshka_fy.py
@@ -12,7 +12,9 @@ def parse_args():
    parser = EngineArgs.add_cli_args(parser)
    # Set example specific arguments
    parser.set_defaults(
-        model="jinaai/jina-embeddings-v3", task="embed", trust_remote_code=True
+        model="jinaai/jina-embeddings-v3",
+        runner="pooling",
+        trust_remote_code=True,
    )
    return parser.parse_args()

@@ -29,7 +31,7 @@ def main(args: Namespace):
    ]

    # Create an LLM.
-    # You should pass task="embed" for embedding models
+    # You should pass runner="pooling" for embedding models
    llm = LLM(**vars(args))

    # Generate embedding. The output is a list of EmbeddingRequestOutputs.
--- a/examples/offline_inference/qwen3_reranker.py
+++ b/examples/offline_inference/qwen3_reranker.py
@@ -17,7 +17,7 @@ model_name = "Qwen/Qwen3-Reranker-0.6B"
 # Models converted offline using this method can not only be more efficient
 # and support the vllm score API, but also make the init parameters more
 # concise, for example.
-# llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")
+# llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", runner="pooling")

 # If you want to load the official original version, the init parameters are
 # as follows.
@@ -27,7 +27,7 @@ def get_llm() -> LLM:
    """Initializes and returns the LLM model for Qwen3-Reranker."""
    return LLM(
        model=model_name,
-        task="score",
+        runner="pooling",
        hf_overrides={
            "architectures": ["Qwen3ForSequenceClassification"],
            "classifier_from_token": ["no", "yes"],
--- a/examples/offline_inference/vision_language_pooling.py
+++ b/examples/offline_inference/vision_language_pooling.py
@@ -70,7 +70,7 @@ def run_e5_v(query: Query) -> ModelRequestData:

    engine_args = EngineArgs(
        model="royokong/e5-v",
-        task="embed",
+        runner="pooling",
        max_model_len=4096,
        limit_mm_per_prompt={"image": 1},
    )
@@ -102,7 +102,7 @@ def run_vlm2vec(query: Query) -> ModelRequestData:

    engine_args = EngineArgs(
        model="TIGER-Lab/VLM2Vec-Full",
-        task="embed",
+        runner="pooling",
        max_model_len=4096,
        trust_remote_code=True,
        mm_processor_kwargs={"num_crops": 4},
@@ -122,7 +122,7 @@ def run_jinavl_reranker(query: Query) -> ModelRequestData:

    engine_args = EngineArgs(
        model="jinaai/jina-reranker-m0",
-        task="score",
+        runner="pooling",
        max_model_len=32768,
        trust_remote_code=True,
        mm_processor_kwargs={
--- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
@@ -9,7 +9,7 @@ Launch the vLLM server with the following command:
 vllm serve llava-hf/llava-1.5-7b-hf

 (multi-image inference with Phi-3.5-vision-instruct)
-vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
+vllm serve microsoft/Phi-3.5-vision-instruct --runner generate \
    --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}'

 (audio inference with Ultravox)
--- a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py
@@ -92,7 +92,7 @@ def dse_qwen2_vl(inp: dict):
 def parse_args():
    parser = argparse.ArgumentParser(
        "Script to call a specified VLM through the API. Make sure to serve "
-        "the model with --task embed before running this."
+        "the model with `--runner pooling` before running this."
    )
    parser.add_argument(
        "--model",
--- a/examples/online_serving/openai_cross_encoder_score.py
+++ b/examples/online_serving/openai_cross_encoder_score.py
@@ -3,7 +3,7 @@
 """
 Example online usage of Score API.

-Run `vllm serve <model> --task score` to start up the server in vLLM.
+Run `vllm serve <model> --runner pooling` to start up the server in vLLM.
 """

 import argparse
--- a/examples/online_serving/openai_cross_encoder_score_for_multimodal.py
+++ b/examples/online_serving/openai_cross_encoder_score_for_multimodal.py
@@ -3,7 +3,7 @@
 """
 Example online usage of Score API.

-Run `vllm serve <model> --task score` to start up the server in vLLM.
+Run `vllm serve <model> --runner pooling` to start up the server in vLLM.
 """

 import argparse
--- a/examples/online_serving/openai_pooling_client.py
+++ b/examples/online_serving/openai_pooling_client.py
@@ -3,7 +3,7 @@
 """
 Example online usage of Pooling API.

-Run `vllm serve <model> --task <embed|classify|reward|score>`
+Run `vllm serve <model> --runner pooling`
 to start up the server in vLLM.
 """

--- a/examples/online_serving/prompt_embed_inference_with_openai_client.py
+++ b/examples/online_serving/prompt_embed_inference_with_openai_client.py
@@ -10,7 +10,7 @@ This script demonstrates how to:

 Run the vLLM server first:
 vllm serve meta-llama/Llama-3.2-1B-Instruct \
-  --task generate \
+  --runner generate \
  --max-model-len 4096 \
  --enable-prompt-embeds