[V0 Deprecation] Remove VLLM_USE_V1 from docs and scripts (#26336)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-07 16:46:44 +08:00
parent 46b0779996
commit 7e4cd070b0
11 changed files with 17 additions and 26 deletions
--- a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
+++ b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
@@ -166,7 +166,7 @@ main() {
        local kv_port=$((21001 + i))

        echo "  Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
-        CUDA_VISIBLE_DEVICES=$gpu_id VLLM_USE_V1=1 vllm serve $MODEL \
+        CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
        --enforce-eager \
        --host 0.0.0.0 \
        --port $port \
@@ -194,7 +194,7 @@ main() {
        local kv_port=$((22001 + i))

        echo "  Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
-        VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
+        CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
        --enforce-eager \
        --host 0.0.0.0 \
        --port $port \
--- a/examples/online_serving/elastic_ep/serve_deepseek_v2.sh
+++ b/examples/online_serving/elastic_ep/serve_deepseek_v2.sh
@@ -55,7 +55,6 @@ done
 echo "Starting vLLM server for $MODEL_NAME with data parallel size: $DATA_PARALLEL_SIZE and redundant experts: $REDUNDANT_EXPERTS"

 export RAY_DEDUP_LOGS=0
-export VLLM_USE_V1=1
 export VLLM_ALL2ALL_BACKEND="pplx"
 export VLLM_USE_DEEP_GEMM=1

--- a/examples/online_serving/openai_chat_completion_client_with_tools_required.py
+++ b/examples/online_serving/openai_chat_completion_client_with_tools_required.py
@@ -5,7 +5,7 @@ To run this example, you can start the vLLM server
 without any specific flags:

 ```bash
-VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
+vllm serve unsloth/Llama-3.2-1B-Instruct \
    --structured-outputs-config.backend outlines
 ```

--- a/examples/online_serving/ray_serve_deepseek.py
+++ b/examples/online_serving/ray_serve_deepseek.py
@@ -36,7 +36,6 @@ llm_config = LLMConfig(
    },
    # Set to the node's accelerator type.
    accelerator_type="H100",
-    runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
    # Customize engine arguments as required (for example, vLLM engine kwargs).
    engine_kwargs={
        "tensor_parallel_size": 8,