[V0 Deprecation] Remove VLLM_USE_V1 from docs and scripts (#26336)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -166,7 +166,7 @@ main() {
|
||||
local kv_port=$((21001 + i))
|
||||
|
||||
echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
|
||||
-CUDA_VISIBLE_DEVICES=$gpu_id VLLM_USE_V1=1 vllm serve $MODEL \
|
||||
+CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
|
||||
--enforce-eager \
|
||||
--host 0.0.0.0 \
|
||||
--port $port \
|
||||
@@ -194,7 +194,7 @@ main() {
|
||||
local kv_port=$((22001 + i))
|
||||
|
||||
echo " Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
|
||||
-VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
|
||||
+CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
|
||||
--enforce-eager \
|
||||
--host 0.0.0.0 \
|
||||
--port $port \
|
||||
|
||||
@@ -55,7 +55,6 @@ done
|
||||
echo "Starting vLLM server for $MODEL_NAME with data parallel size: $DATA_PARALLEL_SIZE and redundant experts: $REDUNDANT_EXPERTS"
|
||||
|
||||
export RAY_DEDUP_LOGS=0
|
||||
-export VLLM_USE_V1=1
|
||||
export VLLM_ALL2ALL_BACKEND="pplx"
|
||||
export VLLM_USE_DEEP_GEMM=1
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ To run this example, you can start the vLLM server
|
||||
without any specific flags:
|
||||
|
||||
```bash
|
||||
-VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
|
||||
+vllm serve unsloth/Llama-3.2-1B-Instruct \
|
||||
--structured-outputs-config.backend outlines
|
||||
```
|
||||
|
||||
|
||||
@@ -36,7 +36,6 @@ llm_config = LLMConfig(
|
||||
},
|
||||
# Set to the node's accelerator type.
|
||||
accelerator_type="H100",
|
||||
-runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
|
||||
# Customize engine arguments as required (for example, vLLM engine kwargs).
|
||||
engine_kwargs={
|
||||
"tensor_parallel_size": 8,
|
||||
|
||||
Reference in New Issue
Block a user