[Docs] Improve documentation for Deepseek R1 on Ray Serve LLM (#20601)

Signed-off-by: Ricardo Decal <rdecal@anyscale.com>

Author: Ricardo Decal
Date: 2025-07-08 02:09:06 -07:00
Committed by: GitHub
Parent: 71d1d75b7a
Commit: b91cb3fa5c


@@ -1,13 +1,21 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
-Example to deploy DeepSeek R1 or V3 with Ray Serve LLM.
-See more details at:
-https://docs.ray.io/en/latest/serve/tutorials/serve-deepseek.html
-And see Ray Serve LLM documentation at:
-https://docs.ray.io/en/latest/serve/llm/serving-llms.html
+Deploy DeepSeek R1 or V3 with Ray Serve LLM.
 
-Run `python3 ray_serve_deepseek.py` to deploy the model.
+Ray Serve LLM is a scalable and production-grade model serving library built
+on the Ray distributed computing framework, with first-class support for the vLLM engine.
+
+Key features:
+- Automatic scaling, back-pressure, and load balancing across a Ray cluster.
+- Unified multi-node multi-model deployment.
+- Exposes an OpenAI-compatible HTTP API.
+- Multi-LoRA support with shared base models.
+
+Run `python3 ray_serve_deepseek.py` to launch an endpoint.
+
+Learn more in the official Ray Serve LLM documentation:
+https://docs.ray.io/en/latest/serve/llm/serving-llms.html
 """
 
 from ray import serve
@@ -16,9 +24,8 @@ from ray.serve.llm import LLMConfig, build_openai_app
 llm_config = LLMConfig(
     model_loading_config={
         "model_id": "deepseek",
-        # Since DeepSeek model is huge, it is recommended to pre-download
-        # the model to local disk, say /path/to/the/model and specify:
-        # model_source="/path/to/the/model"
+        # Pre-downloading the model to local storage is recommended since
+        # the model is large. Set model_source="/path/to/the/model".
         "model_source": "deepseek-ai/DeepSeek-R1",
     },
     deployment_config={
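
For the pre-download recommendation in the hunk above, one way to stage the weights is with `huggingface_hub`. A minimal sketch (not part of this commit), assuming the `huggingface_hub` package is installed and using the placeholder path from the comment:

```python
# Hypothetical pre-download step; /path/to/the/model is a placeholder path.
from huggingface_hub import snapshot_download

# Download weights and config files to local storage once, then point
# model_source in LLMConfig at this directory instead of the HF repo ID.
snapshot_download(
    repo_id="deepseek-ai/DeepSeek-R1",
    local_dir="/path/to/the/model",
)
```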
@@ -27,10 +34,10 @@ llm_config = LLMConfig(
             "max_replicas": 1,
         }
     },
-    # Change to the accelerator type of the node
+    # Set to the node's accelerator type.
     accelerator_type="H100",
     runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
-    # Customize engine arguments as needed (e.g. vLLM engine kwargs)
+    # Customize engine arguments as required (for example, vLLM engine kwargs).
     engine_kwargs={
         "tensor_parallel_size": 8,
         "pipeline_parallel_size": 2,
@@ -44,6 +51,6 @@ llm_config = LLMConfig(
     },
 )
 
-# Deploy the application
+# Deploy the application.
 llm_app = build_openai_app({"llm_configs": [llm_config]})
 serve.run(llm_app)
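
Once the app is running, the endpoint speaks the OpenAI chat completions protocol. A minimal query sketch (not part of this commit), assuming Serve's default HTTP address of http://localhost:8000 and the `openai` Python client; the API key is a placeholder value, since the local endpoint does not issue real keys:

```python
from openai import OpenAI

# Point the client at the local Ray Serve endpoint.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

response = client.chat.completions.create(
    model="deepseek",  # matches model_id in LLMConfig above
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```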