From b91cb3fa5c40993a1e56ffb6915db9ffebf9aa0a Mon Sep 17 00:00:00 2001
From: Ricardo Decal
Date: Tue, 8 Jul 2025 02:09:06 -0700
Subject: [PATCH] [Docs] Improve documentation for Deepseek R1 on Ray Serve LLM (#20601)

Signed-off-by: Ricardo Decal
---
 examples/online_serving/ray_serve_deepseek.py | 31 ++++++++++++-------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/examples/online_serving/ray_serve_deepseek.py b/examples/online_serving/ray_serve_deepseek.py
index 9471563dd..d24b553df 100644
--- a/examples/online_serving/ray_serve_deepseek.py
+++ b/examples/online_serving/ray_serve_deepseek.py
@@ -1,13 +1,21 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
-Example to deploy DeepSeek R1 or V3 with Ray Serve LLM.
-See more details at:
-https://docs.ray.io/en/latest/serve/tutorials/serve-deepseek.html
-And see Ray Serve LLM documentation at:
-https://docs.ray.io/en/latest/serve/llm/serving-llms.html
+Deploy DeepSeek R1 or V3 with Ray Serve LLM.
 
-Run `python3 ray_serve_deepseek.py` to deploy the model.
+Ray Serve LLM is a scalable, production-grade model serving library built on
+the Ray distributed computing framework, with first-class support for the vLLM engine.
+
+Key features:
+- Automatic scaling, back-pressure, and load balancing across a Ray cluster.
+- Unified multi-node multi-model deployment.
+- An OpenAI-compatible HTTP API.
+- Multi-LoRA support with shared base models.
+
+Run `python3 ray_serve_deepseek.py` to launch an endpoint.
+
+Learn more in the official Ray Serve LLM documentation:
+https://docs.ray.io/en/latest/serve/llm/serving-llms.html
 """
 
 from ray import serve
@@ -16,9 +24,8 @@ from ray.serve.llm import LLMConfig, build_openai_app
 llm_config = LLMConfig(
     model_loading_config={
         "model_id": "deepseek",
-        # Since DeepSeek model is huge, it is recommended to pre-download
-        # the model to local disk, say /path/to/the/model and specify:
-        # model_source="/path/to/the/model"
+        # Pre-downloading the model to local storage is recommended since
+        # the model is large. Set model_source="/path/to/the/model".
         "model_source": "deepseek-ai/DeepSeek-R1",
     },
     deployment_config={
@@ -27,22 +34,22 @@ llm_config = LLMConfig(
             "max_replicas": 1,
         }
     },
-    # Change to the accelerator type of the node
+    # Set to the node's accelerator type.
     accelerator_type="H100",
     runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
-    # Customize engine arguments as needed (e.g. vLLM engine kwargs)
+    # Customize engine arguments as required (for example, vLLM engine kwargs).
     engine_kwargs={
         "tensor_parallel_size": 8,
         "pipeline_parallel_size": 2,
         "gpu_memory_utilization": 0.92,
         "dtype": "auto",
         "max_num_seqs": 40,
         "max_model_len": 16384,
         "enable_chunked_prefill": True,
         "enable_prefix_caching": True,
     },
 )
 
-# Deploy the application
+# Deploy the application.
 llm_app = build_openai_app({"llm_configs": [llm_config]})
 serve.run(llm_app)
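
The comment in model_loading_config recommends pre-downloading the weights.
A minimal sketch of one way to do that with huggingface_hub (the tool choice
is an assumption, and /path/to/the/model reuses the placeholder path from the
patch; substitute any local directory with enough disk):

    from huggingface_hub import snapshot_download

    # Fetch the DeepSeek-R1 weights once, then point model_source at the
    # local copy so replicas do not re-download from the Hub.
    snapshot_download(
        repo_id="deepseek-ai/DeepSeek-R1",
        local_dir="/path/to/the/model",  # placeholder path from the patch
    )

Once `python3 ray_serve_deepseek.py` is running, the app serves an
OpenAI-compatible API. A minimal client sketch, assuming Ray Serve's default
HTTP address http://localhost:8000 and the model_id "deepseek" configured
above; the api_key is a placeholder, since the local endpoint does not check
it unless authentication is configured:

    from openai import OpenAI

    # Point the standard OpenAI client at the local Ray Serve endpoint.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="placeholder")

    response = client.chat.completions.create(
        # Matches the model_id registered in model_loading_config.
        model="deepseek",
        messages=[{"role": "user", "content": "Hello from Ray Serve LLM!"}],
    )
    print(response.choices[0].message.content)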