[Docs] Improve documentation for Deepseek R1 on Ray Serve LLM (#20601)
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
@@ -1,13 +1,21 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
-Example to deploy DeepSeek R1 or V3 with Ray Serve LLM.
-See more details at:
-https://docs.ray.io/en/latest/serve/tutorials/serve-deepseek.html
-And see Ray Serve LLM documentation at:
-https://docs.ray.io/en/latest/serve/llm/serving-llms.html
-
-Run `python3 ray_serve_deepseek.py` to deploy the model.
+Deploy DeepSeek R1 or V3 with Ray Serve LLM.
+
+Ray Serve LLM is a scalable and production-grade model serving library built
+on the Ray distributed computing framework, with first-class support for the vLLM engine.
+
+Key features:
+- Automatic scaling, back-pressure, and load balancing across a Ray cluster.
+- Unified multi-node, multi-model deployment.
+- Exposes an OpenAI-compatible HTTP API.
+- Multi-LoRA support with shared base models.
+
+Run `python3 ray_serve_deepseek.py` to launch an endpoint.
+
+Learn more in the official Ray Serve LLM documentation:
+https://docs.ray.io/en/latest/serve/llm/serving-llms.html
 """
 
 from ray import serve
@@ -16,9 +24,8 @@ from ray.serve.llm import LLMConfig, build_openai_app
 llm_config = LLMConfig(
     model_loading_config={
         "model_id": "deepseek",
-        # Since DeepSeek model is huge, it is recommended to pre-download
-        # the model to local disk, say /path/to/the/model and specify:
-        # model_source="/path/to/the/model"
+        # Pre-downloading the model to local storage is recommended since
+        # the model is large. Set model_source="/path/to/the/model".
         "model_source": "deepseek-ai/DeepSeek-R1",
     },
     deployment_config={
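The comment in this hunk recommends pre-downloading the weights before serving. A minimal sketch of that step, assuming `huggingface_hub` is the download tool (the library choice is not part of this change; the repo id and placeholder path are taken from the file):

    # Fetch the weights once to local storage, then point model_source at the result.
    from huggingface_hub import snapshot_download

    local_path = snapshot_download(
        repo_id="deepseek-ai/DeepSeek-R1",  # same model_source as in this example
        local_dir="/path/to/the/model",  # placeholder path from the comment above
    )
    # In LLMConfig, set "model_source": "/path/to/the/model" instead of the repo id.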
@@ -27,10 +34,10 @@ llm_config = LLMConfig(
             "max_replicas": 1,
         }
     },
-    # Change to the accelerator type of the node
+    # Set to the node's accelerator type.
     accelerator_type="H100",
     runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
-    # Customize engine arguments as needed (e.g. vLLM engine kwargs)
+    # Customize engine arguments as required (for example, vLLM engine kwargs).
     engine_kwargs={
         "tensor_parallel_size": 8,
         "pipeline_parallel_size": 2,
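The engine arguments in this hunk shard the model with tensor parallelism 8 and pipeline parallelism 2; vLLM's total GPU count is the product of the two. An illustrative sanity check, not part of the change:

    # vLLM's world size = tensor_parallel_size * pipeline_parallel_size.
    tensor_parallel_size = 8
    pipeline_parallel_size = 2
    assert tensor_parallel_size * pipeline_parallel_size == 16  # e.g. two 8xH100 nodes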
@@ -44,6 +51,6 @@ llm_config = LLMConfig(
     },
 )
 
-# Deploy the application
+# Deploy the application.
 llm_app = build_openai_app({"llm_configs": [llm_config]})
 serve.run(llm_app)
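After `serve.run(llm_app)` starts the application, the endpoint speaks the OpenAI API under the configured model_id "deepseek". A usage sketch, assuming Ray Serve's default HTTP address of localhost:8000 (the dummy api_key is only there because the client requires one):

    from openai import OpenAI

    # Point an OpenAI client at the Ray Serve endpoint; any non-empty key works.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")
    response = client.chat.completions.create(
        model="deepseek",  # matches model_id in LLMConfig
        messages=[{"role": "user", "content": "Hello!"}],
    )
    print(response.choices[0].message.content)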