[Misc] Add example to run DeepSeek with Ray Serve LLM (#17134)

Signed-off-by: Rui Qiao <ruisearch42@gmail.com>

[Misc] Add example to run DeepSeek with Ray Serve LLM (#17134)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
583e9009 · Rui Qiao · GitHub · 05e1fbfc · 583e9009
Unverified Commit 583e9009 authored Apr 24, 2025 by Rui Qiao Committed by GitHub Apr 24, 2025
Show whitespace changes
Inline Side-by-side

Showing with 44 additions and 0 deletions

examples/online_serving/ray_serve_deepseek.py examples/online_serving/ray_serve_deepseek.py +44 -0

No files found.
--- a/examples/online_serving/ray_serve_deepseek.py
+++ b/examples/online_serving/ray_serve_deepseek.py
+# SPDX-License-Identifier: Apache-2.0
+"""
+Example to deploy DeepSeek R1 or V3 with Ray Serve LLM.
+See Ray Serve LLM documentation at:
+https://docs.ray.io/en/latest/serve/llm/serving-llms.html
+
+Run `python3 ray_serve_deepseek.py` to deploy the model.
+"""
+
+from ray import serve
+from ray.serve.llm import LLMConfig, LLMRouter, LLMServer
+
+llm_config = LLMConfig(
+    model_loading_config={
+        "model_id": "deepseek",
+        # Replace with the local path the model was downloaded to.
+        "model_source": "/path/to/the/model",
+    },
+    deployment_config={
+        "autoscaling_config": {"min_replicas": 1, "max_replicas": 1},
+    },
+    # Replace with the accelerator type available on the node.
+    accelerator_type="H100",
+    runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
+    # Engine arguments (e.g. vLLM engine kwargs); adjust to your setup.
+    engine_kwargs={
+        "tensor_parallel_size": 8,
+        "pipeline_parallel_size": 2,
+        "gpu_memory_utilization": 0.92,
+        "dtype": "auto",
+        "max_num_seqs": 40,
+        "max_model_len": 16384,
+        "enable_chunked_prefill": True,
+        "enable_prefix_caching": True,
+        "trust_remote_code": True,
+    },
+)
+
+# Deploy: bind the server deployment, hand it to the router, then run the app.
+deployment = LLMServer.as_deployment(
+    llm_config.get_serve_options(name_prefix="vLLM:"),
+).bind(llm_config)
+llm_app = LLMRouter.as_deployment().bind([deployment])
+serve.run(llm_app)