# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Deploy DeepSeek R1 or V3 with Ray Serve LLM.

Ray Serve LLM is a scalable and production-grade model serving library built
on the Ray distributed computing framework, with first-class support for the
vLLM engine.

Key features:
- Automatic scaling, back-pressure, and load balancing across a Ray cluster.
- Unified multi-node multi-model deployment.
- Exposes an OpenAI-compatible HTTP API.
- Multi-LoRA support with shared base models.

Run `python3 ray_serve_deepseek.py` to launch an endpoint.

Learn more in the official Ray Serve LLM documentation:
https://docs.ray.io/en/latest/serve/llm/serving-llms.html
"""

from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

# Serving configuration for a single DeepSeek deployment.
llm_config = LLMConfig(
    model_loading_config={
        # Name under which the model is served.
        "model_id": "deepseek",
        # Pre-downloading the model to local storage is recommended since
        # the model is large. Set model_source="/path/to/the/model".
        "model_source": "deepseek-ai/DeepSeek-R1",
    },
    deployment_config={
        # min_replicas == max_replicas pins the deployment to exactly one
        # replica; raise max_replicas to let it scale out.
        "autoscaling_config": {
            "min_replicas": 1,
            "max_replicas": 1,
        }
    },
    # Set to the node's accelerator type.
    accelerator_type="H100",
    # Customize engine arguments as required (for example, vLLM engine kwargs).
    engine_kwargs={
        # NOTE(review): tensor_parallel_size * pipeline_parallel_size
        # presumably means 16 GPUs per replica (e.g. two 8-GPU nodes);
        # confirm against your cluster and adjust both values to match.
        "tensor_parallel_size": 8,
        "pipeline_parallel_size": 2,
        "gpu_memory_utilization": 0.92,
        "dtype": "auto",
        "max_num_seqs": 40,
        "max_model_len": 16384,
        "enable_chunked_prefill": True,
        "enable_prefix_caching": True,
        "trust_remote_code": True,
    },
)

# Deploy the application: wrap the config in an OpenAI-compatible Serve app
# and run it.
llm_app = build_openai_app({"llm_configs": [llm_config]})
serve.run(llm_app)