# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Example to deploy DeepSeek R1 or V3 with Ray Serve LLM.

See more details at:
https://docs.ray.io/en/latest/serve/tutorials/serve-deepseek.html
And see Ray Serve LLM documentation at:
https://docs.ray.io/en/latest/serve/llm/serving-llms.html

Run `python3 ray_serve_deepseek.py` to deploy the model.
"""

from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

# Ray Serve LLM configuration for a single DeepSeek deployment.
llm_config = LLMConfig(
    model_loading_config={
        "model_id": "deepseek",
        # Since the DeepSeek model is huge, it is recommended to pre-download
        # the model to local disk, say /path/to/the/model, and specify:
        # "model_source": "/path/to/the/model"
        "model_source": "deepseek-ai/DeepSeek-R1",
    },
    deployment_config={
        # min_replicas == max_replicas pins the deployment to exactly one
        # replica; widen the range to let Serve autoscale.
        "autoscaling_config": {
            "min_replicas": 1,
            "max_replicas": 1,
        }
    },
    # Change to the accelerator type of the node
    accelerator_type="H100",
    runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
    # Customize engine arguments as needed (e.g. vLLM engine kwargs)
    engine_kwargs={
        # Each replica spans tensor_parallel_size * pipeline_parallel_size
        # GPUs (8 * 2 = 16 here); adjust both to match the cluster.
        "tensor_parallel_size": 8,
        "pipeline_parallel_size": 2,
        "gpu_memory_utilization": 0.92,
        "dtype": "auto",
        "max_num_seqs": 40,
        "max_model_len": 16384,
        "enable_chunked_prefill": True,
        "enable_prefix_caching": True,
        "trust_remote_code": True,
    },
)

# Build the OpenAI-compatible Serve application from the LLM config and
# deploy it.
llm_app = build_openai_app({"llm_configs": [llm_config]})
serve.run(llm_app)