main.rs 6.93 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::env;

use clap::Parser;

Neelay Shah's avatar
Neelay Shah committed
20
21
use dynamo_run::{Input, Output};
use dynamo_runtime::logging;
22
23

/// Long-form help text. `wrapper` prints it after `USAGE` when the binary is started with
/// no arguments, `-h`, or `--help`.
const HELP: &str = r#"
dynamo-run is a single binary that wires together the various inputs (http, text, network) and workers (network, engine), that runs the services. It is the simplest way to use dynamo locally.

Example:
- cargo build --release --features mistralrs,cuda
- cd target/release
- ./dynamo-run hf_checkouts/Llama-3.2-3B-Instruct/
- OR: ./dynamo-run Llama-3.2-1B-Instruct-Q4_K_M.gguf

"#;

34
35
36
37
38
39
40
41
/// Input that `wrapper` falls back to when the command line carries no `in=` argument.
const DEFAULT_IN: Input = Input::Text;

/// Output that `wrapper` falls back to when the command line carries no `out=` argument.
/// This definition is selected when the crate is built with the `mistralrs` feature.
#[cfg(feature = "mistralrs")]
const DEFAULT_OUT: Output = Output::MistralRs;

/// Output that `wrapper` falls back to when the command line carries no `out=` argument.
/// This definition is selected when the crate is built without the `mistralrs` feature.
#[cfg(not(feature = "mistralrs"))]
const DEFAULT_OUT: Output = Output::EchoFull;

42
/// Prefix string handed to the sglang/vllm `run_subprocess` launchers and to `dynamo_run::run`.
/// NOTE(review): presumably used to name the ZMQ sockets shared with engine subprocesses —
/// confirm against the engine implementations.
const ZMQ_SOCKET_PREFIX: &str = "dyn";
43

Neelay Shah's avatar
Neelay Shah committed
44
/// One-line usage summary. `wrapper` prints it ahead of `HELP` and appends it to the
/// "Invalid argument" error.
const USAGE: &str = "USAGE: dynamo-run in=[http|text|dyn://<path>|none] out=[mistralrs|sglang|llamacpp|vllm|trtllm|echo_full|echo_core|pystr:<engine.py>|pytok:<engine.py>] [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0]";
45
46
47
48

fn main() -> anyhow::Result<()> {
    logging::init();

49
50
    // Call sub-processes before starting the Runtime machinery
    // For anything except sub-process starting try_parse_from will error.
Neelay Shah's avatar
Neelay Shah committed
51
    if let Ok(flags) = dynamo_run::Flags::try_parse_from(env::args()) {
52
53
54
55
56
57
58
59
60
61
62
        #[allow(unused_variables)]
        if let Some(sglang_flags) = flags.internal_sglang_process {
            let Some(model_path) = flags.model_path_flag.as_ref() else {
                anyhow::bail!("sglang subprocess requires --model-path");
            };
            if !model_path.is_dir() {
                anyhow::bail!("sglang subprocess requires model path to be a directory containing the safetensors files");
            }
            if cfg!(feature = "sglang") {
                #[cfg(feature = "sglang")]
                {
Neelay Shah's avatar
Neelay Shah committed
63
                    use dynamo_llm::engines::sglang;
64
65
66
67
68
                    let gpu_config = sglang::MultiGPUConfig {
                        tp_size: flags.tensor_parallel_size,
                        tp_rank: sglang_flags.tp_rank,
                        gpu_id: sglang_flags.gpu_id,
                    };
Neelay Shah's avatar
Neelay Shah committed
69
                    let node_config = dynamo_llm::engines::MultiNodeConfig {
70
71
                        num_nodes: flags.num_nodes,
                        node_rank: flags.node_rank,
72
                        leader_addr: flags.leader_addr.unwrap_or_default(),
73
74
75
76
77
78
79
80
81
82
83
84
85
                    };
                    return sglang::run_subprocess(
                        ZMQ_SOCKET_PREFIX,
                        model_path,
                        sglang_flags.pipe_fd as std::os::fd::RawFd,
                        node_config,
                        gpu_config,
                    );
                }
            } else {
                panic!("Rebuild with --features=sglang");
            }
        }
Graham King's avatar
Graham King committed
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100

        #[allow(unused_variables)]
        if flags.internal_vllm_process {
            let Some(model_path) = flags.model_path_flag else {
                anyhow::bail!("vllm subprocess requires --model-path flag");
            };
            let Some(model_config) = flags.model_config else {
                anyhow::bail!("vllm subprocess requires --model-config");
            };
            if !model_config.is_dir() {
                anyhow::bail!("vllm subprocess requires model config path to be a directory containing tokenizer.json, config.json, etc");
            }
            if cfg!(feature = "vllm") {
                #[cfg(feature = "vllm")]
                {
Neelay Shah's avatar
Neelay Shah committed
101
102
                    use dynamo_llm::engines::vllm;
                    let node_config = dynamo_llm::engines::MultiNodeConfig {
103
104
105
106
107
108
109
110
111
112
113
                        num_nodes: flags.num_nodes,
                        node_rank: flags.node_rank,
                        leader_addr: flags.leader_addr.unwrap_or_default(),
                    };
                    return vllm::run_subprocess(
                        ZMQ_SOCKET_PREFIX,
                        &model_config,
                        &model_path,
                        node_config,
                        flags.tensor_parallel_size,
                    );
Graham King's avatar
Graham King committed
114
115
116
117
118
                }
            } else {
                panic!("Rebuild with --features=vllm");
            }
        }
119
120
    }

121
    // max_worker_threads and max_blocking_threads from env vars or config file.
Neelay Shah's avatar
Neelay Shah committed
122
    let rt_config = dynamo_runtime::RuntimeConfig::from_settings()?;
123
124

    // One per process. Wraps a Runtime with holds two tokio runtimes.
Neelay Shah's avatar
Neelay Shah committed
125
    let worker = dynamo_runtime::Worker::from_config(rt_config)?;
126

127
    worker.execute(wrapper)
128
129
}

Neelay Shah's avatar
Neelay Shah committed
130
async fn wrapper(runtime: dynamo_runtime::Runtime) -> anyhow::Result<()> {
131
132
133
134
135
136
137
138
139
140
    let mut in_opt = None;
    let mut out_opt = None;
    let args: Vec<String> = env::args().skip(1).collect();
    if args.is_empty() || args[0] == "-h" || args[0] == "--help" {
        println!("{USAGE}");
        println!("{HELP}");
        return Ok(());
    }
    for arg in env::args().skip(1).take(2) {
        let Some((in_out, val)) = arg.split_once('=') else {
141
142
            // Probably we're defaulting in and/or out, and this is a flag
            continue;
143
144
145
146
147
148
149
150
151
152
153
154
155
        };
        match in_out {
            "in" => {
                in_opt = Some(val.try_into()?);
            }
            "out" => {
                out_opt = Some(val.try_into()?);
            }
            _ => {
                anyhow::bail!("Invalid argument, must start with 'in' or 'out. {USAGE}");
            }
        }
    }
156
157
158
159
160
161
162
163
164
165
166
167
168
169
    let mut non_flag_params = 1; // binary name
    let in_opt = match in_opt {
        Some(x) => {
            non_flag_params += 1;
            x
        }
        None => DEFAULT_IN,
    };
    let out_opt = match out_opt {
        Some(x) => {
            non_flag_params += 1;
            x
        }
        None => DEFAULT_OUT,
170
171
172
    };

    // Clap skips the first argument expecting it to be the binary name, so add it back
173
    // Note `--model-path` has index=1 (in lib.rs) so that doesn't need a flag.
Neelay Shah's avatar
Neelay Shah committed
174
175
    let flags = dynamo_run::Flags::try_parse_from(
        ["dynamo-run".to_string()]
176
177
178
            .into_iter()
            .chain(env::args().skip(non_flag_params)),
    )?;
179

Neelay Shah's avatar
Neelay Shah committed
180
    dynamo_run::run(
181
182
183
184
185
186
187
        runtime,
        in_opt,
        out_opt,
        flags,
        Some(ZMQ_SOCKET_PREFIX.to_string()),
    )
    .await
188
}