// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

4
use std::sync::{Arc, OnceLock};
5

6
7
8
use dynamo_llm::{self as llm_rs};
use llm_rs::model_card::ModelDeploymentCard as RsModelDeploymentCard;
use llm_rs::model_type::{ModelInput, ModelType};
9
10
use pyo3::prelude::*;

11
12
13
14
use crate::{
    CancellationToken, DistributedRuntime, engine::*, llm::local_model::ModelRuntimeConfig,
    to_pyerr,
};
15
16
17
18
19
20

pub use dynamo_llm::grpc::service::kserve;

#[pyclass]
pub struct KserveGrpcService {
    inner: kserve::KserveService,
21
22
    // CancellationToken is already Send + Sync + Clone, no Mutex needed
    cancel_token: Arc<OnceLock<CancellationToken>>,
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
}

#[pymethods]
impl KserveGrpcService {
    #[new]
    #[pyo3(signature = (port=None, host=None))]
    pub fn new(port: Option<u16>, host: Option<String>) -> PyResult<Self> {
        let mut builder = kserve::KserveService::builder();
        if let Some(port) = port {
            builder = builder.port(port);
        }
        if let Some(host) = host {
            builder = builder.host(host);
        }
        let inner = builder.build().map_err(to_pyerr)?;
38
39
40
41
        Ok(Self {
            inner,
            cancel_token: Arc::new(OnceLock::new()),
        })
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
    }

    pub fn add_completions_model(
        &self,
        model: String,
        checksum: String,
        engine: PythonAsyncEngine,
    ) -> PyResult<()> {
        let engine = Arc::new(engine);
        self.inner
            .model_manager()
            .add_completions_model(&model, &checksum, engine)
            .map_err(to_pyerr)
    }

    pub fn add_chat_completions_model(
        &self,
        model: String,
        checksum: String,
        engine: PythonAsyncEngine,
    ) -> PyResult<()> {
        let engine = Arc::new(engine);
        self.inner
            .model_manager()
            .add_chat_completions_model(&model, &checksum, engine)
            .map_err(to_pyerr)
    }

70
    #[pyo3(signature = (model, checksum, engine, runtime_config=None))]
71
72
73
74
75
    pub fn add_tensor_model(
        &self,
        model: String,
        checksum: String,
        engine: PythonAsyncEngine,
76
        runtime_config: Option<ModelRuntimeConfig>,
77
    ) -> PyResult<()> {
78
79
80
81
82
83
84
85
86
87
88
89
90
91
        // If runtime_config is provided, create and save a ModelDeploymentCard
        // so the ModelConfig endpoint can return model configuration
        if let Some(runtime_config) = runtime_config {
            let mut card = RsModelDeploymentCard::with_name_only(&model);
            card.model_type = ModelType::TensorBased;
            card.model_input = ModelInput::Tensor;
            card.runtime_config = runtime_config.inner;

            self.inner
                .model_manager()
                .save_model_card(&model, card)
                .map_err(to_pyerr)?;
        }

92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
        let engine = Arc::new(engine);
        self.inner
            .model_manager()
            .add_tensor_model(&model, &checksum, engine)
            .map_err(to_pyerr)
    }

    pub fn remove_completions_model(&self, model: String) -> PyResult<()> {
        self.inner
            .model_manager()
            .remove_completions_model(&model)
            .map_err(to_pyerr)
    }

    pub fn remove_chat_completions_model(&self, model: String) -> PyResult<()> {
        self.inner
            .model_manager()
            .remove_chat_completions_model(&model)
            .map_err(to_pyerr)
    }

    pub fn remove_tensor_model(&self, model: String) -> PyResult<()> {
114
        // Remove the engine
115
116
117
        self.inner
            .model_manager()
            .remove_tensor_model(&model)
118
119
120
121
122
123
124
            .map_err(to_pyerr)?;

        // Also remove the model card if it exists
        // (It's ok if it doesn't exist since runtime_config is optional, we just ignore the None return)
        let _ = self.inner.model_manager().remove_model_card(&model);

        Ok(())
125
126
127
128
129
130
131
132
133
134
135
136
137
138
    }

    pub fn list_chat_completions_models(&self) -> PyResult<Vec<String>> {
        Ok(self.inner.model_manager().list_chat_completions_models())
    }

    pub fn list_completions_models(&self) -> PyResult<Vec<String>> {
        Ok(self.inner.model_manager().list_completions_models())
    }

    pub fn list_tensor_models(&self) -> PyResult<Vec<String>> {
        Ok(self.inner.model_manager().list_tensor_models())
    }

139
140
141
142
143
144
145
146
    fn run<'p>(&self, py: Python<'p>, runtime: &DistributedRuntime) -> PyResult<Bound<'p, PyAny>> {
        // Check if run() was already called to avoid creating unnecessary token
        if self.cancel_token.get().is_some() {
            return Err(PyErr::new::<pyo3::exceptions::PyRuntimeError, _>(
                "KserveGrpcService.run() has already been called on this instance",
            ));
        }

147
        let service = self.inner.clone();
148
149
150
151
152
153
154
155
156
157
158
159
160
161
        // Only create token if we passed the check above
        let token = runtime.inner().child_token();

        // Store the token for shutdown - should always succeed after the check above
        self.cancel_token
            .set(CancellationToken {
                inner: token.clone(),
            })
            .map_err(|_| {
                PyErr::new::<pyo3::exceptions::PyRuntimeError, _>(
                    "Race condition detected in KserveGrpcService.run()",
                )
            })?;

162
        pyo3_async_runtimes::tokio::future_into_py(py, async move {
163
            service.run(token).await.map_err(to_pyerr)?;
164
165
166
            Ok(())
        })
    }
167
168
169
170
171
172
173

    fn shutdown(&self) {
        // CancellationToken.cancel() is thread-safe, no lock needed
        if let Some(token) = self.cancel_token.get() {
            token.inner.cancel();
        }
    }
174
}