llm_backend.rs 6.06 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use serde::{Deserialize, Serialize};

18
19
pub use super::preprocessor::PreprocessedRequest;
pub use super::FinishReason;
20
use crate::protocols::TokenIdType;
21
use dynamo_runtime::protocols::maybe_error::MaybeError;
22
23
24
25

/// Text of a single token.
// NOTE(review): `None` presumably means the token text is unavailable — confirm with producers.
pub type TokenType = Option<String>;
/// A list of log probability values.
pub type LogProbs = Vec<f64>;

Greg Clark's avatar
Greg Clark committed
26
27
28
29
30
31
32
33
34
/// A single candidate token together with its log probability.
// NOTE(review): `rank` presumably orders candidates by likelihood — confirm with producers.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct TopLogprob {
    /// Rank of this candidate.
    pub rank: u32,
    /// Token id of this candidate.
    pub token_id: TokenIdType,
    /// Text of this candidate, if present (`TokenType` is an `Option<String>`).
    pub token: TokenType,
    /// Log probability of this candidate.
    pub logprob: f64,
}
/// Nested list of [`TopLogprob`] candidates: the outer vector is indexed per token, the
/// inner vector holds the candidates for that token.
pub type TopLogprobs = Vec<Vec<TopLogprob>>; // num_tokens x top_logprobs

35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct BackendOutput {
    /// New token_ids generated from the LLM Engine
    pub token_ids: Vec<TokenIdType>,

    /// Unlike [`LLMEngineOutput::tokens`], this is a vector of tokens, not an optional.
    /// The size of this vector should be the same as the size of `token_ids`.
    pub tokens: Vec<TokenType>,

    /// Decoded text from the list tokens.
    pub text: Option<String>,

    /// Optional cumulative log probabilities
    pub cum_log_probs: Option<f64>,

    /// Optional log probabilities
    pub log_probs: Option<LogProbs>,

Greg Clark's avatar
Greg Clark committed
53
54
    pub top_logprobs: Option<TopLogprobs>,

55
56
57
    // TODO: Enrich this with more information as can apply our first-level postprocessing
    // logic and return more detailed information
    pub finish_reason: Option<FinishReason>,
58
59
    // Model Deployment Card checksum
    //pub mdcsum: String,
60
61
62

    // Index field for batch requests to match OpenAI format
    pub index: Option<u32>,
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
}

/// The LLM engine and backnd with manage it's own state, specifically translating how a
/// given request/slot is managed on that particular backend.
///
/// For nvLLM's purpose, it has a single tracable request_id as part of it's context that
/// has propaged through the service pipeline to the backend.
///
/// This is the minimal raw output from the LLM engine. The Backend may then apply multiple
/// levels of post-processing before the BackendOutput is returns
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct LLMEngineOutput {
    // new token_ids
    pub token_ids: Vec<TokenIdType>,

    /// If the LLM Engine performs the detokenization, then this will have a Some of the detokenized
    /// text/tokens. If this value is None, then the Backend is responsible for detokenization.
    pub tokens: Option<Vec<TokenType>>,

    // decoded text -
    pub text: Option<String>,

    /// cumulative log probabilities
    pub cum_log_probs: Option<f64>,

    /// Optional log probabilities
    pub log_probs: Option<LogProbs>,

Greg Clark's avatar
Greg Clark committed
91
92
    pub top_logprobs: Option<TopLogprobs>,

93
94
95
    // TODO: Enrich this with more information as can apply our first-level postprocessing
    // logic and return more detailed information
    pub finish_reason: Option<FinishReason>,
96
97
98

    // Index field for batch requests to match OpenAI format
    pub index: Option<u32>,
99
100
101
102
103
104
105
106
107
108
}

impl LLMEngineOutput {
    pub fn cancelled() -> Self {
        LLMEngineOutput {
            token_ids: vec![],
            tokens: None,
            text: None,
            cum_log_probs: None,
            log_probs: None,
Greg Clark's avatar
Greg Clark committed
109
            top_logprobs: None,
110
            finish_reason: Some(FinishReason::Cancelled),
111
            index: None,
112
113
114
115
116
117
118
119
120
121
122
        }
    }

    pub fn stop() -> Self {
        LLMEngineOutput {
            token_ids: vec![],
            tokens: None,
            text: None,
            cum_log_probs: None,
            log_probs: None,
            finish_reason: Some(FinishReason::Stop),
Greg Clark's avatar
Greg Clark committed
123
            top_logprobs: None,
124
            index: None,
125
126
127
128
129
130
131
132
133
134
        }
    }

    pub fn length() -> Self {
        LLMEngineOutput {
            token_ids: vec![],
            tokens: None,
            text: None,
            cum_log_probs: None,
            log_probs: None,
Greg Clark's avatar
Greg Clark committed
135
            top_logprobs: None,
136
            finish_reason: Some(FinishReason::Length),
137
            index: None,
138
139
140
141
142
143
144
145
146
147
        }
    }

    pub fn error(err_msg: String) -> Self {
        LLMEngineOutput {
            token_ids: vec![],
            tokens: None,
            text: None,
            cum_log_probs: None,
            log_probs: None,
Greg Clark's avatar
Greg Clark committed
148
            top_logprobs: None,
149
            finish_reason: Some(FinishReason::Error(err_msg)),
150
            index: None,
151
152
153
        }
    }
}
154

155
impl MaybeError for LLMEngineOutput {
156
    fn from_err(err: Box<dyn std::error::Error + Send + Sync>) -> Self {
157
158
159
        LLMEngineOutput::error(format!("{:?}", err))
    }

160
    fn err(&self) -> Option<Box<dyn std::error::Error + Send + Sync>> {
161
162
163
164
165
166
167
168
        if let Some(FinishReason::Error(err_msg)) = &self.finish_reason {
            Some(anyhow::Error::msg(err_msg.clone()).into())
        } else {
            None
        }
    }
}

169
170
171
172
173
174
175
176
177
178
/// Raw output from embedding engines containing embedding vectors
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct EmbeddingsEngineOutput {
    /// Generated embedding vectors (one per input text)
    pub embeddings: Vec<Vec<f64>>,

    /// Token usage information: number of prompt tokens
    pub prompt_tokens: u32,
    /// Token usage information: total number of tokens
    pub total_tokens: u32,
}
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196

#[cfg(test)]
mod tests {
    use super::*;

    /// A `stop` output must report no error; an `error` output must report its message.
    #[test]
    fn test_maybe_error() {
        let stopped = LLMEngineOutput::stop();
        assert!(stopped.err().is_none());
        assert!(stopped.is_ok());
        assert!(!stopped.is_err());

        let failed = LLMEngineOutput::error("Test error".to_string());
        let message = failed.err().unwrap().to_string();
        assert_eq!(message, "Test error");
        assert!(!failed.is_ok());
        assert!(failed.is_err());
    }
}