server.rs 3.93 KB
Newer Older
Olivier Dehaene's avatar
Olivier Dehaene committed
1
2
use bloom_inference_client::ShardedClient;
use crate::{Batcher, Validation};
Olivier Dehaene's avatar
Olivier Dehaene committed
3
use axum::extract::Extension;
Olivier Dehaene's avatar
Olivier Dehaene committed
4
use axum::http::StatusCode;
Olivier Dehaene's avatar
Olivier Dehaene committed
5
use axum::routing::{get, post};
Olivier Dehaene's avatar
Olivier Dehaene committed
6
use axum::{Json, Router};
Olivier Dehaene's avatar
Olivier Dehaene committed
7
use serde::Deserialize;
Olivier Dehaene's avatar
Olivier Dehaene committed
8
use std::net::SocketAddr;
Olivier Dehaene's avatar
Olivier Dehaene committed
9
use tokenizers::Tokenizer;
Olivier Dehaene's avatar
Olivier Dehaene committed
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
use tokio::time::Instant;
use tracing::instrument;

/// Generation options carried in the `parameters` field of a
/// [`GenerateRequest`].
///
/// Every field is optional when deserializing: a field missing from the
/// payload is filled in by the function named in its
/// `#[serde(default = "...")]` attribute.
#[derive(Clone, Debug, Deserialize)]
pub(crate) struct GenerateParameters {
    /// Defaults to `default_temperature()` (1.0) when omitted.
    /// NOTE(review): presumably the sampling temperature; confirm against the
    /// inference backend.
    #[serde(default = "default_temperature")]
    pub temperature: f32,
    /// Defaults to `default_top_k()` (0) when omitted.
    #[serde(default = "default_top_k")]
    pub top_k: u32,
    /// Defaults to `default_top_p()` (1.0) when omitted.
    #[serde(default = "default_top_p")]
    pub top_p: f32,
    /// Defaults to `default_do_sample()` (false) when omitted.
    #[serde(default = "default_do_sample")]
    pub do_sample: bool,
    /// Defaults to `default_max_new_tokens()` (20) when omitted.
    /// `generate` divides the request latency by this value to report
    /// `time_per_token`.
    #[serde(default = "default_max_new_tokens")]
    pub max_new_tokens: u32,
}

/// Serde fallback for `GenerateParameters::temperature` when the field is
/// absent from the request.
fn default_temperature() -> f32 {
    const TEMPERATURE: f32 = 1.0;
    TEMPERATURE
}

/// Serde fallback for `GenerateParameters::top_k` when the field is absent
/// from the request.
fn default_top_k() -> u32 {
    const TOP_K: u32 = 0;
    TOP_K
}

/// Serde fallback for `GenerateParameters::top_p` when the field is absent
/// from the request.
fn default_top_p() -> f32 {
    const TOP_P: f32 = 1.0;
    TOP_P
}

/// Serde fallback for `GenerateParameters::do_sample` when the field is
/// absent from the request.
fn default_do_sample() -> bool {
    const DO_SAMPLE: bool = false;
    DO_SAMPLE
}

/// Serde fallback for `GenerateParameters::max_new_tokens` when the field is
/// absent from the request.
fn default_max_new_tokens() -> u32 {
    const MAX_NEW_TOKENS: u32 = 20;
    MAX_NEW_TOKENS
}

/// Serde fallback for `GenerateRequest::parameters`: a `GenerateParameters`
/// in which every field holds its individual default.
fn default_parameters() -> GenerateParameters {
    let temperature = default_temperature();
    let top_k = default_top_k();
    let top_p = default_top_p();
    let do_sample = default_do_sample();
    let max_new_tokens = default_max_new_tokens();

    GenerateParameters {
        temperature,
        top_k,
        top_p,
        do_sample,
        max_new_tokens,
    }
}

/// Body of a generation request as deserialized by the `generate` handler.
#[derive(Clone, Debug, Deserialize)]
pub(crate) struct GenerateRequest {
    /// Input text for the request. Required: it has no serde default.
    pub inputs: String,
    /// Generation options; when the key is absent the whole struct is built
    /// by `default_parameters()`.
    #[serde(default = "default_parameters")]
    pub parameters: GenerateParameters,
}

Olivier Dehaene's avatar
Olivier Dehaene committed
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
/// Health-check handler (mounted on `GET /health` by `run`).
///
/// Sends a fixed probe request (`inputs = "liveness"`, `max_new_tokens = 1`)
/// through the batcher. Responds with an empty success when the inference
/// call succeeds and with `500 Internal Server Error` when it fails; the
/// generated text and the underlying error are both discarded.
#[instrument(skip(state), fields(time, time_per_token))]
async fn liveness(state: Extension<ServerState>) -> Result<(), StatusCode> {
    let probe = GenerateRequest {
        inputs: String::from("liveness"),
        parameters: GenerateParameters {
            temperature: 1.0,
            top_k: 0,
            top_p: 1.0,
            do_sample: false,
            max_new_tokens: 1,
        },
    };

    // The first argument is the slot `generate` fills with the validated
    // input length; the probe skips validation and passes a constant 1.
    state
        .infer
        .infer(1, probe)
        .await
        .map(|_| ())
        .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)
}

Olivier Dehaene's avatar
Olivier Dehaene committed
89
#[instrument(skip(state), fields(time, time_per_token))]
Olivier Dehaene's avatar
Olivier Dehaene committed
90
async fn generate(
Olivier Dehaene's avatar
Olivier Dehaene committed
91
    state: Extension<ServerState>,
Olivier Dehaene's avatar
Olivier Dehaene committed
92
    req: Json<GenerateRequest>,
Olivier Dehaene's avatar
Olivier Dehaene committed
93
) -> Result<Json<serde_json::Value>, StatusCode> {
Olivier Dehaene's avatar
Olivier Dehaene committed
94
95
    let start = Instant::now();

Olivier Dehaene's avatar
Olivier Dehaene committed
96
97
    let (input_length, validated_request) = match state
        .validation
Olivier Dehaene's avatar
Olivier Dehaene committed
98
        .validate(GenerateRequest {
Olivier Dehaene's avatar
Olivier Dehaene committed
99
100
101
            inputs: req.inputs.clone(),
            parameters: req.parameters.clone(),
        })
Olivier Dehaene's avatar
Olivier Dehaene committed
102
103
        .await
    {
104
        Ok(result) => result,
Olivier Dehaene's avatar
Olivier Dehaene committed
105
        Err(_) => return Err(StatusCode::INTERNAL_SERVER_ERROR),
106
    };
Olivier Dehaene's avatar
Olivier Dehaene committed
107

Olivier Dehaene's avatar
Olivier Dehaene committed
108
    let output = state.infer.infer(input_length, validated_request).await;
Olivier Dehaene's avatar
Olivier Dehaene committed
109
110
111
112
113
114
115
116
117
118
119
120
121
122

    match output {
        Ok(generated_text) => {
            tracing::Span::current().record("time", format!("{:?}", start.elapsed()));
            tracing::Span::current().record(
                "time_per_token",
                format!("{:?}", start.elapsed() / req.parameters.max_new_tokens),
            );
            tracing::info!("response: {}", generated_text);

            Ok(Json(serde_json::json!({
                "generated_text": generated_text,
            })))
        }
Olivier Dehaene's avatar
Olivier Dehaene committed
123
        Err(_) => Err(StatusCode::INTERNAL_SERVER_ERROR),
Olivier Dehaene's avatar
Olivier Dehaene committed
124
125
126
    }
}

Olivier Dehaene's avatar
Olivier Dehaene committed
127
128
129
130
131
132
/// State shared with every request handler through an axum `Extension`.
#[derive(Clone)]
struct ServerState {
    /// Validates incoming requests; `generate` calls `validate` on it and
    /// gets back the input length together with the validated request.
    validation: Validation,
    /// Runs inference; handlers call `infer(input_length, request)` on it.
    infer: Batcher,
}

Olivier Dehaene's avatar
Olivier Dehaene committed
133
pub async fn run(client: ShardedClient, tokenizer: Tokenizer, addr: SocketAddr) {
Olivier Dehaene's avatar
Olivier Dehaene committed
134
    client.clear_cache().await.expect("Unable to clear cache");
Olivier Dehaene's avatar
Olivier Dehaene committed
135
136
137
138
    tracing::info!("Connected");

    let infer = Batcher::new(client);

Olivier Dehaene's avatar
Olivier Dehaene committed
139
140
    let validation = Validation::new(tokenizer);

Olivier Dehaene's avatar
Olivier Dehaene committed
141
    let shared_state = ServerState { validation, infer };
Olivier Dehaene's avatar
Olivier Dehaene committed
142

Olivier Dehaene's avatar
Olivier Dehaene committed
143
144
145
    let app = Router::new()
        .route("/generate", post(generate))
        .layer(Extension(shared_state.clone()))
Olivier Dehaene's avatar
Olivier Dehaene committed
146
        .route("/health", get(liveness))
Olivier Dehaene's avatar
Olivier Dehaene committed
147
        .layer(Extension(shared_state.clone()));
Olivier Dehaene's avatar
Olivier Dehaene committed
148

Olivier Dehaene's avatar
Olivier Dehaene committed
149
    axum::Server::bind(&addr)
Olivier Dehaene's avatar
Olivier Dehaene committed
150
151
152
        .serve(app.into_make_service())
        .await
        .unwrap();
Olivier Dehaene's avatar
Olivier Dehaene committed
153
}