use crate::{Batcher, Validation};
use axum::extract::Extension;
use axum::http::StatusCode;
use axum::routing::{get, post};
use axum::{Json, Router};
use bloom_inference_client::ShardedClient;
use serde::Deserialize;
use std::net::SocketAddr;
use tokenizers::Tokenizer;
use tokio::time::Instant;
use tracing::instrument;

/// Optional generation settings carried by a [`GenerateRequest`].
///
/// Every field has a `serde(default = "...")` attribute, so any field missing from the
/// incoming payload is filled in by the named `default_*` function.
#[derive(Clone, Debug, Deserialize)]
pub(crate) struct GenerateParameters {
    /// Falls back to `default_temperature()` when absent from the payload.
    #[serde(default = "default_temperature")]
    pub temperature: f32,
    /// Falls back to `default_top_k()` when absent from the payload.
    #[serde(default = "default_top_k")]
    pub top_k: i32,
    /// Falls back to `default_top_p()` when absent from the payload.
    #[serde(default = "default_top_p")]
    pub top_p: f32,
    /// Falls back to `default_do_sample()` when absent from the payload.
    #[serde(default = "default_do_sample")]
    pub do_sample: bool,
    /// Falls back to `default_max_new_tokens()` when absent from the payload.
    /// The `generate` handler also divides the elapsed time by this value to report
    /// a per-token duration.
    #[serde(default = "default_max_new_tokens")]
    pub max_new_tokens: u32,
}

/// Serde fallback for `GenerateParameters::temperature`.
fn default_temperature() -> f32 {
    const DEFAULT_TEMPERATURE: f32 = 1.0;
    DEFAULT_TEMPERATURE
}

/// Serde fallback for `GenerateParameters::top_k`.
///
/// NOTE(review): presumably `0` means "no top-k filtering" — confirm against the sampler.
fn default_top_k() -> i32 {
    0
}

/// Serde fallback for `GenerateParameters::top_p`.
fn default_top_p() -> f32 {
    const DEFAULT_TOP_P: f32 = 1.0;
    DEFAULT_TOP_P
}

/// Serde fallback for `GenerateParameters::do_sample`.
fn default_do_sample() -> bool {
    const DEFAULT_DO_SAMPLE: bool = false;
    DEFAULT_DO_SAMPLE
}

/// Serde fallback for `GenerateParameters::max_new_tokens`.
fn default_max_new_tokens() -> u32 {
    const DEFAULT_MAX_NEW_TOKENS: u32 = 20;
    DEFAULT_MAX_NEW_TOKENS
}

/// Serde fallback for `GenerateRequest::parameters`: a `GenerateParameters` in which every
/// field holds the value returned by its own `default_*` function.
fn default_parameters() -> GenerateParameters {
    let temperature = default_temperature();
    let top_k = default_top_k();
    let top_p = default_top_p();
    let do_sample = default_do_sample();
    let max_new_tokens = default_max_new_tokens();

    GenerateParameters {
        temperature,
        top_k,
        top_p,
        do_sample,
        max_new_tokens,
    }
}

/// Body accepted by the `generate` handler.
#[derive(Debug, Clone, Deserialize)]
pub(crate) struct GenerateRequest {
    /// Input text; this field has no serde default, so it must be present in the payload.
    pub inputs: String,
    /// Generation settings; when the whole object is missing, `default_parameters()` is used.
    #[serde(default = "default_parameters")]
    pub parameters: GenerateParameters,
}

Olivier Dehaene's avatar
Olivier Dehaene committed
64
#[instrument(skip(state), fields(time, time_per_token))]
Olivier Dehaene's avatar
Olivier Dehaene committed
65
66
async fn liveness(state: Extension<ServerState>) -> Result<(), (StatusCode, String)> {
    state
Olivier Dehaene's avatar
Olivier Dehaene committed
67
        .batcher
Olivier Dehaene's avatar
Olivier Dehaene committed
68
69
70
71
72
73
74
75
76
77
78
79
80
        .infer(
            1,
            GenerateRequest {
                inputs: "liveness".to_string(),
                parameters: GenerateParameters {
                    temperature: 1.0,
                    top_k: 0,
                    top_p: 1.0,
                    do_sample: false,
                    max_new_tokens: 1,
                },
            },
        )
Olivier Dehaene's avatar
Olivier Dehaene committed
81
82
        .await?;
    Ok(())
Olivier Dehaene's avatar
Olivier Dehaene committed
83
84
}

Olivier Dehaene's avatar
Olivier Dehaene committed
85
#[instrument(skip(state), fields(time, time_per_token))]
Olivier Dehaene's avatar
Olivier Dehaene committed
86
async fn generate(
Olivier Dehaene's avatar
Olivier Dehaene committed
87
    state: Extension<ServerState>,
Olivier Dehaene's avatar
Olivier Dehaene committed
88
    req: Json<GenerateRequest>,
Olivier Dehaene's avatar
Olivier Dehaene committed
89
) -> Result<Json<serde_json::Value>, (StatusCode, String)> {
Olivier Dehaene's avatar
Olivier Dehaene committed
90
91
    let start = Instant::now();

Olivier Dehaene's avatar
Olivier Dehaene committed
92
    let (input_length, validated_request) = state
Olivier Dehaene's avatar
Olivier Dehaene committed
93
        .validation
Olivier Dehaene's avatar
Olivier Dehaene committed
94
        .validate(GenerateRequest {
Olivier Dehaene's avatar
Olivier Dehaene committed
95
96
97
            inputs: req.inputs.clone(),
            parameters: req.parameters.clone(),
        })
Olivier Dehaene's avatar
Olivier Dehaene committed
98
99
        .await?;

Olivier Dehaene's avatar
Olivier Dehaene committed
100
    let generated_text = state.batcher.infer(input_length, validated_request).await?;
Olivier Dehaene's avatar
Olivier Dehaene committed
101
102
103
104
105
106
107
108
109
110
111

    tracing::Span::current().record("time", format!("{:?}", start.elapsed()));
    tracing::Span::current().record(
        "time_per_token",
        format!("{:?}", start.elapsed() / req.parameters.max_new_tokens),
    );
    tracing::info!("response: {}", generated_text);

    Ok(Json(serde_json::json!({
        "generated_text": generated_text,
    })))
Olivier Dehaene's avatar
Olivier Dehaene committed
112
113
}

Olivier Dehaene's avatar
Olivier Dehaene committed
114
115
116
#[derive(Clone)]
struct ServerState {
    validation: Validation,
Olivier Dehaene's avatar
Olivier Dehaene committed
117
    batcher: Batcher,
Olivier Dehaene's avatar
Olivier Dehaene committed
118
119
}

Olivier Dehaene's avatar
Olivier Dehaene committed
120
121
pub async fn run(max_batch_size: usize, client: ShardedClient, tokenizer: Tokenizer, addr: SocketAddr) {
    let batcher = Batcher::new(client, max_batch_size);
Olivier Dehaene's avatar
Olivier Dehaene committed
122
123
    let validation = Validation::new(tokenizer);

Olivier Dehaene's avatar
Olivier Dehaene committed
124
    let shared_state = ServerState { validation, batcher };
Olivier Dehaene's avatar
Olivier Dehaene committed
125

Olivier Dehaene's avatar
Olivier Dehaene committed
126
127
128
    let app = Router::new()
        .route("/generate", post(generate))
        .layer(Extension(shared_state.clone()))
Olivier Dehaene's avatar
Olivier Dehaene committed
129
        .route("/health", get(liveness))
Olivier Dehaene's avatar
Olivier Dehaene committed
130
        .layer(Extension(shared_state.clone()));
Olivier Dehaene's avatar
Olivier Dehaene committed
131

Olivier Dehaene's avatar
Olivier Dehaene committed
132
    axum::Server::bind(&addr)
Olivier Dehaene's avatar
Olivier Dehaene committed
133
134
135
        .serve(app.into_make_service())
        .await
        .unwrap();
Olivier Dehaene's avatar
Olivier Dehaene committed
136
}