"vscode:/vscode.git/clone" did not exist on "db6b0542c7ca13edcbad0bb90687bf680a685844"
use crate::{Batcher, Validation};
use axum::extract::Extension;
use axum::http::StatusCode;
use axum::routing::{get, post};
use axum::{Json, Router};
use bloom_inference_client::ShardedClient;
use serde::Deserialize;
use std::net::SocketAddr;
use tokenizers::Tokenizer;
use tokio::time::Instant;
use tracing::instrument;

/// Generation settings carried in the `parameters` field of a
/// [`GenerateRequest`].
///
/// Every field is optional when deserializing: serde fills a missing field
/// from the `default_*` function named in its attribute.
#[derive(Clone, Debug, Deserialize)]
pub(crate) struct GenerateParameters {
    /// Falls back to `default_temperature()` when omitted.
    #[serde(default = "default_temperature")]
    pub temperature: f32,
    /// Falls back to `default_top_k()` when omitted.
    #[serde(default = "default_top_k")]
    pub top_k: i32,
    /// Falls back to `default_top_p()` when omitted.
    #[serde(default = "default_top_p")]
    pub top_p: f32,
    /// Falls back to `default_do_sample()` when omitted.
    #[serde(default = "default_do_sample")]
    pub do_sample: bool,
    /// Falls back to `default_max_new_tokens()` when omitted.
    ///
    /// The `generate` handler also uses this value as the divisor for its
    /// `time_per_token` span field.
    #[serde(default = "default_max_new_tokens")]
    pub max_new_tokens: u32,
}

/// Serde fallback for `GenerateParameters::temperature` when the field is
/// absent from the request.
fn default_temperature() -> f32 {
    1.0_f32
}

/// Serde fallback for `GenerateParameters::top_k` when the field is absent
/// from the request.
fn default_top_k() -> i32 {
    0
}

/// Serde fallback for `GenerateParameters::top_p` when the field is absent
/// from the request.
fn default_top_p() -> f32 {
    1.0_f32
}

/// Serde fallback for `GenerateParameters::do_sample` when the field is
/// absent from the request.
fn default_do_sample() -> bool {
    let sample_by_default = false;
    sample_by_default
}

/// Serde fallback for `GenerateParameters::max_new_tokens` when the field is
/// absent from the request.
fn default_max_new_tokens() -> u32 {
    20_u32
}

/// Serde fallback for `GenerateRequest::parameters`: a `GenerateParameters`
/// in which every field takes its individual default.
fn default_parameters() -> GenerateParameters {
    // Bound in declaration order so the evaluation order matches the fields.
    let temperature = default_temperature();
    let top_k = default_top_k();
    let top_p = default_top_p();
    let do_sample = default_do_sample();
    let max_new_tokens = default_max_new_tokens();

    GenerateParameters {
        temperature,
        top_k,
        top_p,
        do_sample,
        max_new_tokens,
    }
}

/// Body of a generation request, deserialized from JSON by the `generate`
/// handler.
#[derive(Clone, Debug, Deserialize)]
pub(crate) struct GenerateRequest {
    /// Input text; required (no serde default is declared for it).
    pub inputs: String,
    /// Generation settings; when the whole object is omitted it is filled in
    /// by `default_parameters()`.
    #[serde(default = "default_parameters")]
    pub parameters: GenerateParameters,
}

#[instrument(skip(state), fields(time, time_per_token))]
Olivier Dehaene's avatar
Olivier Dehaene committed
65
66
async fn liveness(state: Extension<ServerState>) -> Result<(), (StatusCode, String)> {
    state
Olivier Dehaene's avatar
Olivier Dehaene committed
67
68
69
70
71
72
73
74
75
76
77
78
79
80
        .infer
        .infer(
            1,
            GenerateRequest {
                inputs: "liveness".to_string(),
                parameters: GenerateParameters {
                    temperature: 1.0,
                    top_k: 0,
                    top_p: 1.0,
                    do_sample: false,
                    max_new_tokens: 1,
                },
            },
        )
Olivier Dehaene's avatar
Olivier Dehaene committed
81
82
        .await?;
    Ok(())
Olivier Dehaene's avatar
Olivier Dehaene committed
83
84
}

#[instrument(skip(state), fields(time, time_per_token))]
Olivier Dehaene's avatar
Olivier Dehaene committed
86
async fn generate(
Olivier Dehaene's avatar
Olivier Dehaene committed
87
    state: Extension<ServerState>,
Olivier Dehaene's avatar
Olivier Dehaene committed
88
    req: Json<GenerateRequest>,
Olivier Dehaene's avatar
Olivier Dehaene committed
89
) -> Result<Json<serde_json::Value>, (StatusCode, String)> {
Olivier Dehaene's avatar
Olivier Dehaene committed
90
91
    let start = Instant::now();

Olivier Dehaene's avatar
Olivier Dehaene committed
92
    let (input_length, validated_request) = state
Olivier Dehaene's avatar
Olivier Dehaene committed
93
        .validation
Olivier Dehaene's avatar
Olivier Dehaene committed
94
        .validate(GenerateRequest {
Olivier Dehaene's avatar
Olivier Dehaene committed
95
96
97
            inputs: req.inputs.clone(),
            parameters: req.parameters.clone(),
        })
Olivier Dehaene's avatar
Olivier Dehaene committed
98
99
100
101
102
103
104
105
106
107
108
109
110
111
        .await?;

    let generated_text = state.infer.infer(input_length, validated_request).await?;

    tracing::Span::current().record("time", format!("{:?}", start.elapsed()));
    tracing::Span::current().record(
        "time_per_token",
        format!("{:?}", start.elapsed() / req.parameters.max_new_tokens),
    );
    tracing::info!("response: {}", generated_text);

    Ok(Json(serde_json::json!({
        "generated_text": generated_text,
    })))
Olivier Dehaene's avatar
Olivier Dehaene committed
112
113
}

/// State shared with the request handlers through an axum `Extension` layer.
#[derive(Clone)]
struct ServerState {
    /// Validates incoming requests before inference (used by `generate`).
    validation: Validation,
    /// Runs inference for `generate` and for the `liveness` probe.
    infer: Batcher,
}

pub async fn run(client: ShardedClient, tokenizer: Tokenizer, addr: SocketAddr) {
Olivier Dehaene's avatar
Olivier Dehaene committed
121
    client.clear_cache().await.expect("Unable to clear cache");
Olivier Dehaene's avatar
Olivier Dehaene committed
122
123
124
125
    tracing::info!("Connected");

    let infer = Batcher::new(client);

Olivier Dehaene's avatar
Olivier Dehaene committed
126
127
    let validation = Validation::new(tokenizer);

Olivier Dehaene's avatar
Olivier Dehaene committed
128
    let shared_state = ServerState { validation, infer };
Olivier Dehaene's avatar
Olivier Dehaene committed
129

Olivier Dehaene's avatar
Olivier Dehaene committed
130
131
132
    let app = Router::new()
        .route("/generate", post(generate))
        .layer(Extension(shared_state.clone()))
Olivier Dehaene's avatar
Olivier Dehaene committed
133
        .route("/health", get(liveness))
Olivier Dehaene's avatar
Olivier Dehaene committed
134
        .layer(Extension(shared_state.clone()));
Olivier Dehaene's avatar
Olivier Dehaene committed
135

Olivier Dehaene's avatar
Olivier Dehaene committed
136
    axum::Server::bind(&addr)
Olivier Dehaene's avatar
Olivier Dehaene committed
137
138
139
        .serve(app.into_make_service())
        .await
        .unwrap();
Olivier Dehaene's avatar
Olivier Dehaene committed
140
}