Unverified Commit 757223b3 authored by OlivierDehaene, committed by GitHub

feat: add SchedulerV3 (#1996)

- Refactor the code to support multiple versions of generate.proto at the
same time
- Add v3/generate.proto (identical to generate.proto for now, but it allows
future changes without impacting v2 backends)
- Add a Schedule trait to abstract the queuing and batching mechanisms,
which will diverge between versions in the future (an illustrative sketch
follows below)
- Add SchedulerV2/V3 implementations
parent fec0167a
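For orientation before the diff, here is a minimal sketch of what a version-agnostic scheduling abstraction along these lines could look like. The trait name, method signature, and helper types below are illustrative assumptions for this description, not the exact items the PR adds to the router.

// Illustrative sketch only: names and signatures are assumptions, not the
// exact trait introduced by this PR.

/// A validated request, independent of the wire-protocol version.
pub struct ValidRequest {
    pub inputs: String,
    pub max_new_tokens: u32,
}

/// Placeholder for the token stream handed back to the HTTP layer.
pub struct GenerateStream;

/// Abstracts queuing and batching so the router can drive a v2 or a v3
/// backend behind the same interface.
pub trait Schedule: Send + Sync {
    /// Enqueue a validated request and return a stream of its tokens.
    fn schedule(&self, request: ValidRequest) -> GenerateStream;
}

/// One implementation per protocol version, each owning its own queue,
/// batching loop and ShardedClient.
pub struct SchedulerV2;
pub struct SchedulerV3;

impl Schedule for SchedulerV2 {
    fn schedule(&self, _request: ValidRequest) -> GenerateStream {
        // v2: build generate.v2 Batch/Request messages and push them to the queue
        GenerateStream
    }
}

impl Schedule for SchedulerV3 {
    fn schedule(&self, _request: ValidRequest) -> GenerateStream {
        // v3: build generate.v3 messages (including input_chunks) instead
        GenerateStream
    }
}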
@@ -4,9 +4,9 @@ version = 3
 [[package]]
 name = "addr2line"
-version = "0.21.0"
+version = "0.22.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb"
+checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678"
 dependencies = [
  "gimli",
 ]
@@ -350,9 +350,9 @@ dependencies = [
 [[package]]
 name = "backtrace"
-version = "0.3.71"
+version = "0.3.72"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "26b05800d2e817c8b3b4b54abd461726265fa9789ae34330622f2db9ee696f9d"
+checksum = "17c6a35df3749d2e8bb1b7b21a976d82b15548788d2735b9d82f329268f71a11"
 dependencies = [
  "addr2line",
  "cc",
@@ -1138,9 +1138,9 @@ dependencies = [
 [[package]]
 name = "gimli"
-version = "0.28.1"
+version = "0.29.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253"
+checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd"
 [[package]]
 name = "glob"
@@ -1396,9 +1396,9 @@ dependencies = [
 [[package]]
 name = "hyper-util"
-version = "0.1.4"
+version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d8d52be92d09acc2e01dddb7fde3ad983fc6489c7db4837e605bc3fca4cb63e"
+checksum = "7b875924a60b96e5d7b9ae7b066540b1dd1cbd90d1828f54c92e02a283351c56"
 dependencies = [
  "bytes",
  "futures-util",
@@ -1938,11 +1938,10 @@ dependencies = [
 [[package]]
 name = "native-tls"
-version = "0.2.11"
+version = "0.2.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e"
+checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466"
 dependencies = [
- "lazy_static",
  "libc",
  "log",
  "openssl",
@@ -2168,9 +2167,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
 [[package]]
 name = "object"
-version = "0.32.2"
+version = "0.35.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441"
+checksum = "b8ec7ab813848ba4522158d5517a6093db1ded27575b070f4177b8d12b41db5e"
 dependencies = [
  "memchr",
 ]
@@ -2563,9 +2562,9 @@ dependencies = [
 [[package]]
 name = "proc-macro2"
-version = "1.0.84"
+version = "1.0.85"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ec96c6a92621310b51366f1e28d05ef11489516e93be030060e5fc12024a49d6"
+checksum = "22244ce15aa966053a896d1accb3a6e68469b97c7f33f284b99f0d576879fc23"
 dependencies = [
  "unicode-ident",
 ]
@@ -3554,6 +3553,7 @@ dependencies = [
 name = "text-generation-client"
 version = "2.0.5-dev0"
 dependencies = [
+ "async-trait",
  "base64 0.22.1",
  "futures",
  "grpc-metadata",
@@ -3752,9 +3752,9 @@ dependencies = [
 [[package]]
 name = "tokio"
-version = "1.37.0"
+version = "1.38.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787"
+checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a"
 dependencies = [
  "backtrace",
  "bytes",
@@ -3781,9 +3781,9 @@ dependencies = [
 [[package]]
 name = "tokio-macros"
-version = "2.2.0"
+version = "2.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b"
+checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -4733,9 +4733,9 @@ checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0"
 [[package]]
 name = "winnow"
-version = "0.6.8"
+version = "0.6.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c3c52e9c97a68071b23e836c9380edae937f17b9c4667bd021973efc689f618d"
+checksum = "86c949fede1d13936a99f14fafd3e76fd642b556dd2ce96287fbe2e0151bfac6"
 dependencies = [
  "memchr",
 ]
...
 use std::time::{Duration, Instant};
-use text_generation_client::{
-    Batch, CachedBatch, Chunk, ClientError, Input, NextTokenChooserParameters, Request,
-    ShardedClient, StoppingCriteriaParameters,
+use text_generation_client::v3::{
+    Batch, CachedBatch, NextTokenChooserParameters, Request, ShardedClient,
+    StoppingCriteriaParameters,
 };
+use text_generation_client::{Chunk, ClientError, Input};
 use tokenizers::{Tokenizer, TruncationDirection};
 use tokio::sync::{broadcast, mpsc};
...
@@ -8,7 +8,7 @@ use crate::app::App;
 use crate::event::Event;
 use crossterm::ExecutableCommand;
 use std::io;
-use text_generation_client::{GrammarType, NextTokenChooserParameters, ShardedClient};
+use text_generation_client::v3::{GrammarType, NextTokenChooserParameters, ShardedClient};
 use tokenizers::Tokenizer;
 use tokio::sync::{broadcast, mpsc};
 use tui::backend::CrosstermBackend;
...
@@ -4,7 +4,7 @@
 /// and: https://github.com/orhun/rust-tui-template
 use clap::Parser;
 use std::path::Path;
-use text_generation_client::ShardedClient;
+use text_generation_client::v3::ShardedClient;
 use tokenizers::{FromPretrainedParameters, Tokenizer};
 use tracing_subscriber::layer::SubscriberExt;
 use tracing_subscriber::util::SubscriberInitExt;
...
@@ -51,27 +51,6 @@ message ClearCacheRequest {
 /// Empty response
 message ClearCacheResponse {}
-message Image {
-    /// Binary image data.
-    bytes data = 1;
-    /// Image MIME type.
-    string mimetype = 2;
-}
-message InputChunk {
-    oneof chunk {
-        /// Plain text data
-        string text = 1;
-        /// Image data
-        Image image = 2;
-    }
-}
-message Input {
-    repeated InputChunk chunks = 1;
-}
 enum GrammarType {
     GRAMMAR_TYPE_NONE = 0;
     GRAMMAR_TYPE_JSON = 1;
@@ -116,9 +95,7 @@ message StoppingCriteriaParameters {
 message Request {
     /// Request ID
     uint64 id = 1;
-    /// The generation context as chunks
-    Input input_chunks = 8;
-    /// The generation context, stringified input_chunks
+    /// The generation context
     string inputs = 2;
     /// Context truncation
     uint32 truncate = 3;
...
syntax = "proto3";
package generate.v3;
service TextGenerationService {
/// Model Info
rpc Info (InfoRequest) returns (InfoResponse) {}
/// Service discovery
rpc ServiceDiscovery (ServiceDiscoveryRequest) returns (ServiceDiscoveryResponse) {}
/// Empties batch cache
rpc ClearCache (ClearCacheRequest) returns (ClearCacheResponse);
/// Remove requests from a cached batch
rpc FilterBatch (FilterBatchRequest) returns (FilterBatchResponse);
/// Warmup the model and compute max cache size
rpc Warmup (WarmupRequest) returns (WarmupResponse);
/// Prefill batch and decode first token
rpc Prefill (PrefillRequest) returns (PrefillResponse);
/// Decode token for a list of prefilled batches
rpc Decode (DecodeRequest) returns (DecodeResponse);
/// Health check
rpc Health (HealthRequest) returns (HealthResponse);
}
message HealthRequest {}
message HealthResponse {}
/// Empty request
message InfoRequest {}
message InfoResponse {
bool requires_padding = 1;
string dtype = 2;
string device_type = 3;
optional uint32 window_size = 4;
uint32 speculate = 5;
}
/// Empty request
message ServiceDiscoveryRequest {}
message ServiceDiscoveryResponse {
/// Other shards urls
repeated string urls = 1;
}
message ClearCacheRequest {
/// Optional batch id
optional uint64 id = 1;
}
/// Empty response
message ClearCacheResponse {}
message Image {
/// Binary image data.
bytes data = 1;
/// Image MIME type.
string mimetype = 2;
}
message InputChunk {
oneof chunk {
/// Plain text data
string text = 1;
/// Image data
Image image = 2;
}
}
message Input {
repeated InputChunk chunks = 1;
}
enum GrammarType {
GRAMMAR_TYPE_NONE = 0;
GRAMMAR_TYPE_JSON = 1;
GRAMMAR_TYPE_REGEX = 2;
}
message NextTokenChooserParameters {
/// exponential scaling output probability distribution
float temperature = 1;
/// restricting to the k highest probability elements
uint32 top_k = 2;
/// nucleus sampling: restrict to the smallest set of top tokens whose cumulative probability reaches top_p
float top_p = 3;
/// typical decoding: restrict to the smallest set of locally typical tokens whose cumulative probability reaches typical_p
float typical_p = 4;
/// apply sampling on the logits
bool do_sample = 5;
/// random seed for sampling
uint64 seed = 6;
/// repetition penalty
float repetition_penalty = 7;
/// frequency penalty
float frequency_penalty = 9;
/// token watermarking using "A Watermark for Large Language Models"
bool watermark = 8;
/// grammar (applied if not empty)
string grammar = 10;
/// grammar type
GrammarType grammar_type = 11;
}
message StoppingCriteriaParameters {
/// Maximum number of generated tokens
uint32 max_new_tokens = 1;
/// Optional stopping sequences
repeated string stop_sequences = 2;
/// Ignore end of sequence token
/// used for benchmarking
bool ignore_eos_token = 3;
}
message Request {
/// Request ID
uint64 id = 1;
/// The generation context as chunks
Input input_chunks = 8;
/// The generation context, stringified input_chunks
string inputs = 2;
/// Context truncation
uint32 truncate = 3;
/// Next Token Chooser Parameters
NextTokenChooserParameters parameters = 4;
/// Stopping Criteria Parameters
StoppingCriteriaParameters stopping_parameters = 5;
/// Return prefill logprobs
bool prefill_logprobs = 6;
/// Return most likely n tokens
uint32 top_n_tokens = 7;
}
message Batch {
/// Batch ID
uint64 id = 1;
/// Individual requests
repeated Request requests = 2;
/// Batch size (==len(requests))
uint32 size = 3;
/// Maximum number of tokens this batch will grow to
uint32 max_tokens = 4;
}
message CachedBatch {
/// Batch ID
uint64 id = 1;
/// Individual requests ids
repeated uint64 request_ids = 2;
/// Batch size (==len(requests))
uint32 size = 3;
/// Maximum number of tokens this batch will grow to
uint32 max_tokens = 4;
}
enum FinishReason {
FINISH_REASON_LENGTH = 0;
FINISH_REASON_EOS_TOKEN = 1;
FINISH_REASON_STOP_SEQUENCE = 2;
}
message GeneratedText {
/// Output
string text = 1;
/// Number of generated tokens
uint32 generated_tokens = 2;
/// Finish reason
FinishReason finish_reason = 3;
/// Seed
optional uint64 seed = 4;
}
message Tokens {
/// Token IDs
repeated uint32 ids = 1;
/// Logprobs
repeated float logprobs = 2;
/// tokens
repeated string texts = 3;
/// special
repeated bool is_special = 4;
}
message Generation {
/// Request ID
uint64 request_id = 1;
/// Prefill tokens (optional)
Tokens prefill_tokens = 2;
Tokens tokens = 3;
/// Complete generated text
optional GeneratedText generated_text = 4;
/// Top tokens
repeated Tokens top_tokens = 5;
}
message FilterBatchRequest {
/// Batch ID
uint64 batch_id = 1;
/// Requests to keep
repeated uint64 request_ids = 2;
}
message FilterBatchResponse {
/// Filtered Batch (cached)
CachedBatch batch = 1;
}
message PrefillRequest {
/// Batch
Batch batch = 1;
}
message PrefillResponse {
/// Generation
repeated Generation generations = 1;
/// Next batch (cached)
optional CachedBatch batch = 2;
/// Forward elapsed time in nanoseconds
uint64 forward_ns = 3;
/// Decode elapsed time in nanoseconds
uint64 decode_ns = 4;
/// Total elapsed time in nanoseconds
uint64 total_ns = 5;
}
message DecodeRequest {
/// Cached batches
repeated CachedBatch batches = 1;
}
message DecodeResponse {
/// Decodes
repeated Generation generations = 1;
/// Next batch (cached)
optional CachedBatch batch = 2;
/// Forward elapsed time in nanoseconds
uint64 forward_ns = 3;
/// Decode elapsed time in nanoseconds
uint64 decode_ns = 4;
/// Total elapsed time in nanoseconds
uint64 total_ns = 5;
/// Concatenate elapsed time in nanoseconds
optional uint64 concat_ns = 6;
}
message WarmupRequest {
/// Batch to warmup on
Batch batch = 1;
uint32 max_input_length = 2;
uint32 max_prefill_tokens = 3;
uint32 max_total_tokens = 4;
}
message WarmupResponse {
/// Maximum number of tokens supported by the model
optional uint32 max_supported_total_tokens = 1;
}
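Because a v3 Request carries the prompt twice, the sketch below shows how a caller might populate it with the prost-generated types that the client crate re-exports under text_generation_client::v3: the chunked input_chunks field plus the stringified inputs field kept for backends that only understand the flat form. The literal values are placeholders, not code from this PR.

// Minimal sketch using the generated generate.v3 types; the values are illustrative.
use text_generation_client::v3::{
    Chunk, GrammarType, Input, InputChunk, NextTokenChooserParameters, Request,
    StoppingCriteriaParameters,
};

fn example_request() -> Request {
    Request {
        id: 0,
        // Chunked form of the prompt (text and/or images)...
        input_chunks: Some(Input {
            chunks: vec![InputChunk {
                chunk: Some(Chunk::Text("Hello".to_string())),
            }],
        }),
        // ...and the stringified equivalent for backends without chunk support.
        inputs: "Hello".to_string(),
        truncate: 10,
        parameters: Some(NextTokenChooserParameters {
            temperature: 1.0,
            top_k: 0,
            top_p: 1.0,
            typical_p: 1.0,
            do_sample: false,
            seed: 0,
            repetition_penalty: 1.0,
            frequency_penalty: 0.0,
            watermark: false,
            grammar: String::new(),
            grammar_type: GrammarType::None as i32,
        }),
        stopping_parameters: Some(StoppingCriteriaParameters {
            max_new_tokens: 1,
            stop_sequences: vec![],
            ignore_eos_token: false,
        }),
        prefill_logprobs: false,
        top_n_tokens: 0,
    }
}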
@@ -6,6 +6,7 @@ authors.workspace = true
 homepage.workspace = true
 [dependencies]
+async-trait = "^0.1"
 base64 = { workspace = true }
 futures = "^0.3"
 grpc-metadata = { path = "../grpc-metadata" }
...
 use std::fs;
 fn main() -> Result<(), Box<dyn std::error::Error>> {
-    println!("cargo:rerun-if-changed=../../proto/generate.proto");
+    println!("cargo:rerun-if-changed=../../proto/**");
-    fs::create_dir("src/pb").unwrap_or(());
+    fs::create_dir_all("src/v2/pb").unwrap_or(());
     let mut config = prost_build::Config::new();
     config.protoc_arg("--experimental_allow_proto3_optional");
     tonic_build::configure()
         .build_client(true)
         .build_server(false)
-        .out_dir("src/pb")
+        .out_dir("src/v2/pb")
         .include_file("mod.rs")
         .compile_with_config(config, &["../../proto/generate.proto"], &["../../proto"])
         .unwrap_or_else(|e| panic!("protobuf compilation failed: {e}"));
+    fs::create_dir_all("src/v3/pb").unwrap_or(());
+    let mut config = prost_build::Config::new();
+    config.protoc_arg("--experimental_allow_proto3_optional");
+    tonic_build::configure()
+        .build_client(true)
+        .build_server(false)
+        .out_dir("src/v3/pb")
+        .include_file("mod.rs")
+        .compile_with_config(config, &["../../proto/v3/generate.proto"], &["../../proto"])
+        .unwrap_or_else(|e| panic!("protobuf compilation failed: {e}"));
     Ok(())
 }
 //! Text Generation gRPC client library
-mod client;
-#[allow(clippy::derive_partial_eq_without_eq)]
-mod pb;
-mod sharded_client;
+pub mod v2;
+pub mod v3;
+use async_trait::async_trait;
 use base64::{engine::general_purpose::STANDARD, Engine};
-pub use client::Client;
-pub use pb::generate::v2::input_chunk::Chunk;
-pub use pb::generate::v2::HealthResponse;
-pub use pb::generate::v2::Image;
-pub use pb::generate::v2::InfoResponse as ShardInfo;
-pub use pb::generate::v2::{
-    Batch, CachedBatch, FinishReason, GeneratedText, Generation, GrammarType, Input, InputChunk,
-    NextTokenChooserParameters, Request, StoppingCriteriaParameters, Tokens,
-};
-pub use sharded_client::ShardedClient;
 use thiserror::Error;
 use tonic::transport;
 use tonic::Status;
+pub use v3::{Chunk, Image, Input, InputChunk};
+#[async_trait]
+pub trait Health {
+    /// Check if a generate server is healthy by asking it to allocate a tensor on device
+    async fn device_health(&self) -> Result<()>;
+    /// Check if a generate server is healthy by doing a forward pass.
+    /// EXPENSIVE
+    async fn model_health(&self) -> Result<()>;
+}
+#[derive(Debug)]
+pub struct ShardInfo {
+    pub requires_padding: bool,
+    pub dtype: String,
+    pub device_type: String,
+    pub window_size: Option<u32>,
+    pub speculate: u32,
+}
 #[derive(Error, Debug, Clone)]
 pub enum ClientError {
     #[error("Could not connect to Text Generation server: {0}")]
@@ -46,8 +56,6 @@ impl From<transport::Error> for ClientError {
     }
 }
-pub type Result<T> = std::result::Result<T, ClientError>;
 // Small convenience re-wrapping of `Chunk`.
 impl From<Chunk> for InputChunk {
     fn from(chunk: Chunk) -> Self {
@@ -77,3 +85,7 @@ impl ChunksToString for Vec<InputChunk> {
         output
     }
 }
static WARMUP_IMAGE_BASE64 :&str = "iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII=";
pub type Result<T> = std::result::Result<T, ClientError>;
/// Single shard Client
use crate::v2::pb;
use crate::{ClientError, Result};
use crate::WARMUP_IMAGE_BASE64;
use grpc_metadata::InjectTelemetryContext;
use pb::generate::v2::text_generation_service_client::TextGenerationServiceClient;
use pb::generate::v2::*;
use std::cmp::min;
use std::time::Duration;
use tonic::transport::{Channel, Uri};
use tracing::instrument;
/// Text Generation Inference gRPC client
#[derive(Debug, Clone)]
pub struct Client {
stub: TextGenerationServiceClient<Channel>,
}
impl Client {
/// Returns a client connected to the given url
pub async fn connect(uri: Uri) -> Result<Self> {
let channel = Channel::builder(uri).connect().await?;
Ok(Self {
stub: TextGenerationServiceClient::new(channel),
})
}
/// Returns a client connected to the given unix socket
pub async fn connect_uds(path: String) -> Result<Self> {
let channel = Channel::from_shared("http://[::]:50051".to_string())
.unwrap()
.connect_with_connector(tower::service_fn(move |_: Uri| {
tokio::net::UnixStream::connect(path.clone())
}))
.await?;
Ok(Self {
stub: TextGenerationServiceClient::new(channel),
})
}
/// Returns a list of uris or unix sockets of all shards
#[instrument(skip(self))]
pub async fn service_discovery(&mut self) -> Result<Vec<String>> {
let request = tonic::Request::new(ServiceDiscoveryRequest {}).inject_context();
let response = self.stub.service_discovery(request).await.map_err(|_| {
ClientError::Connection("Server does not support v2 interface".to_string())
})?;
let urls = response
.into_inner()
.urls
.into_iter()
// Remove unix socket prefix
.map(|url| match url.strip_prefix("unix://") {
None => url,
Some(stripped_url) => stripped_url.to_string(),
})
.collect();
Ok(urls)
}
/// Get model info
#[instrument(skip(self))]
pub async fn info(&mut self) -> Result<InfoResponse> {
let request = tonic::Request::new(InfoRequest {}).inject_context();
let response = self.stub.info(request).await?.into_inner();
Ok(response)
}
/// Get model health
#[instrument(skip(self))]
pub async fn health(&mut self) -> Result<HealthResponse> {
let request = tonic::Request::new(HealthRequest {}).inject_context();
let response = self.stub.health(request).await?.into_inner();
Ok(response)
}
/// Clear the past generations cache
#[instrument(skip(self))]
pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
let request = tonic::Request::new(ClearCacheRequest { id: batch_id }).inject_context();
self.stub.clear_cache(request).await?;
Ok(())
}
/// Filter a cached batch
#[instrument(skip(self))]
pub async fn filter_batch(
&mut self,
batch_id: u64,
request_ids: Vec<u64>,
) -> Result<Option<CachedBatch>> {
let request = tonic::Request::new(FilterBatchRequest {
batch_id,
request_ids,
})
.inject_context();
let filtered_batch = self.stub.filter_batch(request).await?.into_inner();
Ok(filtered_batch.batch)
}
/// Warmup on a max size batch
///
/// Returns the maximum amount of tokens supported by the hardware
#[instrument(skip_all)]
pub async fn warmup(
&mut self,
max_input_length: u32,
max_prefill_tokens: u32,
max_total_tokens: u32,
max_batch_size: Option<usize>,
) -> Result<Option<u32>> {
let mut n_tokens = 0;
let mut requests = Vec::new();
// Create requests
while n_tokens < max_prefill_tokens {
let truncate = min(max_input_length, max_prefill_tokens - n_tokens);
let mut inputs = String::new();
inputs.push_str(&"_test ".to_string().repeat(max_input_length as usize));
if n_tokens == 0 {
// 1 request is enough to test vision heads.
// Sending images on other queries messes up easily with truncation.
inputs.push_str(&format!(
"![](data:image/jpeg;base64,{WARMUP_IMAGE_BASE64})",
));
}
requests.push(Request {
id: 0,
inputs,
// We truncate the input on the server side to be sure that it has the correct size
truncate,
// Set sampling parameters to also take these ops into account in the max memory
parameters: Some(NextTokenChooserParameters {
temperature: 0.9,
top_k: 10,
top_p: 0.9,
typical_p: 0.9,
do_sample: false,
seed: 0,
repetition_penalty: 1.2,
frequency_penalty: 0.1,
watermark: true,
grammar: String::new(),
grammar_type: GrammarType::None as i32,
}),
stopping_parameters: Some(StoppingCriteriaParameters {
max_new_tokens: max_total_tokens - truncate,
stop_sequences: vec![],
ignore_eos_token: true,
}),
prefill_logprobs: true,
top_n_tokens: 20,
});
n_tokens += max_input_length;
// Check max_batch_size
if Some(requests.len()) == max_batch_size {
break;
}
}
let batch = Batch {
id: 0,
size: requests.len() as u32,
requests,
max_tokens: 0,
};
let request = tonic::Request::new(WarmupRequest {
batch: Some(batch),
max_input_length,
max_prefill_tokens,
max_total_tokens,
})
.inject_context();
let response = self.stub.warmup(request).await?.into_inner();
Ok(response.max_supported_total_tokens)
}
/// Generate one token for each request in the given batch
///
/// Returns Generation for each request in batch
/// and the next cached batch
#[instrument(skip_all, fields(id = &batch.id, size = &batch.size))]
pub async fn prefill(
&mut self,
batch: Batch,
) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
let request = tonic::Request::new(PrefillRequest { batch: Some(batch) }).inject_context();
let response = self.stub.prefill(request).await?.into_inner();
Ok((
response.generations,
response.batch,
PrefillTimings::new(response.forward_ns, response.decode_ns, response.total_ns),
))
}
/// Generate one token for each request in the given cached batches
///
/// Returns Generation for each request in batches
/// and the next cached batch
#[instrument(skip_all, fields(size = batches.iter().map(|batch|{batch.size}).sum::<u32>()))]
pub async fn decode(
&mut self,
batches: Vec<CachedBatch>,
) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
let request = tonic::Request::new(DecodeRequest { batches }).inject_context();
let response = self.stub.decode(request).await?.into_inner();
Ok((
response.generations,
response.batch,
DecodeTimings::new(
response.concat_ns,
response.forward_ns,
response.decode_ns,
response.total_ns,
),
))
}
}
pub struct PrefillTimings {
pub forward: Duration,
pub decode: Duration,
pub total: Duration,
}
impl PrefillTimings {
fn new(forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
Self {
forward: Duration::from_nanos(forward_ns),
decode: Duration::from_nanos(decode_ns),
total: Duration::from_nanos(total_ns),
}
}
}
pub struct DecodeTimings {
pub concat: Option<Duration>,
pub forward: Duration,
pub decode: Duration,
pub total: Duration,
}
impl DecodeTimings {
fn new(concat_ns: Option<u64>, forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
Self {
concat: concat_ns.map(Duration::from_nanos),
forward: Duration::from_nanos(forward_ns),
decode: Duration::from_nanos(decode_ns),
total: Duration::from_nanos(total_ns),
}
}
}
#[allow(clippy::derive_partial_eq_without_eq)]
mod pb;
mod client;
mod sharded_client;
pub use client::Client;
pub use pb::generate::v2::HealthResponse;
pub use pb::generate::v2::{
Batch, CachedBatch, FinishReason, GeneratedText, Generation, GrammarType, InfoResponse,
NextTokenChooserParameters, Request, StoppingCriteriaParameters, Tokens,
};
pub use sharded_client::ShardedClient;
-use crate::client::{DecodeTimings, PrefillTimings};
 /// Multi shard Client
-use crate::{Batch, CachedBatch, Client, Generation, HealthResponse, ShardInfo};
+use crate::{v2, Health, ShardInfo};
 use crate::{ClientError, Result};
+use crate::v2::InfoResponse;
+use async_trait::async_trait;
 use futures::future::join_all;
 use tonic::transport::Uri;
 use tracing::instrument;
+use v2::client::{DecodeTimings, PrefillTimings};
+use v2::{
+    Batch, CachedBatch, Client, Generation, GrammarType, HealthResponse,
+    NextTokenChooserParameters, Request, StoppingCriteriaParameters,
+};
 #[derive(Debug, Clone)]
 /// Text Generation Inference gRPC multi client
@@ -47,7 +54,7 @@ impl ShardedClient {
             .iter_mut()
             .map(|client| client.info())
             .collect();
-        join_all(futures).await.pop().unwrap()
+        join_all(futures).await.pop().unwrap().map(ShardInfo::from)
     }
     /// GRPC health check
@@ -185,3 +192,60 @@ impl ShardedClient {
         Ok((generations, next_batch, timings))
     }
 }
impl From<InfoResponse> for ShardInfo {
fn from(value: InfoResponse) -> Self {
Self {
requires_padding: value.requires_padding,
dtype: value.dtype,
device_type: value.device_type,
window_size: value.window_size,
speculate: value.speculate,
}
}
}
#[async_trait]
impl Health for ShardedClient {
async fn device_health(&self) -> Result<()> {
self.clone().health().await?;
Ok(())
}
async fn model_health(&self) -> Result<()> {
// Dummy batch of 1 token and 1 generated token
let liveness_request = Request {
id: u64::MAX,
inputs: "liveness".to_string(),
truncate: 10,
prefill_logprobs: false,
parameters: Some(NextTokenChooserParameters {
temperature: 1.0,
top_k: 0,
top_p: 1.0,
typical_p: 1.0,
do_sample: false,
seed: 0,
repetition_penalty: 1.0,
frequency_penalty: 0.0,
watermark: false,
grammar: String::new(),
grammar_type: GrammarType::None as i32,
}),
stopping_parameters: Some(StoppingCriteriaParameters {
max_new_tokens: 1,
stop_sequences: vec![],
ignore_eos_token: false,
}),
top_n_tokens: 0,
};
let batch = Batch {
id: u64::MAX,
requests: vec![liveness_request],
size: 1,
max_tokens: 2,
};
self.clone().prefill(batch).await?;
Ok(())
}
}
+use crate::v3::{pb, Chunk};
+use crate::{ClientError, Result, WARMUP_IMAGE_BASE64};
 /// Single shard Client
-use crate::pb::generate::v2::text_generation_service_client::TextGenerationServiceClient;
-use crate::pb::generate::v2::*;
-use crate::{Chunk, Result};
 use base64::engine::general_purpose::STANDARD;
 use base64::Engine;
 use grpc_metadata::InjectTelemetryContext;
+use pb::generate::v3::text_generation_service_client::TextGenerationServiceClient;
+use pb::generate::v3::*;
 use std::cmp::min;
 use std::time::Duration;
 use tonic::transport::{Channel, Uri};
 use tracing::instrument;
static WARMUP_IMAGE_BASE64 :&str = "iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII=";
 /// Text Generation Inference gRPC client
 #[derive(Debug, Clone)]
 pub struct Client {
@@ -46,7 +45,9 @@ impl Client {
     #[instrument(skip(self))]
     pub async fn service_discovery(&mut self) -> Result<Vec<String>> {
         let request = tonic::Request::new(ServiceDiscoveryRequest {}).inject_context();
-        let response = self.stub.service_discovery(request).await?;
+        let response = self.stub.service_discovery(request).await.map_err(|_| {
+            ClientError::Connection("Server does not support v3 interface".to_string())
+        })?;
         let urls = response
             .into_inner()
             .urls
@@ -133,6 +134,7 @@ impl Client {
             // Send stringly-typed inputs for compatibility for backends that haven't
             // been updated to support chunks.
             let mut inputs = String::new();
             inputs.push_str(&"_test ".to_string().repeat(max_input_length as usize));
             if n_tokens == 0 {
@@ -145,10 +147,10 @@ impl Client {
             requests.push(Request {
                 id: 0,
+                inputs,
                 input_chunks: Some(Input {
                     chunks: input_chunks,
                 }),
-                inputs,
                 // We truncate the input on the server side to be sure that it has the correct size
                 truncate,
                 // Set sampling parameters to also take these ops into account in the max memory
...
#[allow(clippy::derive_partial_eq_without_eq)]
mod pb;
mod client;
mod sharded_client;
pub use client::Client;
pub use pb::generate::v3::{
input_chunk::Chunk, Batch, CachedBatch, FinishReason, GeneratedText, Generation, GrammarType,
HealthResponse, Image, InfoResponse, Input, InputChunk, NextTokenChooserParameters, Request,
StoppingCriteriaParameters, Tokens,
};
pub use sharded_client::ShardedClient;
/// Multi shard Client
use crate::{v3, Health, ShardInfo};
use crate::{ClientError, Result};
use crate::v3::{Chunk, InfoResponse, Input};
use async_trait::async_trait;
use futures::future::join_all;
use tonic::transport::Uri;
use tracing::instrument;
use v3::client::{DecodeTimings, PrefillTimings};
use v3::{
Batch, CachedBatch, Client, Generation, GrammarType, HealthResponse,
NextTokenChooserParameters, Request, StoppingCriteriaParameters,
};
#[derive(Debug, Clone)]
/// Text Generation Inference gRPC multi client
pub struct ShardedClient {
clients: Vec<Client>,
}
impl ShardedClient {
fn new(clients: Vec<Client>) -> Self {
Self { clients }
}
/// Create a new ShardedClient from a master client. The master client will communicate with
/// the other shards and returns all uris/unix sockets with the `service_discovery` gRPC method.
async fn from_master_client(mut master_client: Client) -> Result<Self> {
// Get all uris/unix sockets from the master client
let uris = master_client.service_discovery().await?;
let futures = uris.into_iter().map(Client::connect_uds);
let clients: Result<Vec<Client>> = join_all(futures).await.into_iter().collect();
Ok(Self::new(clients?))
}
/// Returns a client connected to the given uri
pub async fn connect(uri: Uri) -> Result<Self> {
let master_client = Client::connect(uri).await?;
Self::from_master_client(master_client).await
}
/// Returns a client connected to the given unix socket
pub async fn connect_uds(path: String) -> Result<Self> {
let master_client = Client::connect_uds(path).await?;
Self::from_master_client(master_client).await
}
/// Get the model info
#[instrument(skip(self))]
pub async fn info(&mut self) -> Result<ShardInfo> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| client.info())
.collect();
join_all(futures).await.pop().unwrap().map(ShardInfo::from)
}
/// GRPC health check
#[instrument(skip(self))]
pub async fn health(&mut self) -> Result<HealthResponse> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| client.health())
.collect();
join_all(futures).await.pop().unwrap()
}
/// Clear the past generations cache
#[instrument(skip(self))]
pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| client.clear_cache(batch_id))
.collect();
join_all(futures).await.into_iter().collect()
}
/// Filter a cached batch
#[instrument(skip(self))]
pub async fn filter_batch(
&mut self,
batch_id: u64,
request_ids: Vec<u64>,
) -> Result<Option<CachedBatch>> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| Box::pin(client.filter_batch(batch_id, request_ids.clone())))
.collect();
// all shards return the same message
join_all(futures).await.pop().unwrap()
}
/// Warmup on a max size batch
///
/// Returns the maximum amount of tokens supported by the hardware
#[instrument(skip(self))]
pub async fn warmup(
&mut self,
max_input_length: u32,
max_prefill_tokens: u32,
max_total_tokens: u32,
max_batch_size: Option<usize>,
) -> Result<Option<u32>> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| {
Box::pin(client.warmup(
max_input_length,
max_prefill_tokens,
max_total_tokens,
max_batch_size,
))
})
.collect();
// Take the minimum value
let results = join_all(futures)
.await
.into_iter()
.collect::<Result<Vec<Option<u32>>>>()?;
Ok(results.into_iter().flatten().min())
}
/// Generate one token for each request in the given batch
///
/// Returns Generation for each request in batch
/// and the next cached batch
#[instrument(skip_all, fields(id = & batch.id, size = & batch.size))]
pub async fn prefill(
&mut self,
batch: Batch,
) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| Box::pin(client.prefill(batch.clone())))
.collect();
#[allow(clippy::type_complexity)]
let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)>> =
join_all(futures).await.into_iter().collect();
let mut results = results?;
let (mut generations, next_batch, mut timings) =
results.pop().ok_or(ClientError::EmptyResults)?;
// Merge generations from different model shards
for (mut shard_generations, _, shard_timings) in results.into_iter() {
generations.append(&mut shard_generations);
// Return the timings of the slowest shard
if shard_timings.total > timings.total {
timings = shard_timings;
}
}
Ok((generations, next_batch, timings))
}
/// Generate one token for each request in the given cached batches
///
/// Returns Generation for each request in batches
/// and the next cached batch
#[instrument(skip_all, fields(size = batches.iter().map(| batch | {batch.size}).sum::< u32 > ()))]
pub async fn decode(
&mut self,
batches: Vec<CachedBatch>,
) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| Box::pin(client.decode(batches.clone())))
.collect();
#[allow(clippy::type_complexity)]
let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)>> =
join_all(futures).await.into_iter().collect();
let mut results = results?;
let (mut generations, next_batch, mut timings) =
results.pop().ok_or(ClientError::EmptyResults)?;
// Merge generations from different model shards
for (mut shard_generations, _, shard_timings) in results.into_iter() {
generations.append(&mut shard_generations);
// Return the timings of the slowest shard
if shard_timings.total > timings.total {
timings = shard_timings;
}
}
Ok((generations, next_batch, timings))
}
}
impl From<InfoResponse> for ShardInfo {
fn from(value: InfoResponse) -> Self {
Self {
requires_padding: value.requires_padding,
dtype: value.dtype,
device_type: value.device_type,
window_size: value.window_size,
speculate: value.speculate,
}
}
}
#[async_trait]
impl Health for ShardedClient {
async fn device_health(&self) -> Result<()> {
self.clone().health().await?;
Ok(())
}
async fn model_health(&self) -> Result<()> {
// Dummy batch of 1 token and 1 generated token
let liveness_request = Request {
id: u64::MAX,
inputs: "liveness".to_string(),
input_chunks: Some(Input {
chunks: vec![Chunk::Text("liveness".into()).into()],
}),
truncate: 10,
prefill_logprobs: false,
parameters: Some(NextTokenChooserParameters {
temperature: 1.0,
top_k: 0,
top_p: 1.0,
typical_p: 1.0,
do_sample: false,
seed: 0,
repetition_penalty: 1.0,
frequency_penalty: 0.0,
watermark: false,
grammar: String::new(),
grammar_type: GrammarType::None as i32,
}),
stopping_parameters: Some(StoppingCriteriaParameters {
max_new_tokens: 1,
stop_sequences: vec![],
ignore_eos_token: false,
}),
top_n_tokens: 0,
};
let batch = Batch {
id: u64::MAX,
requests: vec![liveness_request],
size: 1,
max_tokens: 2,
};
self.clone().prefill(batch).await?;
Ok(())
}
}
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use text_generation_client::{
Batch, Input, NextTokenChooserParameters, Request, ShardedClient, StoppingCriteriaParameters,
};
use text_generation_client::{Chunk, GrammarType as ProtoGrammarType};
// Note: Request ids and batch ids cannot collide.
const LIVENESS_ID: u64 = u64::MAX;
const BATCH_ID: u64 = u64::MAX;
#[derive(Clone, Debug)]
pub(crate) struct Health {
client: ShardedClient,
generation_health: Arc<AtomicBool>,
}
impl Health {
pub(crate) fn new(client: ShardedClient, generation_health: Arc<AtomicBool>) -> Self {
Self {
client,
generation_health,
}
}
pub(crate) async fn check(&mut self) -> bool {
if self.generation_health.load(Ordering::SeqCst) {
// Generation is healthy, we only check that the shards are answering gRPC calls
self.client.health().await.is_ok()
} else {
// Generation is unhealthy or have not sent any generation request yet
// Dummy batch of 1 token and 1 generated token
let liveness_request = Request {
id: LIVENESS_ID,
input_chunks: Some(Input {
chunks: vec![Chunk::Text("liveness".into()).into()],
}),
inputs: "liveness".to_string(),
truncate: 10,
prefill_logprobs: false,
parameters: Some(NextTokenChooserParameters {
temperature: 1.0,
top_k: 0,
top_p: 1.0,
typical_p: 1.0,
do_sample: false,
seed: 0,
repetition_penalty: 1.0,
frequency_penalty: 0.0,
watermark: false,
grammar: String::new(),
grammar_type: ProtoGrammarType::None as i32,
}),
stopping_parameters: Some(StoppingCriteriaParameters {
max_new_tokens: 1,
stop_sequences: vec![],
ignore_eos_token: false,
}),
top_n_tokens: 0,
};
let batch = Batch {
id: BATCH_ID,
requests: vec![liveness_request],
size: 1,
max_tokens: 2,
};
// Skips the queue
let value = self.client.prefill(batch).await.is_ok();
// Update generation health
self.generation_health.store(value, Ordering::SeqCst);
value
}
}
}
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use text_generation_client::Health;
#[derive(Clone)]
pub(crate) struct HealthCheck {
client: Arc<dyn Health + Send + Sync>,
generation_health: Arc<AtomicBool>,
}
impl HealthCheck {
pub(crate) fn new(
client: Arc<dyn Health + Send + Sync>,
generation_health: Arc<AtomicBool>,
) -> Self {
Self {
client,
generation_health,
}
}
pub(crate) async fn check(&mut self) -> bool {
let value = if self.generation_health.load(Ordering::SeqCst) {
// Generation is healthy, we only check that the shards can allocate on device
self.client.device_health().await
} else {
self.client.model_health().await
}
.is_ok();
// Update generation health
self.generation_health.store(value, Ordering::SeqCst);
value
}
}
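To show how the new pieces fit together, here is a hedged usage sketch: the router can build a HealthCheck from any client implementing the Health trait, such as the v3 ShardedClient. The socket path and wrapper function are placeholders, not code from this PR.

// Illustrative wiring only; the uds path is a placeholder.
use std::sync::atomic::AtomicBool;
use std::sync::Arc;
use text_generation_client::v3::ShardedClient;
use text_generation_client::ClientError;

async fn build_health_check(uds_path: String) -> Result<HealthCheck, ClientError> {
    // Connect to the master shard; it discovers and connects to the others.
    let client = ShardedClient::connect_uds(uds_path).await?;
    // Shared flag that the router updates as generations succeed or fail.
    let generation_health = Arc::new(AtomicBool::new(false));
    // ShardedClient implements Health, so it coerces to Arc<dyn Health + Send + Sync>.
    Ok(HealthCheck::new(Arc::new(client), generation_health))
}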