update

e019635f · xuxzh1 · 64def8e2 · 64def8e2 · 64def8e2 · 64def8e2
Commit e019635f authored Nov 01, 2024 by xuxzh1 🎱
20 changed files
--- a/load_tests/starcoder_load.js
+++ b/load_tests/starcoder_load.js
-import {check} from 'k6';
-import http from 'k6/http';
-import {Trend} from 'k6/metrics';
-
-const host = __ENV.HOST || '127.0.0.1:3000';
-
-const totalTime = new Trend('total_time', true);
-const validationTime = new Trend('validation_time', true);
-const queueTime = new Trend('queue_time', true);
-const inferenceTime = new Trend('inference_time', true);
-const timePerToken = new Trend('time_per_token', true);
-
-const example = {
-    payload: JSON.stringify({
-        inputs: '# This is a fibonacci function written in the Python programming language.' +
-            'def fibonacci',
-        parameters: {
-            details: true,
-            max_new_tokens: 60,
-            temperature: 0.2,
-            top_p: 0.95,
-            seed: 0,
-        },
-    }),
-    generated_tokens: 60
-};
-
-export const options = {
-    thresholds: {
-        http_req_failed: ['rate==0'],
-        time_per_token: ['p(95)<90'],
-        queue_time: ['p(95)<1500'],
-    },
-    scenarios: {
-        load_test: {
-            executor: 'constant-arrival-rate',
-            duration: '60s',
-            preAllocatedVUs: 100,
-            rate: 10,
-            timeUnit: '1s',
-        },
-    },
-};
-
-export default function () {
-    const headers = {'Content-Type': 'application/json'};
-    const res = http.post(`http://${host}/generate`, example.payload, {
-        headers,
-    });
-
-    check(res, {
-        'Post status is 200': (r) => res.status === 200,
-        'Post response generated tokens': (r) => res.status === 200 && res.json().details.generated_tokens === example.generated_tokens,
-    });
-
-    if (res.status === 200) {
-        totalTime.add(res.headers["X-Total-Time"]);
-        validationTime.add(res.headers["X-Validation-Time"]);
-        queueTime.add(res.headers["X-Queue-Time"]);
-        inferenceTime.add(res.headers["X-Inference-Time"]);
-        timePerToken.add(res.headers["X-Time-Per-Token"]);
-    }
-}
--- a/router/client/src/client.rs
+++ b/router/client/src/client.rs
-/// Single shard Client
-use crate::pb::generate::v2::text_generation_service_client::TextGenerationServiceClient;
-use crate::pb::generate::v2::*;
-use crate::Result;
-use grpc_metadata::InjectTelemetryContext;
-use std::cmp::min;
-use std::time::Duration;
-use tonic::transport::{Channel, Uri};
-use tracing::instrument;
-
-/// Text Generation Inference gRPC client
-#[derive(Debug, Clone)]
-pub struct Client {
-    stub: TextGenerationServiceClient<Channel>,
-}
-
-impl Client {
-    /// Returns a client connected to the given url
-    pub async fn connect(uri: Uri) -> Result<Self> {
-        let channel = Channel::builder(uri).connect().await?;
-
-        Ok(Self {
-            stub: TextGenerationServiceClient::new(channel),
-        })
-    }
-
-    /// Returns a client connected to the given unix socket
-    pub async fn connect_uds(path: String) -> Result<Self> {
-        let channel = Channel::from_shared("http://[::]:50051".to_string())
-            .unwrap()
-            .connect_with_connector(tower::service_fn(move |_: Uri| {
-                tokio::net::UnixStream::connect(path.clone())
-            }))
-            .await?;
-
-        Ok(Self {
-            stub: TextGenerationServiceClient::new(channel),
-        })
-    }
-
-    /// Returns a list of uris or unix sockets of all shards
-    #[instrument(skip(self))]
-    pub async fn service_discovery(&mut self) -> Result<Vec<String>> {
-        let request = tonic::Request::new(ServiceDiscoveryRequest {}).inject_context();
-        let response = self.stub.service_discovery(request).await?;
-        let urls = response
-            .into_inner()
-            .urls
-            .into_iter()
-            // Remove unix socket prefix
-            .map(|url| match url.strip_prefix("unix://") {
-                None => url,
-                Some(stripped_url) => stripped_url.to_string(),
-            })
-            .collect();
-        Ok(urls)
-    }
-
-    /// Get model info
-    #[instrument(skip(self))]
-    pub async fn info(&mut self) -> Result<InfoResponse> {
-        let request = tonic::Request::new(InfoRequest {}).inject_context();
-        let response = self.stub.info(request).await?.into_inner();
-        Ok(response)
-    }
-
-    /// Get model health
-    #[instrument(skip(self))]
-    pub async fn health(&mut self) -> Result<HealthResponse> {
-        let request = tonic::Request::new(HealthRequest {}).inject_context();
-        let response = self.stub.health(request).await?.into_inner();
-        Ok(response)
-    }
-
-    /// Clear the past generations cache
-    #[instrument(skip(self))]
-    pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
-        let request = tonic::Request::new(ClearCacheRequest { id: batch_id }).inject_context();
-        self.stub.clear_cache(request).await?;
-        Ok(())
-    }
-
-    /// Filter a cached batch
-    #[instrument(skip(self))]
-    pub async fn filter_batch(
-        &mut self,
-        batch_id: u64,
-        request_ids: Vec<u64>,
-    ) -> Result<Option<CachedBatch>> {
-        let request = tonic::Request::new(FilterBatchRequest {
-            batch_id,
-            request_ids,
-        })
-        .inject_context();
-        let filtered_batch = self.stub.filter_batch(request).await?.into_inner();
-        Ok(filtered_batch.batch)
-    }
-
-    /// Warmup on a max size batch
-    ///
-    /// Returns the maximum number of tokens supported by the hardware
-    #[instrument(skip_all)]
-    pub async fn warmup(
-        &mut self,
-        max_input_length: u32,
-        max_prefill_tokens: u32,
-        max_total_tokens: u32,
-        max_batch_size: Option<usize>,
-    ) -> Result<Option<u32>> {
-        let mut n_tokens = 0;
-        let mut requests = Vec::new();
-        // Create requests
-        while n_tokens < max_prefill_tokens {
-            let truncate = min(max_input_length, max_prefill_tokens - n_tokens);
-
-            let mut inputs = String::new();
-            inputs.push_str(&"_test ".to_string().repeat(max_input_length as usize));
-            if n_tokens == 0 {
-                // One request is enough to test the vision heads.
-                // Sending images in the other requests easily goes wrong when combined with truncation.
-                inputs.push_str("![](data:image/jpeg;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII=)");
-            }
-
-            requests.push(Request {
-                id: 0,
-                // We truncate the input on the server side to be sure that it has the correct size
-                inputs,
-                truncate,
-                // Set sampling parameters to also take these ops into account in the max memory
-                parameters: Some(NextTokenChooserParameters {
-                    temperature: 0.9,
-                    top_k: 10,
-                    top_p: 0.9,
-                    typical_p: 0.9,
-                    do_sample: false,
-                    seed: 0,
-                    repetition_penalty: 1.2,
-                    frequency_penalty: 0.1,
-                    watermark: true,
-                    grammar: String::new(),
-                    grammar_type: GrammarType::None as i32,
-                }),
-                stopping_parameters: Some(StoppingCriteriaParameters {
-                    max_new_tokens: max_total_tokens - truncate,
-                    stop_sequences: vec![],
-                    ignore_eos_token: true,
-                }),
-                prefill_logprobs: true,
-                top_n_tokens: 20,
-            });
-            n_tokens += max_input_length;
-
-            // Check max_batch_size
-            if Some(requests.len()) == max_batch_size {
-                break;
-            }
-        }
-
-        let batch = Batch {
-            id: 0,
-            size: requests.len() as u32,
-            requests,
-            max_tokens: 0,
-        };
-
-        let request = tonic::Request::new(WarmupRequest {
-            batch: Some(batch),
-            max_input_length,
-            max_prefill_tokens,
-            max_total_tokens,
-        })
-        .inject_context();
-        let response = self.stub.warmup(request).await?.into_inner();
-        Ok(response.max_supported_total_tokens)
-    }
-
-    /// Generate one token for each request in the given batch
-    ///
-    /// Returns Generation for each request in batch
-    /// and the next cached batch
-    #[instrument(skip_all, fields(id = &batch.id, size = &batch.size))]
-    pub async fn prefill(
-        &mut self,
-        batch: Batch,
-    ) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
-        let request = tonic::Request::new(PrefillRequest { batch: Some(batch) }).inject_context();
-        let response = self.stub.prefill(request).await?.into_inner();
-        Ok((
-            response.generations,
-            response.batch,
-            PrefillTimings::new(response.forward_ns, response.decode_ns, response.total_ns),
-        ))
-    }
-
-    /// Generate one token for each request in the given cached batches
-    ///
-    /// Returns Generation for each request in batches
-    /// and the next cached batch
-    #[instrument(skip_all, fields(size = batches.iter().map(|batch|{batch.size}).sum::<u32>()))]
-    pub async fn decode(
-        &mut self,
-        batches: Vec<CachedBatch>,
-    ) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
-        let request = tonic::Request::new(DecodeRequest { batches }).inject_context();
-        let response = self.stub.decode(request).await?.into_inner();
-        Ok((
-            response.generations,
-            response.batch,
-            DecodeTimings::new(
-                response.concat_ns,
-                response.forward_ns,
-                response.decode_ns,
-                response.total_ns,
-            ),
-        ))
-    }
-}
-
-pub struct PrefillTimings {
-    pub forward: Duration,
-    pub decode: Duration,
-    pub total: Duration,
-}
-
-impl PrefillTimings {
-    fn new(forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
-        Self {
-            forward: Duration::from_nanos(forward_ns),
-            decode: Duration::from_nanos(decode_ns),
-            total: Duration::from_nanos(total_ns),
-        }
-    }
-}
-
-pub struct DecodeTimings {
-    pub concat: Option<Duration>,
-    pub forward: Duration,
-    pub decode: Duration,
-    pub total: Duration,
-}
-
-impl DecodeTimings {
-    fn new(concat_ns: Option<u64>, forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
-        Self {
-            concat: concat_ns.map(Duration::from_nanos),
-            forward: Duration::from_nanos(forward_ns),
-            decode: Duration::from_nanos(decode_ns),
-            total: Duration::from_nanos(total_ns),
-        }
-    }
-}
--- a/router/client/src/pb/.gitignore
+++ b/router/client/src/pb/.gitignore
-*.rs
--- a/router/client/src/sharded_client.rs
+++ b/router/client/src/sharded_client.rs
-use crate::client::{DecodeTimings, PrefillTimings};
-/// Multi shard Client
-use crate::{Batch, CachedBatch, Client, Generation, HealthResponse, ShardInfo};
-use crate::{ClientError, Result};
-use futures::future::join_all;
-use tonic::transport::Uri;
-use tracing::instrument;
-
-#[derive(Debug, Clone)]
-/// Text Generation Inference gRPC multi client
-pub struct ShardedClient {
-    clients: Vec<Client>,
-}
-
-impl ShardedClient {
-    fn new(clients: Vec<Client>) -> Self {
-        Self { clients }
-    }
-
-    /// Create a new ShardedClient from a master client. The master client communicates with
-    /// the other shards and returns all URIs/unix sockets via the `service_discovery` gRPC method.
-    async fn from_master_client(mut master_client: Client) -> Result<Self> {
-        // Get all uris/unix sockets from the master client
-        let uris = master_client.service_discovery().await?;
-        let futures = uris.into_iter().map(Client::connect_uds);
-        let clients: Result<Vec<Client>> = join_all(futures).await.into_iter().collect();
-        Ok(Self::new(clients?))
-    }
-
-    /// Returns a client connected to the given uri
-    pub async fn connect(uri: Uri) -> Result<Self> {
-        let master_client = Client::connect(uri).await?;
-        Self::from_master_client(master_client).await
-    }
-
-    /// Returns a client connected to the given unix socket
-    pub async fn connect_uds(path: String) -> Result<Self> {
-        let master_client = Client::connect_uds(path).await?;
-        Self::from_master_client(master_client).await
-    }
-
-    /// Get the model info
-    #[instrument(skip(self))]
-    pub async fn info(&mut self) -> Result<ShardInfo> {
-        let futures: Vec<_> = self
-            .clients
-            .iter_mut()
-            .map(|client| client.info())
-            .collect();
-        join_all(futures).await.pop().unwrap()
-    }
-
-    /// gRPC health check
-    #[instrument(skip(self))]
-    pub async fn health(&mut self) -> Result<HealthResponse> {
-        let futures: Vec<_> = self
-            .clients
-            .iter_mut()
-            .map(|client| client.health())
-            .collect();
-        join_all(futures).await.pop().unwrap()
-    }
-
-    /// Clear the past generations cache
-    #[instrument(skip(self))]
-    pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
-        let futures: Vec<_> = self
-            .clients
-            .iter_mut()
-            .map(|client| client.clear_cache(batch_id))
-            .collect();
-        join_all(futures).await.into_iter().collect()
-    }
-
-    /// Filter a cached batch
-    #[instrument(skip(self))]
-    pub async fn filter_batch(
-        &mut self,
-        batch_id: u64,
-        request_ids: Vec<u64>,
-    ) -> Result<Option<CachedBatch>> {
-        let futures: Vec<_> = self
-            .clients
-            .iter_mut()
-            .map(|client| Box::pin(client.filter_batch(batch_id, request_ids.clone())))
-            .collect();
-        // all shards return the same message
-        join_all(futures).await.pop().unwrap()
-    }
-
-    /// Warmup on a max size batch
-    ///
-    /// Returns the maximum number of tokens supported by the hardware
-    #[instrument(skip(self))]
-    pub async fn warmup(
-        &mut self,
-        max_input_length: u32,
-        max_prefill_tokens: u32,
-        max_total_tokens: u32,
-        max_batch_size: Option<usize>,
-    ) -> Result<Option<u32>> {
-        let futures: Vec<_> = self
-            .clients
-            .iter_mut()
-            .map(|client| {
-                Box::pin(client.warmup(
-                    max_input_length,
-                    max_prefill_tokens,
-                    max_total_tokens,
-                    max_batch_size,
-                ))
-            })
-            .collect();
-        // Take the minimum value
-        let results = join_all(futures)
-            .await
-            .into_iter()
-            .collect::<Result<Vec<Option<u32>>>>()?;
-        Ok(results.into_iter().flatten().min())
-    }
-
-    /// Generate one token for each request in the given batch
-    ///
-    /// Returns Generation for each request in batch
-    /// and the next cached batch
-    #[instrument(skip_all, fields(id = & batch.id, size = & batch.size))]
-    pub async fn prefill(
-        &mut self,
-        batch: Batch,
-    ) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
-        let futures: Vec<_> = self
-            .clients
-            .iter_mut()
-            .map(|client| Box::pin(client.prefill(batch.clone())))
-            .collect();
-        #[allow(clippy::type_complexity)]
-        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)>> =
-            join_all(futures).await.into_iter().collect();
-        let mut results = results?;
-
-        let (mut generations, next_batch, mut timings) =
-            results.pop().ok_or(ClientError::EmptyResults)?;
-
-        // Merge generations from different model shards
-        for (mut shard_generations, _, shard_timings) in results.into_iter() {
-            generations.append(&mut shard_generations);
-            // Return the timings of the slowest shard
-            if shard_timings.total > timings.total {
-                timings = shard_timings;
-            }
-        }
-        Ok((generations, next_batch, timings))
-    }
-
-    /// Generate one token for each request in the given cached batches
-    ///
-    /// Returns Generation for each request in batches
-    /// and the next cached batch
-    #[instrument(skip_all, fields(size = batches.iter().map(| batch | {batch.size}).sum::< u32 > ()))]
-    pub async fn decode(
-        &mut self,
-        batches: Vec<CachedBatch>,
-    ) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
-        let futures: Vec<_> = self
-            .clients
-            .iter_mut()
-            .map(|client| Box::pin(client.decode(batches.clone())))
-            .collect();
-        #[allow(clippy::type_complexity)]
-        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)>> =
-            join_all(futures).await.into_iter().collect();
-        let mut results = results?;
-
-        let (mut generations, next_batch, mut timings) =
-            results.pop().ok_or(ClientError::EmptyResults)?;
-
-        // Merge generations from different model shards
-        for (mut shard_generations, _, shard_timings) in results.into_iter() {
-            generations.append(&mut shard_generations);
-            // Return the timings of the slowest shard
-            if shard_timings.total > timings.total {
-                timings = shard_timings;
-            }
-        }
-        Ok((generations, next_batch, timings))
-    }
-}
--- a/router/src/health.rs
+++ b/router/src/health.rs
-use std::sync::atomic::{AtomicBool, Ordering};
-use std::sync::Arc;
-use text_generation_client::GrammarType as ProtoGrammarType;
-use text_generation_client::{
-    Batch, NextTokenChooserParameters, Request, ShardedClient, StoppingCriteriaParameters,
-};
-
-// Note: Request ids and batch ids cannot collide.
-const LIVENESS_ID: u64 = u64::MAX;
-const BATCH_ID: u64 = u64::MAX;
-
-#[derive(Clone, Debug)]
-pub(crate) struct Health {
-    client: ShardedClient,
-    generation_health: Arc<AtomicBool>,
-}
-
-impl Health {
-    pub(crate) fn new(client: ShardedClient, generation_health: Arc<AtomicBool>) -> Self {
-        Self {
-            client,
-            generation_health,
-        }
-    }
-
-    pub(crate) async fn check(&mut self) -> bool {
-        if self.generation_health.load(Ordering::SeqCst) {
-            // Generation is healthy, we only check that the shards are answering gRPC calls
-            self.client.health().await.is_ok()
-        } else {
-            // Generation is unhealthy, or no generation request has been sent yet
-
-            // Dummy batch of 1 token and 1 generated token
-            let liveness_request = Request {
-                id: LIVENESS_ID,
-                inputs: "liveness".to_string(),
-                truncate: 10,
-                prefill_logprobs: false,
-                parameters: Some(NextTokenChooserParameters {
-                    temperature: 1.0,
-                    top_k: 0,
-                    top_p: 1.0,
-                    typical_p: 1.0,
-                    do_sample: false,
-                    seed: 0,
-                    repetition_penalty: 1.0,
-                    frequency_penalty: 0.0,
-                    watermark: false,
-                    grammar: String::new(),
-                    grammar_type: ProtoGrammarType::None as i32,
-                }),
-                stopping_parameters: Some(StoppingCriteriaParameters {
-                    max_new_tokens: 1,
-                    stop_sequences: vec![],
-                    ignore_eos_token: false,
-                }),
-                top_n_tokens: 0,
-            };
-            let batch = Batch {
-                id: BATCH_ID,
-                requests: vec![liveness_request],
-                size: 1,
-                max_tokens: 2,
-            };
-            // Skips the queue
-            let value = self.client.prefill(batch).await.is_ok();
-            // Update generation health
-            self.generation_health.store(value, Ordering::SeqCst);
-            value
-        }
-    }
-}
--- a/router/src/infer.rs
+++ b/router/src/infer.rs
-/// Batching and inference logic
-use crate::validation::{Validation, ValidationError};
-use crate::{
-    ChatTemplateInputs, ChatTemplateVersions, Entry, GenerateRequest, GenerateStreamResponse,
-    HubTokenizerConfig, Message, PrefillToken, Queue, Token,
-};
-use crate::{FunctionRef, FunctionsMap, GrammarType, Properties, Tool, ToolType, Tools};
-use futures::future::try_join_all;
-use minijinja::{Environment, ErrorKind, Template};
-use nohash_hasher::IntMap;
-use serde_json::{json, Map, Value};
-use std::collections::HashMap;
-use std::sync::{
-    atomic::{AtomicBool, Ordering},
-    Arc,
-};
-use text_generation_client::{
-    Batch, CachedBatch, ClientError, GeneratedText, Generation, ShardedClient, Tokens,
-};
-use thiserror::Error;
-use tokio::sync::mpsc::error::SendError;
-use tokio::sync::{mpsc, Notify, Semaphore, TryAcquireError};
-use tokio::time::Instant;
-use tokio_stream::wrappers::UnboundedReceiverStream;
-use tokio_stream::StreamExt;
-use tracing::{info_span, instrument, Instrument, Span};
-
-/// Inference struct
-#[derive(Clone)]
-pub struct Infer {
-    /// Validation
-    validation: Validation,
-    /// Request queue
-    queue: Queue,
-    /// Shared state
-    shared: Arc<Shared>,
-    /// Chat template
-    chat_template: Option<ChatTemplate>,
-    /// Inference limit
-    limit_concurrent_requests: Arc<Semaphore>,
-}
-
-/// Infer shared state
-struct Shared {
-    /// Batching background Tokio task notifier
-    batching_task: Notify,
-}
-
-/// Raise an exception (custom function) used in the chat templates
-fn raise_exception(err_text: String) -> Result<String, minijinja::Error> {
-    Err(minijinja::Error::new(ErrorKind::SyntaxError, err_text))
-}
-
-impl Infer {
-    #[allow(clippy::too_many_arguments)]
-    pub(crate) fn new(
-        client: ShardedClient,
-        validation: Validation,
-        waiting_served_ratio: f32,
-        max_batch_prefill_tokens: u32,
-        max_batch_total_tokens: u32,
-        max_waiting_tokens: usize,
-        max_batch_size: Option<usize>,
-        max_concurrent_requests: usize,
-        requires_padding: bool,
-        window_size: Option<u32>,
-        speculate: u32,
-        generation_health: Arc<AtomicBool>,
-        tokenizer_config: HubTokenizerConfig,
-    ) -> Self {
-        // Infer shared state
-        let queue = Queue::new(requires_padding, 16, window_size, speculate);
-        let shared = Arc::new(Shared {
-            batching_task: Notify::new(),
-        });
-
-        // Spawn batching background task that contains all the inference logic
-        tokio::spawn(batching_task(
-            client,
-            waiting_served_ratio,
-            max_batch_prefill_tokens,
-            max_batch_total_tokens,
-            max_waiting_tokens,
-            max_batch_size,
-            queue.clone(),
-            shared.clone(),
-            generation_health,
-        ));
-
-        let chat_template = tokenizer_config
-            .chat_template
-            .and_then(|t| match t {
-                ChatTemplateVersions::Single(template) => Some(template),
-                ChatTemplateVersions::Multiple(templates) => templates
-                    .into_iter()
-                    .find(|t| t.name == "default")
-                    .map(|t| t.template),
-            })
-            .map(|t| {
-                // .strip() is not supported in minijinja
-                let t = t.replace(".strip()", " | trim");
-                ChatTemplate::new(t, tokenizer_config.bos_token, tokenizer_config.eos_token)
-            });
-
-        // Inference limit with a semaphore
-        let semaphore = Arc::new(Semaphore::new(max_concurrent_requests));
-
-        Self {
-            validation,
-            queue,
-            shared,
-            chat_template,
-            limit_concurrent_requests: semaphore,
-        }
-    }
-
-    /// Add a new request to the queue and return a stream of InferStreamResponse
-    #[instrument(skip_all)]
-    pub(crate) async fn generate_stream(
-        &self,
-        request: GenerateRequest,
-    ) -> Result<GenerateStreamResponse, InferError> {
-        // Limit concurrent requests by acquiring a permit from the semaphore
-        let permit = self
-            .clone()
-            .limit_concurrent_requests
-            .try_acquire_owned()
-            .map_err(|err| {
-                metrics::increment_counter!("tgi_request_failure", "err" => "overloaded");
-                tracing::error!("{err}");
-                err
-            })?;
-
-        // Validate request
-        let valid_request = self.validation.validate(request).await.map_err(|err| {
-            metrics::increment_counter!("tgi_request_failure", "err" => "validation");
-            tracing::error!("{err}");
-            err
-        })?;
-
-        // MPSC channel to communicate with the background batching task
-        let (response_tx, response_rx) = mpsc::unbounded_channel();
-        let input_length = valid_request.input_length;
-
-        // Append the request to the queue
-        self.queue.append(Entry {
-            request: valid_request,
-            response_tx,
-            span: Span::current(),
-            temp_span: None,
-            queue_time: Instant::now(),
-            batch_time: None,
-        });
-
-        // Notify the background task that we have a new entry in the queue that needs
-        // to be batched
-        self.shared.batching_task.notify_one();
-
-        // Return stream
-        Ok((
-            permit,
-            input_length,
-            UnboundedReceiverStream::new(response_rx),
-        ))
-    }
-
-    /// Tokenize the input
-    #[instrument(skip_all)]
-    pub(crate) async fn tokenize(
-        &self,
-        request: GenerateRequest,
-    ) -> Result<Option<tokenizers::Encoding>, InferError> {
-        // Tokenize request
-        let inputs = request.inputs;
-        let truncate = request.parameters.truncate;
-        let encoding = self
-            .validation
-            .tokenize(inputs, truncate)
-            .await
-            .map_err(|err| {
-                tracing::error!("Tokenization {err}");
-                err
-            })?;
-
-        // Return Encoding
-        Ok(encoding.map(|(encoding, _)| encoding))
-    }
-
-    /// Apply the chat template to the chat request
-    #[instrument(skip_all)]
-    pub(crate) fn apply_chat_template(
-        &self,
-        messages: Vec<Message>,
-        grammar_with_prompt: Option<(GrammarType, String)>,
-    ) -> Result<String, InferError> {
-        self.chat_template
-            .as_ref()
-            .ok_or_else(|| InferError::TemplateError(ErrorKind::TemplateNotFound.into()))?
-            .apply(messages, grammar_with_prompt)
-            .map_err(|e| {
-                metrics::increment_counter!("tgi_request_failure", "err" => "template");
-                tracing::error!("{e}");
-                e
-            })
-    }
-
-    /// Add a new request to the queue and return an InferResponse
-    #[instrument(skip_all)]
-    pub(crate) async fn generate(
-        &self,
-        request: GenerateRequest,
-    ) -> Result<InferResponse, InferError> {
-        let use_top_tokens = request.parameters.top_n_tokens.is_some_and(|x| x > 0);
-
-        // Create stream and keep semaphore permit as long as generate lives
-        let (_permit, _input_length, mut stream) = self.generate_stream(request).await?;
-
-        // Return values
-        let mut result_prefill = Vec::new();
-        let mut result_tokens = Vec::new();
-        let mut result_top_tokens = Vec::new();
-        let mut result_generated_text = None;
-        let mut result_start = None;
-        let mut result_queued = None;
-
-        // Iterate on stream
-        while let Some(response) = stream.next().await {
-            match response? {
-                // Add prefill tokens
-                InferStreamResponse::Prefill(tokens) => {
-                    // Create Token objects
-                    // We do that here instead of in the Python code as Rust for loops are faster
-                    result_prefill = tokens
-                        .ids
-                        .into_iter()
-                        .zip(tokens.logprobs.into_iter())
-                        .zip(tokens.texts.into_iter())
-                        .map(|((id, logprob), text)| PrefillToken { id, text, logprob })
-                        .collect();
-                }
-                // Push last token
-                InferStreamResponse::Intermediate { token, top_tokens } => {
-                    result_tokens.push(token);
-                    result_top_tokens.push(top_tokens);
-                }
-                // Final message
-                // Set return values
-                InferStreamResponse::End {
-                    token,
-                    generated_text,
-                    start,
-                    queued,
-                    top_tokens,
-                } => {
-                    result_tokens.push(token);
-                    result_top_tokens.push(top_tokens);
-                    result_generated_text = Some(generated_text);
-                    result_start = Some(start);
-                    result_queued = Some(queued)
-                }
-            }
-        }
-
-        // Check that we received an `InferStreamResponse::End` message
-        if let (Some(generated_text), Some(queued), Some(start)) =
-            (result_generated_text, result_queued, result_start)
-        {
-            Ok(InferResponse {
-                prefill: result_prefill,
-                _input_length,
-                tokens: result_tokens,
-                generated_text,
-                queued,
-                start,
-                top_tokens: if use_top_tokens {
-                    result_top_tokens
-                } else {
-                    Vec::new()
-                },
-            })
-        } else {
-            let err = InferError::IncompleteGeneration;
-            metrics::increment_counter!("tgi_request_failure", "err" => "incomplete");
-            tracing::error!("{err}");
-            Err(err)
-        }
-    }
-    /// Add best_of new requests to the queue and return the InferResponse of the sequence with
-    /// the highest log probability per token
-    #[instrument(skip(self, request))]
-    pub(crate) async fn generate_best_of(
-        &self,
-        request: GenerateRequest,
-        best_of: usize,
-    ) -> Result<(InferResponse, Vec<InferResponse>), InferError> {
-        // validate  best_of parameter separately
-        let best_of = self.validation.validate_best_of(best_of)?;
-
-        // create multiple generate requests
-        let mut infer_responses: Vec<InferResponse> =
-            try_join_all((0..best_of).map(|_| self.generate(request.clone()))).await?;
-
-        // get the sequence with the highest log probability per token
-        let mut max_index = 0;
-        let mut max_logprob: f32 = f32::MIN;
-
-        for (i, response) in infer_responses.iter().enumerate() {
-            // mean logprobs of the generated tokens
-            let sequence_logprob = response
-                .tokens
-                .iter()
-                .map(|token| token.logprob)
-                .sum::<f32>()
-                / response.tokens.len() as f32;
-
-            // set best sequence
-            if sequence_logprob > max_logprob {
-                max_index = i;
-                max_logprob = sequence_logprob;
-            }
-        }
-        let best_response = infer_responses.remove(max_index);
-        Ok((best_response, infer_responses))
-    }
-}
-
-/// A compiled chat template plus the values handed to it on every render.
-#[derive(Clone)]
-struct ChatTemplate {
-    /// Compiled template; `'static` because `ChatTemplate::new` leaks the environment
-    /// and the template source it is built from.
-    template: Template<'static, 'static>,
-    /// Passed to the template as `bos_token` when rendering.
-    bos_token: Option<String>,
-    /// Passed to the template as `eos_token` when rendering.
-    eos_token: Option<String>,
-    /// `true` when the template source (spaces removed) does not contain `{{tools}}`;
-    /// in that case `apply` appends the tool prompt to the last message itself.
-    use_default_tool_template: bool,
-}
-
-impl ChatTemplate {
-    /// Compile `template` and remember the special tokens to render it with.
-    ///
-    /// # Panics
-    ///
-    /// Panics if `template` cannot be compiled, because the compilation result is unwrapped.
-    fn new(template: String, bos_token: Option<String>, eos_token: Option<String>) -> Self {
-        // Templates that never mention `{{tools}}` (spaces ignored) rely on the
-        // default tool prompt that `apply` appends to the last message.
-        let use_default_tool_template = !template.replace(' ', "").contains("{{tools}}");
-
-        let mut environment = Environment::new();
-        environment.add_function("raise_exception", raise_exception);
-
-        // The environment and the source are leaked on purpose: they become read-only
-        // `'static` resources, which is what lets the compiled template be stored.
-        let leaked_environment = Box::leak(Box::new(environment));
-        let leaked_source = Box::leak(template.into_boxed_str());
-        let template = leaked_environment
-            .template_from_str(leaked_source)
-            .unwrap();
-
-        Self {
-            template,
-            bos_token,
-            eos_token,
-            use_default_tool_template,
-        }
-    }
-
-    /// Render `messages` into a prompt.
-    ///
-    /// When the default tool template is in use and `grammar_with_prompt` carries a
-    /// JSON grammar, the tool prompt and the tools are appended to the content of the
-    /// last message before rendering. Rendering failures are returned as
-    /// `InferError::TemplateError`.
-    fn apply(
-        &self,
-        mut messages: Vec<Message>,
-        grammar_with_prompt: Option<(GrammarType, String)>,
-    ) -> Result<String, InferError> {
-        if self.use_default_tool_template {
-            if let (Some(last_message), Some((GrammarType::Json(tools), tool_prompt))) =
-                (messages.last_mut(), grammar_with_prompt)
-            {
-                let augmented = format!(
-                    "{}\n---\n{}\n{}",
-                    last_message.content.as_deref().unwrap_or_default(),
-                    tool_prompt,
-                    tools
-                );
-                last_message.content = Some(augmented);
-            }
-        }
-
-        let inputs = ChatTemplateInputs {
-            messages,
-            bos_token: self.bos_token.as_deref(),
-            eos_token: self.eos_token.as_deref(),
-            add_generation_prompt: true,
-            tools: None,
-            tools_prompt: None,
-        };
-
-        self.template
-            .render(inputs)
-            .map_err(InferError::TemplateError)
-    }
-}
-
-/// Builds the function-calling grammar (`Tools`) for a request.
-pub struct ToolGrammar {}
-
-impl ToolGrammar {
-    /// Build the `Tools` grammar from the request's tools and tool choice.
-    ///
-    /// Returns `Ok(None)` when `tools` or `tool_choice` is `None`. Otherwise every
-    /// selected tool becomes a schema entry keyed by its name, with a constant `_name`
-    /// property added and marked as required, and a `notify_error` entry is appended
-    /// after them.
-    ///
-    /// # Errors
-    ///
-    /// Returns `InferError::ToolError` when the tool named by `tool_choice` is not in
-    /// `tools`, or when a tool's `properties` is not a JSON object or its `required`
-    /// is not a JSON array. These values come from the request, so they are reported
-    /// as errors instead of panicking.
-    pub fn apply(
-        tools: Option<Vec<Tool>>,
-        tool_choice: Option<ToolType>,
-    ) -> Result<Option<Tools>, InferError> {
-        let (req_tools, tool_choice) = match tools.zip(tool_choice) {
-            Some(pair) => pair,
-            None => return Ok(None),
-        };
-
-        let tools_to_use = match tool_choice {
-            ToolType::FunctionName(name) => {
-                let tool = req_tools
-                    .iter()
-                    .find(|tool| tool.function.name == *name)
-                    .ok_or_else(|| {
-                        InferError::ToolError(format!("Tool with name {} not found", name))
-                    })?;
-                vec![tool.clone()]
-            }
-            ToolType::OneOf => req_tools,
-        };
-
-        // adds the error notification function for LLM feedback if required
-        let mut text_response_properties = Map::new();
-        text_response_properties.insert(
-            "error".to_string(),
-            serde_json::json!({
-                "type": "string",
-                "description": "The error or issue to notify"
-            }),
-        );
-        text_response_properties.insert(
-            "_name".to_string(),
-            serde_json::json!({
-                "type": "string",
-                "const": "notify_error"
-            }),
-        );
-
-        let mut functions: HashMap<String, serde_json::Value> =
-            HashMap::with_capacity(tools_to_use.len() + 1);
-
-        for tool in &tools_to_use {
-            let func = tool.function.clone();
-
-            // Clone the existing parameters, which are expected to be a JSON object
-            let mut params = match &func.arguments {
-                Value::Object(params) => params.clone(),
-                _ => Map::new(),
-            };
-
-            // Insert the function's description at the top level, outside of properties
-            params.insert(
-                "description".to_string(),
-                Value::String(func.description.clone().unwrap_or_default()),
-            );
-
-            // Ensure 'properties' exists; a non-object value is a malformed request
-            let properties = params
-                .entry("properties".to_string())
-                .or_insert_with(|| json!({}))
-                .as_object_mut()
-                .ok_or_else(|| {
-                    InferError::ToolError(format!(
-                        "Tool {} has a `properties` field that is not an object",
-                        func.name
-                    ))
-                })?;
-
-            // Insert the constant for the function name inside 'properties'
-            properties.insert(
-                "_name".to_string(),
-                json!({
-                    "type": "string",
-                    "const": func.name.clone(),
-                }),
-            );
-
-            // Ensure 'required' exists; a non-array value is a malformed request
-            let required = params
-                .entry("required".to_string())
-                .or_insert_with(|| json!([]))
-                .as_array_mut()
-                .ok_or_else(|| {
-                    InferError::ToolError(format!(
-                        "Tool {} has a `required` field that is not an array",
-                        func.name
-                    ))
-                })?;
-
-            // Add '_name' to the 'required' array if it is not already present
-            if !required.iter().any(|r| r == "_name") {
-                required.push(json!("_name"));
-            }
-
-            functions.insert(func.name, Value::Object(params));
-        }
-
-        // Inserted last so that it takes precedence over a tool with the same name
-        functions.insert(
-            "notify_error".to_string(),
-            serde_json::json!({
-                "properties": text_response_properties,
-                "required": ["error", "_name"],
-                "type": "object"
-            }),
-        );
-
-        let function = tools_to_use
-            .iter()
-            .map(|tool| FunctionRef {
-                ref_path: format!("#/$functions/{}", tool.function.name),
-            })
-            .chain(std::iter::once(FunctionRef {
-                ref_path: "#/$functions/notify_error".to_string(),
-            }))
-            .collect();
-
-        Ok(Some(Tools {
-            functions_map: FunctionsMap { functions },
-            properties: Properties { function },
-        }))
-    }
-}
-
-/// Batching logic
-/// Will be launched in a background Tokio task
-///
-/// Batches requests and sends them to the inference server.
-///
-/// Each time `shared.batching_task` is notified, batches are pulled from `queue` and
-/// prefilled, then decoded until the inference server returns no cached batch. Between
-/// decode steps the task tries to onboard a new batch from the queue:
-///
-/// * `waiting_served_ratio` scales the current batch size into the minimum size a new
-///   batch must have to be onboarded;
-/// * once `max_waiting_tokens` decode steps have passed without onboarding anything,
-///   that minimum size is dropped;
-/// * `max_batch_prefill_tokens`, `max_batch_total_tokens` and `max_batch_size` are the
-///   budgets passed to `queue.next_batch`;
-/// * `generation_health` is handed to `prefill` and `decode`, which update it.
-#[allow(clippy::too_many_arguments)]
-async fn batching_task(
-    mut client: ShardedClient,
-    waiting_served_ratio: f32,
-    max_batch_prefill_tokens: u32,
-    max_batch_total_tokens: u32,
-    max_waiting_tokens: usize,
-    max_batch_size: Option<usize>,
-    queue: Queue,
-    shared: Arc<Shared>,
-    generation_health: Arc<AtomicBool>,
-) {
-    // Infinite loop
-    loop {
-        // Wait for a notification from the Infer struct
-        shared.batching_task.notified().await;
-
-        // Get the next batch from the queue
-        // This batch might be smaller than the maximum batch size if there are not enough requests
-        // waiting in the queue
-        while let Some((mut entries, batch, span)) = queue
-            .next_batch(
-                None,
-                max_batch_size,
-                max_batch_prefill_tokens,
-                max_batch_total_tokens,
-            )
-            .await
-        {
-            let mut cached_batch = prefill(&mut client, batch, &mut entries, &generation_health)
-                .instrument(span)
-                .await;
-            let mut waiting_tokens = 1;
-
-            // We loop until we do not receive any cached batch from the inference server (== until
-            // all requests have met their stopping criteria)
-            while let Some(batch) = cached_batch {
-                // Get current batch info
-                let batch_size = batch.size;
-                let batch_max_tokens = batch.max_tokens;
-                let mut batches = vec![batch];
-                metrics::gauge!("tgi_batch_current_size", batch_size as f64);
-                metrics::gauge!("tgi_batch_current_max_tokens", batch_max_tokens as f64);
-
-                let min_size = if waiting_tokens >= max_waiting_tokens {
-                    // If we didn't onboard any new requests since >= max_waiting_tokens, we try
-                    // to add a new batch even though its size might be small
-                    None
-                } else {
-                    // Minimum batch size
-                    Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
-                };
-
-                // Both remaining budgets saturate at zero instead of underflowing
-                let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);
-                let max_size =
-                    max_batch_size.map(|max_size| max_size.saturating_sub(batch_size as usize));
-
-                // Try to get a new batch
-                if let Some((mut new_entries, new_batch, span)) = queue
-                    .next_batch(min_size, max_size, max_batch_prefill_tokens, token_budget)
-                    .await
-                {
-                    // Tracking metrics
-                    if min_size.is_some() {
-                        metrics::increment_counter!("tgi_batch_concat", "reason" => "backpressure");
-                    } else {
-                        metrics::increment_counter!("tgi_batch_concat", "reason" => "wait_exceeded");
-                    }
-
-                    entries.iter_mut().for_each(|(_, entry)| {
-                        // Create a new span to add the info that this entry is waiting
-                        // because a new batch is being computed
-                        let entry_waiting_span = info_span!(parent: &entry.span, "waiting");
-                        // Add relationships
-                        span.follows_from(&entry_waiting_span);
-                        entry_waiting_span.follows_from(&span);
-                        // Update entry
-                        entry.temp_span = Some(entry_waiting_span);
-                    });
-
-                    // Generate one token for this new batch to have the attention past in cache
-                    let new_cached_batch =
-                        prefill(&mut client, new_batch, &mut new_entries, &generation_health)
-                            .instrument(span)
-                            .await;
-                    // Reset waiting counter
-                    waiting_tokens = 1;
-                    // Extend current batch with the new batch
-                    if let Some(new_cached_batch) = new_cached_batch {
-                        entries.extend(new_entries);
-                        batches.push(new_cached_batch);
-                    }
-                }
-
-                // Create span for this batch to add context to inference calls
-                let next_batch_size = entries.len();
-                let next_batch_span =
-                    info_span!(parent: None, "batch", batch_size = next_batch_size);
-                entries.iter_mut().for_each(|(_, entry)| {
-                    // Create a new span to link the batch back to this entry
-                    let entry_batch_span = info_span!(parent: &entry.span, "infer");
-                    // Add relationships
-                    next_batch_span.follows_from(&entry_batch_span);
-                    entry_batch_span.follows_from(&next_batch_span);
-                    // Update entry
-                    entry.temp_span = Some(entry_batch_span);
-                });
-
-                cached_batch = decode(&mut client, batches, &mut entries, &generation_health)
-                    .instrument(next_batch_span)
-                    .await;
-                waiting_tokens += 1;
-            }
-            // Nothing is running anymore: reset the gauges
-            metrics::gauge!("tgi_batch_current_size", 0.0);
-            metrics::gauge!("tgi_batch_current_max_tokens", 0.0);
-        }
-    }
-}
-
-/// Run the prefill step for `batch` on the inference server.
-///
-/// On success the generations are sent to their entries, stopped entries are removed
-/// and the filtered next batch (if any) is returned. On failure the batch is cleared
-/// from the server cache, every entry receives the error and `None` is returned.
-/// `generation_health` is set to `true` on success and `false` on failure.
-#[instrument(skip_all)]
-async fn prefill(
-    client: &mut ShardedClient,
-    batch: Batch,
-    entries: &mut IntMap<u64, Entry>,
-    generation_health: &Arc<AtomicBool>,
-) -> Option<CachedBatch> {
-    let started = Instant::now();
-    let batch_id = batch.id;
-    metrics::increment_counter!("tgi_batch_inference_count", "method" => "prefill");
-
-    let (generations, next_batch, timings) = match client.prefill(batch).await {
-        Ok(output) => output,
-        // If we have an error, we discard the whole batch
-        Err(err) => {
-            generation_health.store(false, Ordering::SeqCst);
-            // The outcome of clearing the cache is deliberately ignored
-            let _ = client.clear_cache(Some(batch_id)).await;
-            send_errors(err, entries);
-            metrics::increment_counter!("tgi_batch_inference_failure", "method" => "prefill");
-            return None;
-        }
-    };
-
-    generation_health.store(true, Ordering::SeqCst);
-
-    let filter_started = Instant::now();
-    // Forward the generated tokens and drop the entries that are done
-    filter_send_generations(generations, entries);
-    // Drop the requests that were stopped from the next batch as well
-    let next_batch = filter_batch(client, next_batch, entries).await;
-
-    metrics::histogram!("tgi_batch_forward_duration", timings.forward.as_secs_f64(), "method" => "prefill");
-    metrics::histogram!("tgi_batch_decode_duration", timings.decode.as_secs_f64(), "method" => "prefill");
-    metrics::histogram!("tgi_batch_filter_duration", filter_started.elapsed().as_secs_f64(), "method" => "prefill");
-    metrics::histogram!("tgi_batch_inference_duration", started.elapsed().as_secs_f64(), "method" => "prefill");
-    metrics::increment_counter!("tgi_batch_inference_success", "method" => "prefill");
-    next_batch
-}
-
-/// Run one decode step over `batches` on the inference server.
-///
-/// On success the generations are sent to their entries, stopped entries are removed
-/// and the filtered next batch (if any) is returned. On failure every batch is cleared
-/// from the server cache, every entry receives the error and `None` is returned.
-/// `generation_health` is set to `true` on success and `false` on failure.
-#[instrument(skip_all)]
-async fn decode(
-    client: &mut ShardedClient,
-    batches: Vec<CachedBatch>,
-    entries: &mut IntMap<u64, Entry>,
-    generation_health: &Arc<AtomicBool>,
-) -> Option<CachedBatch> {
-    let started = Instant::now();
-    // Remember the ids now: `batches` is moved into the decode call
-    let batch_ids: Vec<u64> = batches.iter().map(|cached| cached.id).collect();
-    metrics::increment_counter!("tgi_batch_inference_count", "method" => "decode");
-
-    let (generations, next_batch, timings) = match client.decode(batches).await {
-        Ok(output) => output,
-        // If we have an error, we discard the whole batch
-        Err(err) => {
-            generation_health.store(false, Ordering::SeqCst);
-            for batch_id in batch_ids {
-                // The outcome of clearing the cache is deliberately ignored
-                let _ = client.clear_cache(Some(batch_id)).await;
-            }
-            send_errors(err, entries);
-            metrics::increment_counter!("tgi_batch_inference_failure", "method" => "decode");
-            return None;
-        }
-    };
-
-    generation_health.store(true, Ordering::SeqCst);
-
-    let filter_started = Instant::now();
-    // Forward the generated tokens and drop the entries that are done
-    filter_send_generations(generations, entries);
-    // Drop the requests that were stopped from the next batch as well
-    let next_batch = filter_batch(client, next_batch, entries).await;
-
-    if let Some(concat_duration) = timings.concat {
-        metrics::histogram!("tgi_batch_concat_duration", concat_duration.as_secs_f64(), "method" => "decode");
-    }
-    metrics::histogram!("tgi_batch_forward_duration", timings.forward.as_secs_f64(), "method" => "decode");
-    metrics::histogram!("tgi_batch_decode_duration", timings.decode.as_secs_f64(), "method" => "decode");
-    metrics::histogram!("tgi_batch_filter_duration", filter_started.elapsed().as_secs_f64(), "method" => "decode");
-    metrics::histogram!("tgi_batch_inference_duration", started.elapsed().as_secs_f64(), "method" => "decode");
-    metrics::increment_counter!("tgi_batch_inference_success", "method" => "decode");
-    next_batch
-}
-
-/// Remove from `next_batch` every request that is no longer present in `entries`.
-///
-/// Returns `None` when there is no next batch or when no request is left in it;
-/// in the latter case the batch is also cleared from the shard cache.
-///
-/// # Panics
-///
-/// Panics if clearing or filtering the shard cache fails, as there is no way to
-/// recover from that.
-#[instrument(skip_all)]
-async fn filter_batch(
-    client: &mut ShardedClient,
-    next_batch: Option<CachedBatch>,
-    entries: &IntMap<u64, Entry>,
-) -> Option<CachedBatch> {
-    let mut batch = match next_batch {
-        Some(batch) => batch,
-        None => return None,
-    };
-
-    // Same number of requests on both sides: nothing to filter
-    if entries.len() == batch.size as usize {
-        return Some(batch);
-    }
-
-    let batch_id = batch.id;
-
-    // Keep only the requests that still have an entry
-    batch
-        .request_ids
-        .retain(|request_id| entries.contains_key(request_id));
-
-    if !batch.request_ids.is_empty() {
-        // Filter Python shard cache
-        // We unwrap here as we need to panic since we cannot recover if this method fails
-        return client
-            .filter_batch(batch_id, batch.request_ids)
-            .await
-            .unwrap();
-    }
-
-    // All requests have been filtered out, so the next batch is empty:
-    // clear it from the Python shards cache
-    // We unwrap here as we need to panic since we cannot recover if this method fails
-    client.clear_cache(Some(batch_id)).await.unwrap();
-    None
-}
-
-/// Forward every generation to the entry it belongs to, then drop finished entries.
-///
-/// An entry is removed from `entries` when `send_responses` reports that it stopped,
-/// or when sending fails.
-///
-/// # Panics
-///
-/// Panics if a generation refers to an id that is not in `entries`, or if the entry
-/// has no `temp_span`.
-#[instrument(skip_all)]
-fn filter_send_generations(generations: Vec<Generation>, entries: &mut IntMap<u64, Entry>) {
-    for generation in generations {
-        let id = generation.request_id;
-        // We can `expect` here as the request id should always be in the entries
-        let entry = entries
-            .get(&id)
-            .expect("ID not found in entries. This is a bug.");
-
-        // Create and enter a span to link this function back to the entry
-        let _span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_generation", generation = ?generation).entered();
-
-        // Send generation responses back to the infer task.
-        // An error from the response channel means that the client dropped the
-        // request, so generation must stop for this entry: treat it as stopped.
-        let stopped = match send_responses(generation, entry) {
-            Ok(stopped) => stopped,
-            Err(_) => {
-                tracing::error!("Entry response channel error.");
-                metrics::increment_counter!("tgi_request_failure", "err" => "dropped");
-                true
-            }
-        };
-
-        if stopped {
-            entries.remove(&id).expect("ID not found in entries. This is a bug.");
-        }
-    }
-}
-
-/// Send responses through the `entry` response channel
-///
-/// Sends an optional `Prefill` message, then one message per generated token. The
-/// last token is sent as `End` when the generation carries a generated text,
-/// otherwise every token is sent as `Intermediate`.
-///
-/// Returns `Ok(true)` when the entry is finished: either the channel is already
-/// closed or the `End` message was sent. Returns `Ok(false)` otherwise, and an error
-/// when sending on the channel fails.
-///
-/// # Panics
-///
-/// Panics if `generation.tokens` is `None`, or if the `End` message has to be built
-/// while `entry.batch_time` is `None`.
-fn send_responses(
-    generation: Generation,
-    entry: &Entry,
-) -> Result<bool, Box<SendError<Result<InferStreamResponse, InferError>>>> {
-    // Return directly if the channel is disconnected
-    if entry.response_tx.is_closed() {
-        metrics::increment_counter!("tgi_request_failure", "err" => "dropped");
-        return Ok(true);
-    }
-
-    let mut stopped = false;
-
-    if let Some(prefill_tokens) = generation.prefill_tokens {
-        // Send message
-        entry
-            .response_tx
-            .send(Ok(InferStreamResponse::Prefill(prefill_tokens)))?;
-    }
-
-    // Create last Token
-    let tokens_ = generation.tokens.expect("Non empty tokens in generation");
-    let n = tokens_.ids.len();
-    // Saturate so that an empty id list records 0 instead of underflowing
-    metrics::histogram!("tgi_request_skipped_tokens", n.saturating_sub(1) as f64);
-    let mut iterator = tokens_
-        .ids
-        .into_iter()
-        .zip(tokens_.logprobs)
-        .zip(tokens_.texts)
-        .zip(tokens_.is_special)
-        .enumerate()
-        .peekable();
-    // `while let` rather than `for`: the body peeks at the iterator to detect the last token
-    while let Some((i, (((id, logprob), text), special))) = iterator.next() {
-        let token = Token {
-            id,
-            text,
-            logprob,
-            special,
-        };
-        // Top tokens for this position, or nothing when none were returned for it
-        let top_tokens = if let Some(top_tokens_) = generation.top_tokens.get(i) {
-            top_tokens_
-                .ids
-                .iter()
-                .zip(top_tokens_.logprobs.iter())
-                .zip(top_tokens_.texts.iter())
-                .zip(top_tokens_.is_special.iter())
-                .map(|(((&id, &logprob), text), &special)| Token {
-                    id,
-                    text: text.to_string(),
-                    logprob,
-                    special,
-                })
-                .collect()
-        } else {
-            vec![]
-        };
-        match (&generation.generated_text, iterator.peek()) {
-            (Some(generated_text), None) => {
-                // Generation has ended
-                stopped = true;
-                // Send message
-                entry.response_tx.send(Ok(InferStreamResponse::End {
-                    token,
-                    top_tokens,
-                    generated_text: generated_text.clone(),
-                    queued: entry.queue_time,
-                    start: entry.batch_time.unwrap(),
-                }))?;
-            }
-            _ => {
-                // Send message
-                entry
-                    .response_tx
-                    .send(Ok(InferStreamResponse::Intermediate { token, top_tokens }))?;
-            }
-        }
-    }
-
-    Ok(stopped)
-}
-
-/// Send errors to Infer for all `entries`
-#[instrument(skip_all)]
-fn send_errors(error: ClientError, entries: &mut IntMap<u64, Entry>) {
-    entries.drain().for_each(|(_, entry)| {
-        // Create and enter a span to link this function back to the entry
-        let _send_error_span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_error").entered();
-        let err = InferError::GenerationError(error.to_string());
-        metrics::increment_counter!("tgi_request_failure", "err" => "generation");
-        tracing::error!("{err}");
-
-        // unwrap_or is valid here as we don't care if the receiver is gone.
-        entry
-            .response_tx
-            .send(Err(err))
-            .unwrap_or(());
-    });
-}
-
-/// Messages sent through an entry's response channel while a request is generated.
-#[derive(Debug)]
-pub(crate) enum InferStreamResponse {
-    // Optional first message, carrying the prefill tokens
-    Prefill(Tokens),
-    // Intermediate messages: one generated token and its top tokens
-    Intermediate {
-        token: Token,
-        top_tokens: Vec<Token>,
-    },
-    // Last message: the final token plus the generated text and the timing
-    // information of the request (`queued` and `start` instants)
-    End {
-        token: Token,
-        top_tokens: Vec<Token>,
-        generated_text: GeneratedText,
-        start: Instant,
-        queued: Instant,
-    },
-}
-
-/// Complete result of a generation, assembled from the streamed messages.
-#[derive(Debug)]
-pub(crate) struct InferResponse {
-    /// input_length is the input as perceived by the rust tokenizer in the
-    /// validation pathway. It is redundant with prefill.len() but prefill
-    /// has data only if the user asked for it. This will always be filled.
-    pub(crate) _input_length: u32,
-    /// Prefill tokens; per the note above, only populated when the user asked for them.
-    pub(crate) prefill: Vec<PrefillToken>,
-    /// Every generated token, in the order the messages were received.
-    pub(crate) tokens: Vec<Token>,
-    /// Generated text taken from the final `End` message.
-    pub(crate) generated_text: GeneratedText,
-    /// `queued` instant taken from the final `End` message.
-    pub(crate) queued: Instant,
-    /// `start` instant taken from the final `End` message.
-    pub(crate) start: Instant,
-    /// Top tokens collected per generated token; empty when top tokens are not in use.
-    pub(crate) top_tokens: Vec<Vec<Token>>,
-}
-
-/// Errors returned by the inference path.
-#[derive(Debug, Error)]
-pub enum InferError {
-    // The inference server call failed; carries the text of the client error
-    #[error("Request failed during generation: {0}")]
-    GenerationError(String),
-    // Converted from a `TryAcquireError`
-    #[error("Model is overloaded")]
-    Overloaded(#[from] TryAcquireError),
-    // Converted from a `ValidationError`
-    #[error("Input validation error: {0}")]
-    ValidationError(#[from] ValidationError),
-    // The response stream finished without a final `End` message
-    #[error("Incomplete generation")]
-    IncompleteGeneration,
-    // Converted from a `minijinja::Error`, e.g. when rendering the chat template fails
-    #[error("Template error: {0}")]
-    TemplateError(#[from] minijinja::Error),
-    // Tool-related failure described by the message
-    #[error("Tool error: {0}")]
-    ToolError(String),
-}
-
-impl InferError {
-    /// Short, stable identifier for the error variant.
-    pub(crate) fn error_type(&self) -> &str {
-        match self {
-            Self::GenerationError(_) => "generation",
-            Self::Overloaded(_) => "overloaded",
-            Self::ValidationError(_) => "validation",
-            Self::IncompleteGeneration => "incomplete_generation",
-            Self::TemplateError(_) => "template_error",
-            Self::ToolError(_) => "tool_error",
-        }
-    }
-}
-
-// tests
-#[cfg(test)]
-mod tests {
-    use crate::infer::raise_exception;
-    use crate::ChatTemplateInputs;
-    use crate::Message;
-    use minijinja::Environment;
-
-    /// Renders a role-prefixed template over a four-message conversation and checks
-    /// the exact output, including the trailing generation prompt.
-    #[test]
-    fn test_chat_template() {
-        let env = Environment::new();
-
-        let source = r#"
-        {% for message in messages %}
-            {% if message['role'] == 'system' %}
-                {% if message['content']%}
-                    {{'### System:\n' + message['content']+'\n\n'}}
-                {% endif %}
-            {% elif message['role'] == 'user' %}
-                {{'### User:\n' + message['content']+'\n\n'}}
-            {% elif message['role'] == 'assistant' %}
-                {{'### Assistant:\n'  + message['content']}}
-            {% endif %}
-            {% if loop.last and add_generation_prompt %}
-                {{ '### Assistant:\n' }}
-            {% endif %}
-        {% endfor %}"#;
-
-        // Trim every line and glue them together so that the layout above does not
-        // leak into the rendered output
-        let source: String = source.lines().map(str::trim).collect();
-        let tmpl = env.template_from_str(&source).unwrap();
-
-        let message = |role: &str, content: &str| Message {
-            role: role.to_string(),
-            content: Some(content.to_string()),
-            name: None,
-            tool_calls: None,
-        };
-
-        let chat_template_inputs = ChatTemplateInputs {
-            messages: vec![
-                message("user", "Hi!"),
-                message("assistant", "Hello how can I help?"),
-                message("user", "What is Deep Learning?"),
-                message("assistant", "magic!"),
-            ],
-            bos_token: Some("[BOS]"),
-            eos_token: Some("[EOS]"),
-            add_generation_prompt: true,
-            ..Default::default()
-        };
-
-        let result = tmpl.render(chat_template_inputs).unwrap();
-
-        assert_eq!(
-            result,
-            "### User:\nHi!\n\n### Assistant:\nHello how can I help?### User:\nWhat is Deep Learning?\n\n### Assistant:\nmagic!### Assistant:\n"
-        );
-    }
-
-    /// Two consecutive user messages must make the template call `raise_exception`,
-    /// and the message passed to it must be exposed as the error detail.
-    #[test]
-    fn test_chat_template_invalid_with_raise() {
-        let mut env = Environment::new();
-        env.add_function("raise_exception", raise_exception);
-
-        let source = r#"
-        {{ bos_token }}
-        {% for message in messages %}
-        {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
-        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
-        {% endif %}
-        {% if message['role'] == 'user' %}
-        {{ '[INST] ' + message['content'] + ' [/INST]' }}
-        {% elif message['role'] == 'assistant' %}
-        {{ message['content'] + eos_token}}
-        {% else %}
-        {{ raise_exception('Only user and assistant roles are supported!') }}
-        {% endif %}
-        {% endfor %}"#;
-
-        // Trim every line and glue them together
-        let source: String = source.lines().map(str::trim).collect();
-        let tmpl = env.template_from_str(&source).unwrap();
-
-        let message = |role: &str, content: &str| Message {
-            role: role.to_string(),
-            content: Some(content.to_string()),
-            name: None,
-            tool_calls: None,
-        };
-
-        let chat_template_inputs = ChatTemplateInputs {
-            messages: vec![
-                message("user", "Hi!"),
-                message("user", "Hi again!"),
-                message("assistant", "Hello how can I help?"),
-                message("user", "What is Deep Learning?"),
-                message("assistant", "magic!"),
-            ],
-            bos_token: Some("[BOS]"),
-            eos_token: Some("[EOS]"),
-            add_generation_prompt: true,
-            ..Default::default()
-        };
-
-        let error = match tmpl.render(chat_template_inputs) {
-            Ok(_) => panic!("Should have failed"),
-            Err(error) => error,
-        };
-
-        assert_eq!(
-            error.detail().unwrap(),
-            "Conversation roles must alternate user/assistant/user/assistant/..."
-        );
-    }
-
-    /// A properly alternating conversation renders without triggering `raise_exception`.
-    #[test]
-    fn test_chat_template_valid_with_raise() {
-        let mut env = Environment::new();
-        env.add_function("raise_exception", raise_exception);
-
-        let source = r#"
-        {{ bos_token }}
-        {% for message in messages %}
-        {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
-        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
-        {% endif %}
-        {% if message['role'] == 'user' %}
-        {{ '[INST] ' + message['content'] + ' [/INST]' }}
-        {% elif message['role'] == 'assistant' %}
-        {{ message['content'] + eos_token}}
-        {% else %}
-        {{ raise_exception('Only user and assistant roles are supported!') }}
-        {% endif %}
-        {% endfor %}"#;
-
-        // Trim every line and glue them together
-        let source: String = source.lines().map(str::trim).collect();
-        let tmpl = env.template_from_str(&source).unwrap();
-
-        let message = |role: &str, content: &str| Message {
-            role: role.to_string(),
-            content: Some(content.to_string()),
-            name: None,
-            tool_calls: None,
-        };
-
-        let chat_template_inputs = ChatTemplateInputs {
-            messages: vec![
-                message("user", "Hi!"),
-                message("assistant", "Hello how can I help?"),
-                message("user", "What is Deep Learning?"),
-                message("assistant", "magic!"),
-            ],
-            bos_token: Some("[BOS]"),
-            eos_token: Some("[EOS]"),
-            add_generation_prompt: true,
-            ..Default::default()
-        };
-
-        let result = tmpl.render(chat_template_inputs).unwrap();
-        assert_eq!(result, "[BOS][INST] Hi! [/INST]Hello how can I help?[EOS][INST] What is Deep Learning? [/INST]magic![EOS]");
-    }
-
-    /// With `add_generation_prompt` set, the template must append the opening of an
-    /// assistant turn after the rendered conversation.
-    #[test]
-    fn test_chat_template_valid_with_add_generation_prompt() {
-        let raw_template = r#"
-        {% for message in messages %}
-        {{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}
-        {% endfor %}
-        {% if add_generation_prompt %}
-            {{ '<|im_start|>assistant\n' }}
-        {% endif %}"#;
-
-        // Drop the layout whitespace: each line is trimmed and the pieces are concatenated.
-        let template_source: String = raw_template.lines().map(str::trim).collect();
-
-        let mut jinja = Environment::new();
-        jinja.add_function("raise_exception", raise_exception);
-
-        // (role, content) pairs of the conversation, in order.
-        let turns = [
-            ("user", "Hi!"),
-            ("assistant", "Hello how can I help?"),
-            ("user", "What is Deep Learning?"),
-            ("assistant", "magic!"),
-        ];
-        let conversation: Vec<Message> = turns
-            .iter()
-            .map(|&(role, content)| Message {
-                role: role.to_string(),
-                content: Some(content.to_string()),
-                name: None,
-                tool_calls: None,
-            })
-            .collect();
-
-        let inputs = ChatTemplateInputs {
-            messages: conversation,
-            bos_token: Some("[BOS]"),
-            eos_token: Some("[EOS]"),
-            add_generation_prompt: true,
-            ..Default::default()
-        };
-
-        let compiled = jinja.template_from_str(&template_source).unwrap();
-        let rendered = compiled.render(inputs).unwrap();
-        assert_eq!(rendered, "<|im_start|>user\nHi!<|im_end|>\n<|im_start|>assistant\nHello how can I help?<|im_end|>\n<|im_start|>user\nWhat is Deep Learning?<|im_end|>\n<|im_start|>assistant\nmagic!<|im_end|>\n<|im_start|>assistant\n");
-    }
-
-    /// One table-driven case for `test_many_chat_templates`: a template, the inputs to
-    /// render it with, and the exact output the rendering must produce.
-    struct ChatTemplateTestItem {
-        /// Label identifying the case; the test loop destructures it but does not use it.
-        name: &'static str,
-        /// Template source that the test compiles with `Environment::template_from_str`.
-        chat_template: &'static str,
-        /// Values passed to `render` for this template.
-        input: ChatTemplateInputs<'static>,
-        /// Exact string the rendered output is compared against with `assert_eq!`.
-        target: &'static str,
-    }
-
-    #[test]
-    fn test_many_chat_templates() {
-        let example_chat = vec![
-            Message {
-                role: "user".to_string(),
-                content: Some("Hello, how are you?".to_string()),
-                name: None,
-                tool_calls: None,
-            },
-            Message {
-                role: "assistant".to_string(),
-                content: Some("I'm doing great. How can I help you today?".to_string()),
-                name: None,
-                tool_calls: None,
-            },
-            Message {
-                role: "user".to_string(),
-                content: Some("I'd like to show off how chat templating works!".to_string()),
-                name: None,
-                tool_calls: None,
-            },
-        ];
-
-        let example_chat_with_system = vec![Message {
-            role: "system".to_string(),
-            content: Some(
-                "You are a friendly chatbot who always responds in the style of a pirate"
-                    .to_string(),
-            ),
-            name: None,
-            tool_calls: None,
-        }]
-        .iter()
-        .chain(&example_chat)
-        .cloned()
-        .collect::<Vec<_>>();
-
-        let test_default_templates = vec![
-            ChatTemplateTestItem {
-                name: "_base",
-                chat_template: "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
-                input: ChatTemplateInputs {
-                    messages: example_chat.clone(),
-                    add_generation_prompt: false,
-                    bos_token: Some(""),
-                    eos_token: Some(""),
-                    ..Default::default()
-                },
-                target: "<|im_start|>user\nHello, how are you?<|im_end|>\n<|im_start|>assistant\nI'm doing great. How can I help you today?<|im_end|>\n<|im_start|>user\nI'd like to show off how chat templating works!<|im_end|>\n",
-            },
-            ChatTemplateTestItem {
-                name: "blenderbot",
-                chat_template: "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ '  ' }}{% endif %}{% endfor %}{{ eos_token }}",
-                input: ChatTemplateInputs {
-                    messages: example_chat.clone(),
-                    add_generation_prompt: false,
-                    bos_token: Some(""),
-                    eos_token: Some("</s>"),
-                    ..Default::default()
-                },
-                target: " Hello, how are you?  I'm doing great. How can I help you today?   I'd like to show off how chat templating works!</s>",
-            },
-            ChatTemplateTestItem {
-                name: "blenderbot_small",
-                chat_template: "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ '  ' }}{% endif %}{% endfor %}{{ eos_token }}",
-                input: ChatTemplateInputs {
-                    messages: example_chat.clone(),
-                    add_generation_prompt: false,
-                    bos_token: Some(""),
-                    eos_token: Some("</s>"),
-                    ..Default::default()
-                },
-                target: " Hello, how are you?  I'm doing great. How can I help you today?   I'd like to show off how chat templating works!</s>",
-            },
-            ChatTemplateTestItem {
-                name: "bloom",
-                chat_template: "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}",
-                input: ChatTemplateInputs {
-                    messages: example_chat.clone(),
-                    add_generation_prompt: false,
-                    bos_token: Some(""),
-                    eos_token: Some("</s>"),
-                    ..Default::default()
-                },
-                target: "Hello, how are you?</s>I'm doing great. How can I help you today?</s>I'd like to show off how chat templating works!</s>",
-            },
-            ChatTemplateTestItem {
-                name: "gpt_neox",
-                chat_template: "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}",
-                input: ChatTemplateInputs {
-                    messages: example_chat.clone(),
-                    add_generation_prompt: false,
-                    bos_token: Some(""),
-                    eos_token: Some("<|endoftext|>"),
-                    ..Default::default()
-                },
-                target: "Hello, how are you?<|endoftext|>I'm doing great. How can I help you today?<|endoftext|>I'd like to show off how chat templating works!<|endoftext|>",
-            },
-            ChatTemplateTestItem {
-                name: "gpt2",
-                chat_template: "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}",
-                input: ChatTemplateInputs {
-                    messages: example_chat.clone(),
-                    add_generation_prompt: false,
-                    bos_token: Some(""),
-                    eos_token: Some("<|endoftext|>"),
-                    ..Default::default()
-                },
-                target: "Hello, how are you?<|endoftext|>I'm doing great. How can I help you today?<|endoftext|>I'd like to show off how chat templating works!<|endoftext|>",
-            },
-            ChatTemplateTestItem {
-                name: "llama",
-                // NOTE: the `.strip()` has been replaced with `| trim` in the following template
-                chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token +'[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\n' + content | trim + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
-                input: ChatTemplateInputs {
-                    messages: example_chat_with_system.clone(),
-                    add_generation_prompt: true,
-                    bos_token: Some("<s>"),
-                    eos_token: Some("</s>"),
-                    ..Default::default()
-                },
-                target: "<s>[INST] <<SYS>>\nYou are a friendly chatbot who always responds in the style of a pirate\n<</SYS>>\n\nHello, how are you? [/INST] I'm doing great. How can I help you today? </s><s>[INST] I'd like to show off how chat templating works! [/INST]",
-            },
-            ChatTemplateTestItem {
-                name: "whisper",
-                chat_template: "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}",
-                input: ChatTemplateInputs {
-                    messages: example_chat.clone(),
-                    add_generation_prompt: true,
-                    bos_token: Some(""),
-                    eos_token: Some("<|endoftext|>"),
-                    ..Default::default()
-                },
-                target: "Hello, how are you?<|endoftext|>I'm doing great. How can I help you today?<|endoftext|>I'd like to show off how chat templating works!<|endoftext|>",
-            },
-        ];
-
-        #[allow(unused_variables)] // name is unused
-        for ChatTemplateTestItem {
-            name,
-            chat_template,
-            input,
-            target,
-        } in test_default_templates
-        {
-            let mut env = Environment::new();
-            env.add_function("raise_exception", raise_exception);
-            let tmpl = env.template_from_str(&chat_template);
-            let result = tmpl.unwrap().render(input).unwrap();
-            assert_eq!(result, target);
-        }
-
-        let test_custom_templates = vec![
-            ChatTemplateTestItem {
-                name: "HuggingFaceH4/zephyr-7b-beta (add_generation_prompt=false)",
-                chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
-                input: ChatTemplateInputs {
-                    messages: example_chat_with_system.clone(),
-                    add_generation_prompt: false,
-                    bos_token: Some(""),
-                    eos_token: Some("</s>"),
-                    ..Default::default()
-                },
-                target: "<|system|>\nYou are a friendly chatbot who always responds in the style of a pirate</s><|user|>\nHello, how are you?</s><|assistant|>\nI'm doing great. How can I help you today?</s><|user|>\nI'd like to show off how chat templating works!</s>",
-            },
-            ChatTemplateTestItem {
-                name: "HuggingFaceH4/zephyr-7b-beta (add_generation_prompt=true)",
-                chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
-                input: ChatTemplateInputs {
-                    messages: vec![
-                        Message {
-                            role: "system".to_string(),
-                            content: Some("You are a friendly chatbot who always responds in the style of a pirate".to_string()),
-                            name: None,
-                            tool_calls: None,
-                        },
-                        Message {
-                            role: "user".to_string(),
-                            content: Some("How many helicopters can a human eat in one sitting?".to_string()),
-                            name: None,
-                            tool_calls: None,
-                        },
-                    ],
-                    add_generation_prompt: true,
-                    bos_token: Some(""),
-                    eos_token: Some("</s>"),
-                    ..Default::default()
-                },
-                target: "<|system|>\nYou are a friendly chatbot who always responds in the style of a pirate</s><|user|>\nHow many helicopters can a human eat in one sitting?</s><|assistant|>",
-            },
-            ChatTemplateTestItem {
-                name: "HuggingFaceH4/zephyr-7b-gemma-v0.1",
-                chat_template: "{% if messages[0]['role'] == 'user' or messages[0]['role'] == 'system' %}{{ bos_token }}{% endif %}{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% elif messages[-1]['role'] == 'assistant' %}{{ eos_token }}{% endif %}",
-                input: ChatTemplateInputs {
-                    messages: example_chat.clone(),
-                    add_generation_prompt: false,
-                    bos_token: Some("<bos>"),
-                    eos_token: Some("<eos>"),
-                    ..Default::default()
-                },
-                target: "<bos><|im_start|>user\nHello, how are you?<|im_end|>\n<|im_start|>assistant\nI'm doing great. How can I help you today?<|im_end|>\n<|im_start|>user\nI'd like to show off how chat templating works!<|im_end|>\n",
-            },
-            ChatTemplateTestItem {
-                name: "mistralai/Mistral-7B-Instruct-v0.1",
-                chat_template: "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
-                input: ChatTemplateInputs {
-                    messages: example_chat.clone(),
-                    add_generation_prompt: false,
-                    bos_token: Some("<s>"),
-                    eos_token: Some("</s>"),
-                    ..Default::default()
-                },
-                target: "<s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s> [INST] I'd like to show off how chat templating works! [/INST]",
-            },
-            ChatTemplateTestItem {
-                name: "mistralai/Mixtral-8x7B-Instruct-v0.1",
-                chat_template: "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
-                input: ChatTemplateInputs {
-                    messages: example_chat.clone(),
-                    add_generation_prompt: false,
-                    bos_token: Some("<s>"),
-                    eos_token: Some("</s>"),
-                    ..Default::default()
-                },
-                target: "<s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s>[INST] I'd like to show off how chat templating works! [/INST]",
-            },
-            ChatTemplateTestItem {
-                name: "cognitivecomputations/dolphin-2.5-mixtral-8x7b",
-                chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
-                input: ChatTemplateInputs {
-                    messages: example_chat.clone(),
-                    add_generation_prompt: false,
-                    bos_token: Some("<s>"),
-                    eos_token: Some("</s>"),
-                    ..Default::default()
-                },
-                target: "<|im_start|>user\nHello, how are you?<|im_end|>\n<|im_start|>assistant\nI'm doing great. How can I help you today?<|im_end|>\n<|im_start|>user\nI'd like to show off how chat templating works!<|im_end|>\n",
-            },
-            ChatTemplateTestItem {
-                name: "openchat/openchat-3.5-0106",
-                // `.title()` has been replaced with `| title` in the following template
-                chat_template: "{{ bos_token }}{% for message in messages %}{{ 'GPT4 Correct ' + (message['role'] | title) + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}",
-                input: ChatTemplateInputs {
-                    messages: example_chat.clone(),
-                    add_generation_prompt: false,
-                    bos_token: Some("<s>"),
-                    eos_token: Some("</s>"),
-                    ..Default::default()
-                },
-                target: "<s>GPT4 Correct User: Hello, how are you?<|end_of_turn|>GPT4 Correct Assistant: I'm doing great. How can I help you today?<|end_of_turn|>GPT4 Correct User: I'd like to show off how chat templating works!<|end_of_turn|>",
-            },
-            ChatTemplateTestItem {
-                name: "upstage/SOLAR-10.7B-Instruct-v1.0",
-                chat_template: "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}",
-                input: ChatTemplateInputs {
-                    messages: example_chat.clone(),
-                    add_generation_prompt: false,
-                    bos_token: Some("<s>"),
-                    eos_token: Some("</s>"),
-                    ..Default::default()
-                },
-                target: "Hello, how are you?</s>I'm doing great. How can I help you today?</s>I'd like to show off how chat templating works!</s>",
-            },
-            ChatTemplateTestItem {
-                name: "codellama/CodeLlama-70b-Instruct-hf",
-                // NOTE: `.strip()` has been replaced with `| trim` in the following template
-                chat_template: "{% if messages[0]['role'] == 'system' %}{% set user_index = 1 %}{% else %}{% set user_index = 0 %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != ((loop.index0 + user_index) % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ '<s>' }}{% endif %}{% set content = 'Source: ' + message['role'] + '\\n\\n ' + message['content'] | trim %}{{ content + ' <step> ' }}{% endfor %}{{'Source: assistant\\nDestination: user\\n\\n '}}",
-                input: ChatTemplateInputs {
-                    messages: example_chat.clone(),
-                    add_generation_prompt: false,
-                    bos_token: Some("<s>"),
-                    eos_token: Some("</s>"),
-                    ..Default::default()
-                },
-                target: "<s>Source: user\n\n Hello, how are you? <step> Source: assistant\n\n I'm doing great. How can I help you today? <step> Source: user\n\n I'd like to show off how chat templating works! <step> Source: assistant\nDestination: user\n\n ",
-            },
-            ChatTemplateTestItem {
-                name: "Deci/DeciLM-7B-instruct",
-                chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '### User:\\n' + message['content'] }}\n{% elif message['role'] == 'system' %}\n{{ '### System:\\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '### Assistant:\\n'  + message['content'] }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '### Assistant:' }}\n{% endif %}\n{% endfor %}",
-                input: ChatTemplateInputs {
-                    messages: example_chat.clone(),
-                    add_generation_prompt: false,
-                    bos_token: Some("<s>"),
-                    eos_token: Some("</s>"),
-                    ..Default::default()
-                },
-                target: "### User:\nHello, how are you?### Assistant:\nI'm doing great. How can I help you today?### User:\nI'd like to show off how chat templating works!",
-            },
-            ChatTemplateTestItem {
-                name: "Qwen/Qwen1.5-72B-Chat",
-                chat_template: "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\\nYou are a helpful assistant<|im_end|>\\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
-                input: ChatTemplateInputs {
-                    messages: example_chat.clone(),
-                    add_generation_prompt: false,
-                    bos_token: Some("<s>"),
-                    eos_token: Some("</s>"),
-                    ..Default::default()
-                },
-                target: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello, how are you?<|im_end|>\n<|im_start|>assistant\nI'm doing great. How can I help you today?<|im_end|>\n<|im_start|>user\nI'd like to show off how chat templating works!",
-            },
-            ChatTemplateTestItem {
-                name: "deepseek-ai/deepseek-llm-7b-chat",
-                chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\\n\\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
-                input: ChatTemplateInputs {
-                    messages: example_chat.clone(),
-                    add_generation_prompt: false,
-                    bos_token: Some("<｜begin▁of▁sentence｜>"),
-                    eos_token: Some("<｜end▁of▁sentence｜>"),
-                    ..Default::default()
-                },
-                target: "<｜begin▁of▁sentence｜>User: Hello, how are you?\n\nAssistant: I'm doing great. How can I help you today?<｜end▁of▁sentence｜>User: I'd like to show off how chat templating works!\n\n",
-            },
-            ChatTemplateTestItem {
-                name: "h2oai/h2o-danube-1.8b-chat",
-                chat_template: "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|prompt|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ '<|system|>' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '<|answer|>'  + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|answer|>' }}{% endif %}{% endfor %}",
-                input: ChatTemplateInputs {
-                    messages: example_chat.clone(),
-                    add_generation_prompt: false,
-                    bos_token: Some("<s>"),
-                    eos_token: Some("</s>"),
-                    ..Default::default()
-                },
-                target: "<|prompt|>Hello, how are you?</s><|answer|>I'm doing great. How can I help you today?</s><|prompt|>I'd like to show off how chat templating works!</s>",
-            },
-            ChatTemplateTestItem {
-                name: "internlm/internlm2-chat-7b",
-                chat_template: "{% if messages[0]['role'] == 'user' or messages[0]['role'] == 'system' %}{{ bos_token }}{% endif %}{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% elif messages[-1]['role'] == 'assistant' %}{{ eos_token }}{% endif %}",
-                input: ChatTemplateInputs {
-                    messages: example_chat.clone(),
-                    add_generation_prompt: false,
-                    bos_token: Some("<s>"),
-                    eos_token: Some("</s>"),
-                    ..Default::default()
-                },
-                target: "<s><|im_start|>user\nHello, how are you?<|im_end|>\n<|im_start|>assistant\nI'm doing great. How can I help you today?<|im_end|>\n<|im_start|>user\nI'd like to show off how chat templating works!<|im_end|>\n",
-            },
-            ChatTemplateTestItem {
-                name: "TheBloke/deepseek-coder-33B-instruct-AWQ",
-                chat_template: "{%- set found_item = false -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set found_item = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{%- if not found_item -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{{'### Response:\\n'}}\n",
-                input: ChatTemplateInputs {
-                    messages: example_chat.clone(),
-                    add_generation_prompt: false,
-                    bos_token: Some("<｜begin▁of▁sentence｜>"),
-                    eos_token: Some("<|EOT|>"),
-                    ..Default::default()
-                },
-                target: "You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.\n### Instruction:\nHello, how are you?\n### Response:\nI'm doing great. How can I help you today?\n<|EOT|>\n### Instruction:\nI'd like to show off how chat templating works!\n### Response:\n",
-            },
-            ChatTemplateTestItem {
-                name: "ericzzz/falcon-rw-1b-chat",
-                // `.strip()` has been replaced with `| trim` in the following template
-                chat_template: "{% for message in messages %}{% if loop.index > 1 and loop.previtem['role'] != 'assistant' %}{{ ' ' }}{% endif %}{% if message['role'] == 'system' %}{{ '[SYS] ' + message['content'] | trim }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'] | trim }}{% elif message['role'] == 'assistant' %}{{ '[RESP] '  + message['content'] + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' [RESP] ' }}{% endif %}",
-                input: ChatTemplateInputs {
-                    messages: example_chat.clone(),
-                    add_generation_prompt: false,
-                    bos_token: Some("<|endoftext|>"),
-                    eos_token: Some("<|endoftext|>"),
-                    ..Default::default()
-                },
-                target: "[INST] Hello, how are you? [RESP] I'm doing great. How can I help you today?<|endoftext|>[INST] I'd like to show off how chat templating works!",
-            },
-            ChatTemplateTestItem {
-                name: "abacusai/Smaug-34B-v0.1",
-                chat_template: "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <<SYS>>\\n' + messages[idx]['content'] + '\\n<</SYS>>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' '  + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}",
-                input: ChatTemplateInputs {
-                    messages: example_chat.clone(),
-                    add_generation_prompt: false,
-                    bos_token: Some("<s>"),
-                    eos_token: Some("</s>"),
-                    ..Default::default()
-                },
-                target: "Hello, how are you? [/INST] I'm doing great. How can I help you today? </s><s>[INST] I'd like to show off how chat templating works! [/INST]",
-            },
-            ChatTemplateTestItem {
-                name: "maywell/Synatra-Mixtral-8x7B",
-                chat_template: "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n{% for message in messages %}{% if message['role'] == 'user' %}### Instruction:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'assistant' %}### Response:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'system' %}{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% endif %}\n{% endfor %}\n{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}\n### Response:\n{% endif %}",
-                input: ChatTemplateInputs {
-                    messages: example_chat.clone(),
-                    add_generation_prompt: false,
-                    bos_token: Some("<s>"),
-                    eos_token: Some("</s>"),
-                    ..Default::default()
-                },
-                target: "Below is an instruction that describes a task. Write a response that appropriately completes the request.### Instruction:Hello, how are you?### Response:I'm doing great. How can I help you today?### Instruction:I'd like to show off how chat templating works!",
-            },
-            ChatTemplateTestItem {
-                name: "deepseek-ai/deepseek-coder-33b-instruct",
-                chat_template: "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set ns.found = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}",
-                input: ChatTemplateInputs {
-                    messages: example_chat.clone(),
-                    add_generation_prompt: false,
-                    bos_token: Some("<｜begin▁of▁sentence｜>"),
-                    eos_token: Some("</EOT>"),
-                    ..Default::default()
-                },
-                target: "<｜begin▁of▁sentence｜>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\nHello, how are you?\n### Response:\nI'm doing great. How can I help you today?\n<|EOT|>\n### Instruction:\nI'd like to show off how chat templating works!\n",
-            },
-            // NOT INCLUDED
-            // - meetkai/functionary-medium-v2.2
-            // - fireworks-ai/firefunction-v1
-            // https://github
-            ChatTemplateTestItem {
-                name: "maywell/PiVoT-MoE",
-                chat_template: "{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}{% for message in messages %}{% if message['role'] == 'system' %}{{ message['content']|trim }}{% elif message['role'] == 'user' %}### Instruction: {{ message['content']|trim }}{% elif message['role'] == 'assistant' %}### Response: {{ message['content']|trim }}{% elif message['role'] == 'user_context' %}### Input: {{ message['content']|trim }}{% endif %}{% if not loop.last %}\n{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}### Response:{% endif %}",
-                input: ChatTemplateInputs {
-                    messages: example_chat_with_system.clone(),
-                    add_generation_prompt: false,
-                    bos_token: Some("<s>"),
-                    eos_token: Some("</s>"),
-                    ..Default::default()
-                },
-                target: "You are a friendly chatbot who always responds in the style of a pirateYou are a friendly chatbot who always responds in the style of a pirate### Instruction: Hello, how are you?### Response: I'm doing great. How can I help you today?### Instruction: I'd like to show off how chat templating works!",
-            },
-        ];
-
-        #[allow(unused_variables)] // name is unused
-        for ChatTemplateTestItem {
-            name,
-            chat_template,
-            input,
-            target,
-        } in test_custom_templates
-        {
-            let mut env = Environment::new();
-            env.add_function("raise_exception", raise_exception);
-            // trim all the whitespace
-            let chat_template = chat_template
-                .lines()
-                .map(|line| line.trim())
-                .collect::<Vec<&str>>()
-                .join("");
-
-            let tmpl = env.template_from_str(&chat_template);
-            let result = tmpl.unwrap().render(input).unwrap();
-            assert_eq!(result, target);
-        }
-    }
-}
--- a/router/src/queue.rs
+++ b/router/src/queue.rs
-use crate::infer::InferError;
-use crate::infer::InferStreamResponse;
-use crate::validation::ValidGenerateRequest;
-use nohash_hasher::{BuildNoHashHasher, IntMap};
-use std::cmp::min;
-use std::collections::VecDeque;
-use text_generation_client::{Batch, Request};
-use tokio::sync::{mpsc, oneshot};
-use tokio::time::Instant;
-use tracing::{info_span, instrument, Span};
-
-/// Queue entry
-#[derive(Debug)]
-pub(crate) struct Entry {
-    /// Request
-    pub request: ValidGenerateRequest,
-    /// Response sender to communicate between the Infer struct and the batching_task
-    pub response_tx: mpsc::UnboundedSender<Result<InferStreamResponse, InferError>>,
-    /// Span that will live as long as entry
-    pub span: Span,
-    /// Temporary span used as a guard when logging inference, wait times...
-    pub temp_span: Option<Span>,
-    /// Instant when this entry was queued
-    pub queue_time: Instant,
-    /// Instant when this entry was added to a batch
-    pub batch_time: Option<Instant>,
-}
-
-/// Request Queue
-#[derive(Debug, Clone)]
-pub(crate) struct Queue {
-    /// Channel to communicate with the background queue task
-    queue_sender: mpsc::UnboundedSender<QueueCommand>,
-}
-
-impl Queue {
-    pub(crate) fn new(
-        requires_padding: bool,
-        block_size: u32,
-        window_size: Option<u32>,
-        speculate: u32,
-    ) -> Self {
-        // Create channel
-        let (queue_sender, queue_receiver) = mpsc::unbounded_channel();
-
-        // Launch background queue task
-        tokio::spawn(queue_task(
-            requires_padding,
-            block_size,
-            window_size,
-            speculate,
-            queue_receiver,
-        ));
-
-        Self { queue_sender }
-    }
-
-    /// Append an entry to the queue
-    #[instrument(skip_all)]
-    pub(crate) fn append(&self, entry: Entry) {
-        // Send append command to the background task managing the state
-        // Unwrap is safe here
-        self.queue_sender
-            .send(QueueCommand::Append(Box::new(entry), Span::current()))
-            .unwrap();
-    }
-
-    // Get the next batch
-    #[instrument(skip(self))]
-    pub(crate) async fn next_batch(
-        &self,
-        min_size: Option<usize>,
-        max_size: Option<usize>,
-        prefill_token_budget: u32,
-        token_budget: u32,
-    ) -> Option<NextBatch> {
-        // Create response channel
-        let (response_sender, response_receiver) = oneshot::channel();
-        // Send next batch command to the background task managing the state
-        // Unwrap is safe here
-        self.queue_sender
-            .send(QueueCommand::NextBatch {
-                min_size,
-                max_size,
-                prefill_token_budget,
-                token_budget,
-                response_sender,
-                span: Span::current(),
-            })
-            .unwrap();
-        // Await on response channel
-        // Unwrap is safe here
-        response_receiver.await.unwrap()
-    }
-}
-
-// Background task responsible of the queue state
-async fn queue_task(
-    requires_padding: bool,
-    block_size: u32,
-    window_size: Option<u32>,
-    speculate: u32,
-    mut receiver: mpsc::UnboundedReceiver<QueueCommand>,
-) {
-    let mut state = State::new(requires_padding, block_size, window_size, speculate);
-
-    while let Some(cmd) = receiver.recv().await {
-        match cmd {
-            QueueCommand::Append(entry, span) => {
-                span.in_scope(|| state.append(*entry));
-                metrics::increment_gauge!("tgi_queue_size", 1.0);
-            }
-            QueueCommand::NextBatch {
-                min_size,
-                max_size,
-                prefill_token_budget,
-                token_budget,
-                response_sender,
-                span,
-            } => span.in_scope(|| {
-                let next_batch =
-                    state.next_batch(min_size, max_size, prefill_token_budget, token_budget);
-                response_sender.send(next_batch).unwrap();
-                metrics::gauge!("tgi_queue_size", state.entries.len() as f64);
-            }),
-        }
-    }
-}
-
-/// Queue State
-#[derive(Debug)]
-struct State {
-    /// Queue entries organized in a Vec
-    entries: VecDeque<(u64, Entry)>,
-
-    /// Id of the next entry
-    next_id: u64,
-
-    /// Id of the next batch
-    next_batch_id: u64,
-
-    /// Whether the model is using padding
-    requires_padding: bool,
-
-    /// Paged Attention block size
-    block_size: u32,
-
-    /// Sliding window
-    window_size: Option<u32>,
-
-    /// Speculation amount
-    speculate: u32,
-}
-
-impl State {
-    fn new(
-        requires_padding: bool,
-        block_size: u32,
-        window_size: Option<u32>,
-        speculate: u32,
-    ) -> Self {
-        Self {
-            entries: VecDeque::with_capacity(128),
-            next_id: 0,
-            next_batch_id: 0,
-            requires_padding,
-            block_size,
-            window_size,
-            speculate,
-        }
-    }
-
-    /// Append an entry to the queue
-    fn append(&mut self, mut entry: Entry) {
-        // Create a span that will live as long as the entry is in the queue waiting to be batched
-        let queue_span = info_span!(parent: &entry.span, "queued");
-        entry.temp_span = Some(queue_span);
-
-        // Push entry in the queue
-        self.entries.push_back((self.next_id, entry));
-        self.next_id += 1;
-    }
-
-    // Get the next batch
-    fn next_batch(
-        &mut self,
-        min_size: Option<usize>,
-        max_size: Option<usize>,
-        prefill_token_budget: u32,
-        token_budget: u32,
-    ) -> Option<NextBatch> {
-        if self.entries.is_empty() {
-            tracing::debug!("No queue");
-            return None;
-        }
-
-        // Check if we have enough entries
-        if let Some(min_size) = min_size {
-            if self.entries.len() < min_size {
-                tracing::debug!("Not enough entries");
-                return None;
-            }
-        }
-
-        // Pad prefill_token_budget to be a multiple of block size
-        let prefill_token_budget =
-            ((prefill_token_budget + self.block_size - 1) / self.block_size) * self.block_size;
-
-        // Create span for this batch to add context to inference calls
-        let next_batch_span = info_span!(parent: None, "batch", batch_size = tracing::field::Empty);
-        next_batch_span.follows_from(&Span::current());
-
-        let mut batch_requests = Vec::with_capacity(self.entries.len());
-        let mut batch_entries =
-            IntMap::with_capacity_and_hasher(self.entries.len(), BuildNoHashHasher::default());
-
-        let mut max_input_length = 0;
-        let mut prefill_tokens: u32 = 0;
-        let mut decode_tokens: u32 = 0;
-
-        // Pop entries starting from the front of the queue
-        while let Some((id, mut entry)) = self.entries.pop_front() {
-            // Filter entries where the response receiver was dropped (== entries where the request
-            // was dropped by the client)
-            if entry.response_tx.is_closed() {
-                metrics::increment_counter!("tgi_request_failure", "err" => "dropped");
-                tracing::debug!("Dropping entry");
-                continue;
-            }
-
-            if self.requires_padding {
-                // We pad to max input length in the Python shards
-                // We need to take these padding tokens into the equation
-                max_input_length = max_input_length.max(entry.request.input_length);
-                prefill_tokens = (batch_requests.len() + 1) as u32 * max_input_length
-            } else {
-                // pad to block size
-                prefill_tokens += ((entry.request.input_length + self.block_size - 1)
-                    / self.block_size)
-                    * self.block_size;
-            }
-
-            if self.requires_padding {
-                decode_tokens += entry.request.stopping_parameters.max_new_tokens;
-            } else {
-                let max_new_tokens = match self.window_size {
-                    None => entry.request.stopping_parameters.max_new_tokens,
-                    Some(window_size) => min(
-                        window_size.saturating_sub(entry.request.input_length),
-                        entry.request.stopping_parameters.max_new_tokens,
-                    ),
-                };
-
-                // pad to block size
-                decode_tokens +=
-                    ((max_new_tokens + self.block_size - 1) / self.block_size) * self.block_size;
-            }
-
-            if prefill_tokens > prefill_token_budget
-                || (prefill_tokens + decode_tokens + self.speculate) > token_budget
-            {
-                // Entry is over budget
-                // Add it back to the front
-                tracing::debug!("Over budget: prefill_tokens={prefill_tokens} > {prefill_token_budget} || {prefill_tokens} + {decode_tokens} + {} > {token_budget}", self.speculate);
-                self.entries.push_front((id, entry));
-                break;
-            }
-
-            tracing::debug!("Accepting entry");
-            // Create a new span to link the batch back to this entry
-            let entry_batch_span = info_span!(parent: &entry.span, "infer");
-            // Add relationships
-            next_batch_span.follows_from(&entry_batch_span);
-            entry_batch_span.follows_from(&next_batch_span);
-            // Update entry
-            entry.temp_span = Some(entry_batch_span);
-
-            batch_requests.push(Request {
-                id,
-                prefill_logprobs: entry.request.decoder_input_details,
-                inputs: entry.request.inputs.clone(),
-                truncate: entry.request.truncate,
-                parameters: Some(entry.request.parameters.clone()),
-                stopping_parameters: Some(entry.request.stopping_parameters.clone()),
-                top_n_tokens: entry.request.top_n_tokens,
-            });
-            // Set batch_time
-            entry.batch_time = Some(Instant::now());
-            // Insert in batch_entries IntMap
-            batch_entries.insert(id, entry);
-
-            // Check if max_size
-            if Some(batch_requests.len()) == max_size {
-                break;
-            }
-        }
-
-        // Empty batch
-        if batch_requests.is_empty() {
-            tracing::debug!("Filterered out all entries");
-            return None;
-        }
-
-        // Check if our batch is big enough
-        if let Some(min_size) = min_size {
-            // Batch is too small
-            if batch_requests.len() < min_size {
-                // Add back entries to the queue in the correct order
-                for r in batch_requests.into_iter().rev() {
-                    let id = r.id;
-                    let entry = batch_entries.remove(&id).unwrap();
-                    self.entries.push_front((id, entry));
-                }
-
-                return None;
-            }
-        }
-
-        // Final batch size
-        let size = batch_requests.len() as u32;
-        next_batch_span.record("batch_size", size);
-
-        let batch = Batch {
-            id: self.next_batch_id,
-            requests: batch_requests,
-            size,
-            max_tokens: (prefill_tokens + decode_tokens),
-        };
-        // Increment batch id
-        self.next_batch_id += 1;
-
-        metrics::histogram!("tgi_batch_next_size", batch.size as f64);
-
-        Some((batch_entries, batch, next_batch_span))
-    }
-}
-
-type NextBatch = (IntMap<u64, Entry>, Batch, Span);
-
-#[derive(Debug)]
-enum QueueCommand {
-    Append(Box<Entry>, Span),
-    NextBatch {
-        min_size: Option<usize>,
-        max_size: Option<usize>,
-        prefill_token_budget: u32,
-        token_budget: u32,
-        response_sender: oneshot::Sender<Option<NextBatch>>,
-        span: Span,
-    },
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use text_generation_client::{
-        GrammarType as ProtoGrammarType, NextTokenChooserParameters, StoppingCriteriaParameters,
-    };
-    use tracing::info_span;
-
-    fn default_entry() -> (
-        Entry,
-        mpsc::UnboundedReceiver<Result<InferStreamResponse, InferError>>,
-    ) {
-        let (response_tx, receiver_tx) = mpsc::unbounded_channel();
-
-        let entry = Entry {
-            request: ValidGenerateRequest {
-                inputs: String::new(),
-                input_length: 0,
-                truncate: 0,
-                decoder_input_details: false,
-                parameters: NextTokenChooserParameters {
-                    temperature: 0.0,
-                    top_k: 0,
-                    top_p: 0.0,
-                    typical_p: 0.0,
-                    do_sample: false,
-                    seed: 0,
-                    repetition_penalty: 0.0,
-                    frequency_penalty: 0.0,
-                    watermark: false,
-                    grammar: String::new(),
-                    grammar_type: ProtoGrammarType::None as i32,
-                },
-                stopping_parameters: StoppingCriteriaParameters {
-                    ignore_eos_token: false,
-                    max_new_tokens: 1,
-                    stop_sequences: vec![],
-                },
-                top_n_tokens: 0,
-            },
-            response_tx,
-            span: info_span!("entry"),
-            temp_span: None,
-            queue_time: Instant::now(),
-            batch_time: None,
-        };
-        (entry, receiver_tx)
-    }
-
-    #[test]
-    fn test_append() {
-        let mut state = State::new(false, 1, None, 0);
-        let (entry, _guard) = default_entry();
-
-        assert_eq!(state.next_id, 0);
-        assert_eq!(state.entries.len(), 0);
-
-        state.append(entry);
-
-        assert_eq!(state.next_id, 1);
-        assert_eq!(state.entries.len(), 1);
-        let (id, _) = state.entries.remove(0).unwrap();
-        assert_eq!(id, 0);
-    }
-
-    #[test]
-    fn test_next_batch_empty() {
-        let mut state = State::new(false, 1, None, 0);
-
-        assert!(state.next_batch(None, None, 1, 1).is_none());
-        assert!(state.next_batch(Some(1), None, 1, 1).is_none());
-    }
-
-    #[test]
-    fn test_next_batch_min_size() {
-        let mut state = State::new(false, 1, None, 0);
-        let (entry1, _guard1) = default_entry();
-        let (entry2, _guard2) = default_entry();
-        state.append(entry1);
-        state.append(entry2);
-
-        let (entries, batch, _) = state.next_batch(None, None, 2, 2).unwrap();
-        assert_eq!(entries.len(), 2);
-        assert!(entries.contains_key(&0));
-        assert!(entries.contains_key(&1));
-        assert!(entries.get(&0).unwrap().batch_time.is_some());
-        assert!(entries.get(&1).unwrap().batch_time.is_some());
-        assert_eq!(batch.id, 0);
-        assert_eq!(batch.size, 2);
-
-        assert_eq!(state.next_id, 2);
-        assert_eq!(state.entries.len(), 0);
-        assert_eq!(state.next_batch_id, 1);
-
-        let (entry3, _guard3) = default_entry();
-        state.append(entry3);
-
-        assert!(state.next_batch(Some(2), None, 2, 2).is_none());
-
-        assert_eq!(state.next_id, 3);
-        assert_eq!(state.entries.len(), 1);
-        let (id, _) = state.entries.remove(0).unwrap();
-        assert_eq!(id, 2);
-    }
-
-    #[test]
-    fn test_next_batch_max_size() {
-        let mut state = State::new(false, 1, None, 0);
-        let (entry1, _guard1) = default_entry();
-        let (entry2, _guard2) = default_entry();
-        state.append(entry1);
-        state.append(entry2);
-
-        let (entries, batch, _) = state.next_batch(None, Some(1), 2, 2).unwrap();
-        assert_eq!(entries.len(), 1);
-        assert!(entries.contains_key(&0));
-        assert!(entries.get(&0).unwrap().batch_time.is_some());
-        assert_eq!(batch.id, 0);
-        assert_eq!(batch.size, 1);
-
-        assert_eq!(state.next_id, 2);
-        assert_eq!(state.entries.len(), 1);
-        assert_eq!(state.next_batch_id, 1);
-    }
-
-    #[test]
-    fn test_next_batch_token_budget() {
-        let mut state = State::new(false, 1, None, 0);
-        let (entry1, _guard1) = default_entry();
-        let (entry2, _guard2) = default_entry();
-        state.append(entry1);
-        state.append(entry2);
-
-        let (entries, batch, _) = state.next_batch(None, None, 1, 1).unwrap();
-        assert_eq!(entries.len(), 1);
-        assert!(entries.contains_key(&0));
-        assert_eq!(batch.id, 0);
-        assert_eq!(batch.size, 1);
-
-        assert_eq!(state.next_id, 2);
-        assert_eq!(state.entries.len(), 1);
-        assert_eq!(state.next_batch_id, 1);
-
-        let (entry3, _guard3) = default_entry();
-        state.append(entry3);
-
-        let (entries, batch, _) = state.next_batch(None, None, 3, 3).unwrap();
-        assert_eq!(entries.len(), 2);
-        assert!(entries.contains_key(&1));
-        assert!(entries.contains_key(&2));
-        assert_eq!(batch.id, 1);
-        assert_eq!(batch.size, 2);
-
-        assert_eq!(state.next_id, 3);
-        assert_eq!(state.entries.len(), 0);
-        assert_eq!(state.next_batch_id, 2);
-    }
-
-    #[tokio::test]
-    async fn test_queue_append() {
-        let queue = Queue::new(false, 1, None, 0);
-        let (entry, _guard) = default_entry();
-        queue.append(entry);
-    }
-
-    #[tokio::test]
-    async fn test_queue_next_batch_empty() {
-        let queue = Queue::new(false, 1, None, 0);
-
-        assert!(queue.next_batch(None, None, 1, 1).await.is_none());
-        assert!(queue.next_batch(Some(1), None, 1, 1).await.is_none());
-    }
-
-    #[tokio::test]
-    async fn test_queue_next_batch_min_size() {
-        let queue = Queue::new(false, 1, None, 0);
-        let (entry1, _guard1) = default_entry();
-        let (entry2, _guard2) = default_entry();
-        queue.append(entry1);
-        queue.append(entry2);
-
-        let (entries, batch, _) = queue.next_batch(None, None, 2, 2).await.unwrap();
-        assert_eq!(entries.len(), 2);
-        assert!(entries.contains_key(&0));
-        assert!(entries.contains_key(&1));
-        assert!(entries.get(&0).unwrap().batch_time.is_some());
-        assert!(entries.get(&1).unwrap().batch_time.is_some());
-        assert_eq!(batch.id, 0);
-        assert_eq!(batch.size, 2);
-
-        let (entry3, _guard3) = default_entry();
-        queue.append(entry3);
-
-        // Not enough requests pending
-        assert!(queue.next_batch(Some(2), None, 2, 2).await.is_none());
-        // Not enough token budget
-        assert!(queue.next_batch(Some(1), None, 0, 0).await.is_none());
-        // Ok
-        let (entries2, batch2, _) = queue.next_batch(Some(1), None, 2, 2).await.unwrap();
-        assert_eq!(entries2.len(), 1);
-        assert!(entries2.contains_key(&2));
-        assert!(entries2.get(&2).unwrap().batch_time.is_some());
-        assert_eq!(batch2.id, 1);
-        assert_eq!(batch2.size, 1);
-    }
-
-    #[tokio::test]
-    async fn test_queue_next_batch_max_size() {
-        let queue = Queue::new(false, 1, None, 0);
-        let (entry1, _guard1) = default_entry();
-        let (entry2, _guard2) = default_entry();
-        queue.append(entry1);
-        queue.append(entry2);
-
-        let (entries, batch, _) = queue.next_batch(None, Some(1), 2, 2).await.unwrap();
-        assert_eq!(entries.len(), 1);
-        assert!(entries.contains_key(&0));
-        assert!(entries.get(&0).unwrap().batch_time.is_some());
-        assert_eq!(batch.id, 0);
-        assert_eq!(batch.size, 1);
-    }
-
-    #[tokio::test]
-    async fn test_queue_next_batch_token_budget() {
-        let queue = Queue::new(false, 1, None, 0);
-        let (entry1, _guard1) = default_entry();
-        let (entry2, _guard2) = default_entry();
-        queue.append(entry1);
-        queue.append(entry2);
-
-        let (entries, batch, _) = queue.next_batch(None, None, 1, 1).await.unwrap();
-        assert_eq!(entries.len(), 1);
-        assert!(entries.contains_key(&0));
-        assert_eq!(batch.id, 0);
-        assert_eq!(batch.size, 1);
-
-        let (entry3, _guard3) = default_entry();
-        queue.append(entry3);
-
-        let (entries, batch, _) = queue.next_batch(None, None, 3, 3).await.unwrap();
-        assert_eq!(entries.len(), 2);
-        assert!(entries.contains_key(&1));
-        assert!(entries.contains_key(&2));
-        assert_eq!(batch.id, 1);
-        assert_eq!(batch.size, 2);
-    }
-
-    #[tokio::test]
-    async fn test_queue_next_batch_token_speculate() {
-        let queue = Queue::new(false, 1, None, 2);
-        let (entry1, _guard1) = default_entry();
-        let (entry2, _guard2) = default_entry();
-        queue.append(entry1);
-        queue.append(entry2);
-
-        // Budget of 1 is not enough
-        assert!(queue.next_batch(None, None, 1, 1).await.is_none());
-
-        let (entries, batch, _) = queue.next_batch(None, None, 6, 6).await.unwrap();
-        assert_eq!(entries.len(), 2);
-        assert!(entries.contains_key(&0));
-        assert!(entries.contains_key(&1));
-        assert_eq!(batch.id, 0);
-        assert_eq!(batch.size, 2);
-    }
-
-    #[tokio::test]
-    async fn test_queue_next_batch_dropped_receiver() {
-        let queue = Queue::new(false, 1, None, 0);
-        let (entry, _) = default_entry();
-        queue.append(entry);
-
-        assert!(queue.next_batch(None, None, 1, 1).await.is_none());
-    }
-}
--- a/server/text_generation_server/models/cache_manager.py
+++ b/server/text_generation_server/models/cache_manager.py
-import math
-import torch
-
-from typing import Optional, List, Tuple
-from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
-
-BLOCK_SIZE: int = 16
-# Will be set in warmup
-CACHE_MANAGER: Optional["CacheManager"] = None
-
-
-class CacheManager:
-    def __init__(
-        self,
-        num_blocks: int,
-        num_layers: int,
-        num_heads: int,
-        head_size: int,
-        repeat_slots: bool,
-        dtype: torch.dtype,
-        device: torch.device,
-    ):
-        self.block_size = BLOCK_SIZE
-        self.num_blocks = num_blocks
-        self.repeat_slots = repeat_slots
-
-        element_size = torch.tensor([], dtype=dtype).element_size()
-        if IS_XPU_SYSTEM:
-            x = 1
-        else:
-            x = self.block_size // element_size
-
-        self.kv_cache = [
-            (
-                torch.empty(
-                    (num_blocks, num_heads, head_size // x, self.block_size, x),
-                    dtype=dtype,
-                    device=device,
-                ),
-                torch.empty(
-                    (num_blocks, num_heads, head_size, self.block_size),
-                    dtype=dtype,
-                    device=device,
-                ),
-            )
-            for _ in range(num_layers)
-        ]
-        self.free_block_mask = torch.ones(num_blocks, dtype=torch.int32, device="cpu")
-        self.slots = torch.arange(
-            0, num_blocks * self.block_size, dtype=torch.int64
-        ).view(num_blocks, self.block_size)
-
-    def allocate(
-        self,
-        needed_blocks_slots: List[Tuple[int, int]],
-        blocks: int,
-        max_blocks: int,
-        device: torch.device,
-    ):
-        # Get free blocks indices by finding values in mask that are not set to 0
-        free_block_indices = self.free_block_mask.nonzero()
-        if blocks > len(free_block_indices):
-            raise RuntimeError(
-                f"Out of available cache blocks: asked {blocks}, only {len(free_block_indices)} free blocks"
-            )
-
-        # Slice by the number of required blocks
-        block_indices = free_block_indices[:blocks]
-        block_indices = block_indices.flatten()
-
-        # Padded block tables
-        block_tables_tensor = torch.zeros(
-            (len(needed_blocks_slots), max_blocks), dtype=torch.int32
-        )
-
-        # Allocate paged attention blocks
-        cumulative_blocks = 0
-        slots = []
-        block_tables = []
-        for i, (needed_blocks, needed_slots) in enumerate(needed_blocks_slots):
-            # Get allocated blocks for this sequence
-            allocated_blocks = block_indices[
-                cumulative_blocks : cumulative_blocks + needed_blocks
-            ]
-            # Get slots for the allocated blocks
-            all_slots = self.slots[allocated_blocks].flatten()
-
-            # Repeat slots in the case of context sliding window
-            if needed_slots > len(all_slots) and self.repeat_slots:
-                repeats = math.ceil(needed_slots / len(all_slots))
-                all_slots = all_slots.repeat(repeats)
-
-            allocated_slots = all_slots[:needed_slots]
-
-            slots.append(allocated_slots)
-            block_tables.append(allocated_blocks.tolist())
-            block_tables_tensor[i, :needed_blocks] = allocated_blocks
-            cumulative_blocks += needed_blocks
-
-        block_tables = block_tables
-        block_tables_tensor = block_tables_tensor.to(device)
-        slots = torch.concat(slots).to(device)
-
-        # Allocate the required number of blocks by setting the mask to 0
-        self.free_block_mask[block_indices] = 0
-
-        return block_tables, block_tables_tensor, slots
-
-    def free(self, block_indices: Optional[List[int]]):
-        if block_indices is not None and block_indices:
-            # Reset mask
-            self.free_block_mask[block_indices] = 1
-
-
-def set_cache_manager(
-    num_blocks: int,
-    num_layers: int,
-    num_heads: int,
-    head_size: int,
-    repeat_slots: bool,
-    dtype: torch.dtype,
-    device: torch.device,
-) -> CacheManager:
-    global CACHE_MANAGER
-    if CACHE_MANAGER is not None:
-        del CACHE_MANAGER
-        torch.cuda.empty_cache()
-
-    CACHE_MANAGER = CacheManager(
-        num_blocks, num_layers, num_heads, head_size, repeat_slots, dtype, device
-    )
-    return CACHE_MANAGER
-
-
-def get_cache_manager() -> CacheManager:
-    global CACHE_MANAGER
-    if CACHE_MANAGER is None:
-        raise RuntimeError("cache manager was not initialized")
-
-    return CACHE_MANAGER
--- a/server/text_generation_server/utils/awq/conversion_utils.py
+++ b/server/text_generation_server/utils/awq/conversion_utils.py
-import torch
-from typing import List
-
-
-AWQ_PACK_ORDER = [0, 2, 4, 6, 1, 3, 5, 7]
-REVERSE_AWQ_PACK_ORDER = [0, 4, 1, 5, 2, 6, 3, 7]
-
-
-def pack(imatrix: torch.Tensor, direction: str = "column"):
-    """
-    Packs a 4-bit integer matrix into a packed 32-bit integer matrix.
-    Args:
-        imatrix (torch.Tensor): matrix of integers
-        direction (str): direction of packing, either "column" or "row"
-    Returns:
-        qmatrix (torch.Tensor): packed matrix of integers
-    """
-    shifts = torch.arange(0, 32, 4, dtype=torch.int32, device=imatrix.device)
-
-    imatrix = imatrix.to(torch.int8) & 0x0F  # eventually correct overflow
-
-    if direction == "column":
-        imatrix = imatrix.view(-1, imatrix.shape[1] // (32 // 4), (32 // 4))
-        qmatrix = torch.bitwise_left_shift(imatrix, shifts[None, None, :]).sum(dim=-1)
-
-    elif direction == "row":
-        imatrix = imatrix.view(imatrix.shape[0] // (32 // 4), (32 // 4), -1)
-        qmatrix = torch.bitwise_left_shift(imatrix, shifts[None, :, None]).sum(dim=1)
-
-    qmatrix = qmatrix.to(torch.int32)
-
-    return qmatrix
-
-
-def unpack(qmatrix: torch.Tensor, direction: str = "column"):
-    """
-    Unpacks a 32-bit packed integer matrix into a 4-bit integer matrix.
-    Args:
-        qmatrix (torch.Tensor): matrix of packed integers
-        direction (str): direction of unpacking, either "column" or "row"
-    Returns:
-        imatrix (torch.Tensor): matrix of integers
-    """
-    shifts = torch.arange(0, 32, 4, device=qmatrix.device)
-
-    if direction == "column":
-        imatrix = torch.bitwise_right_shift(
-            qmatrix[:, :, None], shifts[None, None, :]
-        ).view(qmatrix.shape[0], -1)
-
-    elif direction == "row":
-        imatrix = torch.bitwise_right_shift(
-            qmatrix[:, None, :], shifts[None, :, None]
-        ).view(-1, qmatrix.shape[-1])
-
-    imatrix = imatrix.to(torch.int8) & 0x0F  # eventually correct overflow
-
-    return imatrix
-
-
-def apply_order(
-    imatrix: torch.Tensor,
-    direction: str = "column",
-    order: List[int] = AWQ_PACK_ORDER,
-):
-    """
-    Applies the order to a 4-bit integer matrix.
-    Args:
-        imatrix (torch.Tensor): matrix of integers
-        direction (str): direction of applying order, either "column" or "row"
-        order (List[int]): order to apply, default is AWQ_PACK_ORDER
-    Returns:
-        imatrix (torch.Tensor): matrix of integers
-    """
-    if direction == "column":
-        imatrix = imatrix.view(-1, (32 // 4))[:, order].view(imatrix.shape)
-    elif direction == "row":
-        imatrix = imatrix.view((32 // 4), -1)[order, :].view(imatrix.shape)
-
-    return imatrix
-
-
-def fast_awq_to_gptq(qweight, qzeros):
-    # awq uses column packing for both weights and zeros
-    izeros = unpack(qzeros, direction="column")
-    iweights = unpack(qweight, direction="column")
-
-    # Reverse the order of the iweight and izeros tensors
-    izeros = apply_order(izeros, direction="column", order=REVERSE_AWQ_PACK_ORDER)
-    iweights = apply_order(iweights, direction="column", order=REVERSE_AWQ_PACK_ORDER)
-    # Subtract 1 from the izeros tensor (gptq adds 1 to the zeros)
-    izeros = izeros - 1
-    # exllama uses row packing for weights and column packing for zeros
-    qzeros = pack(izeros, direction="column")
-    qweight = pack(iweights, direction="row")
-
-    return qweight, qzeros
--- a/server/text_generation_server/utils/awq/quantize/qmodule.py
+++ b/server/text_generation_server/utils/awq/quantize/qmodule.py
-# Copied logic from https://github.com/mit-han-lab/llm-awq/blob/f084f40bd996f3cf3a0633c1ad7d9d476c318aaa/awq/quantize/qmodule.py
-
-import math
-import torch
-import torch.nn as nn
-import awq_inference_engine  # with CUDA kernels
-
-
-# class ScaledActivation(nn.Module):
-#     def __init__(self, module, scales):
-#         super().__init__()
-#         self.act = module
-#         self.scales = nn.Parameter(scales.data)
-#
-#     def forward(self, x):
-#         return self.act(x) / self.scales.view(1, 1, -1).to(x.device)
-
-
-class WQLinear(nn.Module):
-    def __init__(self, w_bit, group_size, qweight, qzeros, scales, bias):
-        super().__init__()
-
-        if w_bit not in [4]:
-            raise NotImplementedError("Only 4-bit are supported for now.")
-
-        self.in_features = qweight.shape[0]
-        self.out_features = qweight.shape[1] * 32 // w_bit
-
-        self.w_bit = w_bit
-        self.group_size = group_size if group_size != -1 else self.in_features
-        # quick sanity check (make sure aligment)
-        assert self.in_features % self.group_size == 0
-        assert self.out_features % (32 // self.w_bit) == 0
-
-        self.qweight = qweight
-        self.qzeros = qzeros
-        self.scales = scales
-        if bias:
-            self.bias = bias
-        else:
-            self.bias = None
-
-    @torch.no_grad()
-    def forward(self, x):
-        out_shape = x.shape[:-1] + (self.out_features,)
-        out = awq_inference_engine.gemm_forward_cuda(
-            x.reshape(-1, x.shape[-1]), self.qweight, self.scales, self.qzeros, 8
-        )
-        out = out + self.bias if self.bias is not None else out
-        return out.reshape(out_shape)
--- a/server/text_generation_server/utils/flash_attn.py
+++ b/server/text_generation_server/utils/flash_attn.py
-import os
-import torch
-
-from loguru import logger
-import math
-
-from text_generation_server.utils.import_utils import (
-    IS_CUDA_SYSTEM,
-    IS_ROCM_SYSTEM,
-    IS_XPU_SYSTEM,
-)
-
-if os.getenv("USE_FLASH_ATTENTION", "").lower() == "false":
-    raise ImportError("`USE_FLASH_ATTENTION` is false.")
-HAS_FLASH_ATTN = True
-HAS_FLASH_ATTN_V2_CUDA = False
-HAS_FLASH_ATTN_V2_ROCM = False
-
-if IS_XPU_SYSTEM:
-    import intel_extension_for_pytorch as ipex
-
-if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM:
-    if not torch.cuda.is_available():
-        raise ImportError("CUDA is not available")
-
-    major, minor = torch.cuda.get_device_capability()
-    is_sm75 = major == 7 and minor == 5
-    is_sm8x = major == 8 and minor >= 0
-    is_sm90 = major == 9 and minor == 0
-
-    HAS_FLASH_ATTN = False
-    HAS_FLASH_ATTN_V2_CUDA = False
-    HAS_FLASH_ATTN_V2_ROCM = False
-    try:
-        try:
-            import flash_attn_2_cuda
-        except ImportError:
-            architecture_suffix = ""
-            if IS_CUDA_SYSTEM:
-                architecture_suffix = "-cuda"
-            elif IS_ROCM_SYSTEM:
-                architecture_suffix = "-rocm"
-            raise ImportError(
-                "Flash Attention V2 is not installed.\n"
-                "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
-                f"or install flash attention v2 with `cd server && make install install-flash-attention-v2{architecture_suffix}`"
-            )
-        if not (is_sm8x or is_sm90) and IS_CUDA_SYSTEM:
-            raise ImportError(
-                f"GPU with CUDA capability {major} {minor} is not supported for "
-                "Flash Attention V2"
-            )
-        HAS_FLASH_ATTN_V2_CUDA = IS_CUDA_SYSTEM
-        HAS_FLASH_ATTN_V2_ROCM = IS_ROCM_SYSTEM
-    except ImportError as e:
-        try:
-            import flash_attn_cuda
-        except ImportError:
-            raise ImportError(
-                "Flash Attention is not installed.\n"
-                "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
-                "or install flash attention with `cd server && make install install-flash-attention`"
-            ) from e
-
-        if IS_CUDA_SYSTEM and not (is_sm75 or is_sm8x or is_sm90):
-            raise ImportError(
-                f"GPU with CUDA capability {major} {minor} is not supported"
-            ) from e
-        elif IS_ROCM_SYSTEM:
-            for idx in range(torch.cuda.device_count()):
-                if "MI210" not in torch.cuda.get_device_name(
-                    idx
-                ) and "MI250" not in torch.cuda.get_device_name(idx):
-                    raise ImportError(
-                        f"AMD GPU {torch.cuda.get_device_name(idx)} does not support flash-attention"
-                    )
-
-        logger.warning(f"Unable to use Flash Attention V2: {e}")
-        HAS_FLASH_ATTN = True
-
-
-def attention(
-    q,
-    k,
-    v,
-    out,
-    cu_seqlens,
-    max_s,
-    softmax_scale,
-    window_size_left=-1,
-):
-    if window_size_left <= 0 and window_size_left != -1:
-        raise ValueError("`window_size_left` must be > 0 or -1")
-
-    if IS_XPU_SYSTEM:
-        if window_size_left != -1:
-            raise ValueError(
-                f"XPU version of Flash Attention does not support window attention (window_size_left != -1, got window_size_left={window_size_left})."
-            )
-        return ipex.llm.functional.varlen_attention(
-            q,
-            k,
-            v,
-            out,
-            cu_seqlens,
-            cu_seqlens,
-            max_s,
-            max_s,
-            0.0,
-            softmax_scale,
-            False,
-            True,
-            False,
-            None,
-        )
-
-    if HAS_FLASH_ATTN_V2_CUDA:
-        return flash_attn_2_cuda.varlen_fwd(
-            q,
-            k,
-            v,
-            out,
-            cu_seqlens,
-            cu_seqlens,
-            None,
-            None,
-            None,
-            max_s,
-            max_s,
-            0.0,
-            softmax_scale,
-            False,
-            True,
-            window_size_left,
-            0,
-            False,
-            None,
-        )
-    elif HAS_FLASH_ATTN_V2_ROCM:
-        if window_size_left != -1:
-            raise ValueError(
-                f"RoCm version of Flash Attention v2 does not support window attention (window_size_left != -1, got window_size_left={window_size_left})."
-            )
-
-        # RoCm flash API does not take the window_size_left and window_size_right arguments.
-        return flash_attn_2_cuda.varlen_fwd(
-            q,
-            k,
-            v,
-            out,
-            cu_seqlens,
-            cu_seqlens,
-            max_s,
-            max_s,
-            0.0,
-            softmax_scale,
-            False,
-            True,
-            False,
-            None,
-        )
-    elif HAS_FLASH_ATTN:
-        if window_size_left != -1:
-            raise NotImplementedError(
-                "window_size_left is only available with flash attn v2"
-            )
-
-        # Flash attention v1 requires q, k and v to have the same number of heads
-        if k.shape[1] != q.shape[1]:
-            # MQA expand
-            if k.shape[1] == 1:
-                k = k.expand(-1, q.shape[1], -1)
-            # Grouped attention reshape
-            else:
-                original_shape = k.shape
-                k = (
-                    k.unsqueeze(2)
-                    .expand(-1, -1, q.shape[1] // k.shape[1], -1)
-                    .reshape(original_shape[0], -1, original_shape[2])
-                )
-        if v.shape[1] != q.shape[1]:
-            # MQA expand
-            if v.shape[1] == 1:
-                v = v.expand(-1, q.shape[1], -1)
-            # Grouped attention reshape
-            else:
-                original_shape = v.shape
-                v = (
-                    v.unsqueeze(2)
-                    .expand(-1, -1, q.shape[1] // v.shape[1], -1)
-                    .reshape(original_shape[0], -1, original_shape[2])
-                )
-
-        return flash_attn_cuda.fwd(
-            q,
-            k,
-            v,
-            out,
-            cu_seqlens,
-            cu_seqlens,
-            max_s,
-            max_s,
-            0.0,
-            softmax_scale,
-            False,
-            True,
-            False,
-            0,
-            None,
-        )
-
-    raise NotImplementedError("flash attention is not installed")
--- a/server/text_generation_server/utils/gptq/custom_autotune.py
+++ b/server/text_generation_server/utils/gptq/custom_autotune.py
-# https://github.com/fpgaminer/GPTQ-triton
-"""
-Mostly the same as the autotuner in Triton, but with a few changes like using 40 runs instead of 100.
-"""
-
-import builtins
-import math
-import time
-from typing import Dict
-
-import triton
-
-
-class Autotuner(triton.KernelInterface):
-    def __init__(
-        self,
-        fn,
-        arg_names,
-        configs,
-        key,
-        reset_to_zero,
-        prune_configs_by: Dict = None,
-        nearest_power_of_two: bool = False,
-    ):
-        """
-        :param prune_configs_by: a dict of functions that are used to prune configs, fields:
-                'perf_model': performance model used to predicate running time with different configs, returns running time
-                'top_k': number of configs to bench
-                'prune_num_stages_by'(optional): a function used to prune num_stages. It take configs:List[Config] as its input, and returns pruned configs.
-                'nearest_power_of_two'(optional): whether to round key arguments to the nearest power of two when caching tuning results
-        """
-        if not configs:
-            self.configs = [triton.Config({}, num_warps=4, num_stages=2)]
-        else:
-            self.configs = configs
-        self.key_idx = [arg_names.index(k) for k in key]
-        self.nearest_power_of_two = nearest_power_of_two
-        self.cache = {}
-        # hook to reset all required tensor to zeros before relaunching a kernel
-        self.hook = lambda args: 0
-        if reset_to_zero is not None:
-            self.reset_idx = [arg_names.index(k) for k in reset_to_zero]
-
-            def _hook(args):
-                for i in self.reset_idx:
-                    args[i].zero_()
-
-            self.hook = _hook
-        self.arg_names = arg_names
-        # prune configs
-        if prune_configs_by:
-            perf_model, top_k = (
-                prune_configs_by["perf_model"],
-                prune_configs_by["top_k"],
-            )
-            if "early_config_prune" in prune_configs_by:
-                early_config_prune = prune_configs_by["early_config_prune"]
-        else:
-            perf_model, top_k, early_config_prune = None, None, None
-        self.perf_model, self.configs_top_k = perf_model, top_k
-        self.early_config_prune = early_config_prune
-        self.fn = fn
-
-    def _bench(self, *args, config, **meta):
-        # check for conflicts, i.e. meta-parameters both provided
-        # as kwargs and by the autotuner
-        conflicts = meta.keys() & config.kwargs.keys()
-        if conflicts:
-            raise ValueError(
-                f"Conflicting meta-parameters: {', '.join(conflicts)}."
-                " Make sure that you don't re-define auto-tuned symbols."
-            )
-        # augment meta-parameters with tunable ones
-        current = dict(meta, **config.kwargs)
-
-        def kernel_call():
-            if config.pre_hook:
-                config.pre_hook(self.nargs)
-            self.hook(args)
-            self.fn.run(
-                *args,
-                num_warps=config.num_warps,
-                num_stages=config.num_stages,
-                **current,
-            )
-
-        try:
-            # In testings using only 40 reps seems to be close enough and it appears to be what PyTorch uses
-            # PyTorch also sets fast_flush to True, but I didn't see any speedup so I'll leave the default
-            return triton.testing.do_bench(
-                kernel_call, quantiles=(0.5, 0.2, 0.8), rep=40
-            )
-        except triton.OutOfResources:
-            return (float("inf"), float("inf"), float("inf"))
-
-    def run(self, *args, **kwargs):
-        self.nargs = dict(zip(self.arg_names, args))
-        if len(self.configs) > 1:
-            key = tuple(args[i] for i in self.key_idx)
-
-            # This reduces the amount of autotuning by rounding the keys to the nearest power of two
-            # In my testing this gives decent results, and greatly reduces the amount of tuning required
-            if self.nearest_power_of_two:
-                key = tuple([2 ** int(math.log2(x) + 0.5) for x in key])
-
-            if key not in self.cache:
-                # prune configs
-                pruned_configs = self.prune_configs(kwargs)
-                bench_start = time.time()
-                timings = {
-                    config: self._bench(*args, config=config, **kwargs)
-                    for config in pruned_configs
-                }
-                bench_end = time.time()
-                self.bench_time = bench_end - bench_start
-                self.cache[key] = builtins.min(timings, key=timings.get)
-                self.hook(args)
-                self.configs_timings = timings
-            config = self.cache[key]
-        else:
-            config = self.configs[0]
-        self.best_config = config
-        if config.pre_hook is not None:
-            config.pre_hook(self.nargs)
-        return self.fn.run(
-            *args,
-            num_warps=config.num_warps,
-            num_stages=config.num_stages,
-            **kwargs,
-            **config.kwargs,
-        )
-
-    def prune_configs(self, kwargs):
-        pruned_configs = self.configs
-        if self.early_config_prune:
-            pruned_configs = self.early_config_prune(self.configs, self.nargs)
-        if self.perf_model:
-            top_k = self.configs_top_k
-            if isinstance(top_k, float) and top_k <= 1.0:
-                top_k = int(len(self.configs) * top_k)
-            if len(pruned_configs) > top_k:
-                est_timing = {
-                    config: self.perf_model(
-                        **self.nargs,
-                        **kwargs,
-                        **config.kwargs,
-                        num_stages=config.num_stages,
-                        num_warps=config.num_warps,
-                    )
-                    for config in pruned_configs
-                }
-                pruned_configs = sorted(est_timing.keys(), key=lambda x: est_timing[x])[
-                    :top_k
-                ]
-        return pruned_configs
-
-    def warmup(self, *args, **kwargs):
-        self.nargs = dict(zip(self.arg_names, args))
-        for config in self.prune_configs(kwargs):
-            self.fn.warmup(
-                *args,
-                num_warps=config.num_warps,
-                num_stages=config.num_stages,
-                **kwargs,
-                **config.kwargs,
-            )
-        self.nargs = None
-
-
-def autotune(
-    configs, key, prune_configs_by=None, reset_to_zero=None, nearest_power_of_two=False
-):
-    """
-    Decorator for auto-tuning a :code:`triton.jit`'d function.
-    .. highlight:: python
-    .. code-block:: python
-            @triton.autotune(configs=[
-                    triton.Config(meta={'BLOCK_SIZE': 128}, num_warps=4),
-                    triton.Config(meta={'BLOCK_SIZE': 1024}, num_warps=8),
-                    ],
-                    key=['x_size'] # the two above configs will be evaluated anytime
-                                                    # the value of x_size changes
-            )
-            @triton.jit
-            def kernel(x_ptr, x_size, **META):
-                    BLOCK_SIZE = META['BLOCK_SIZE']
-    :note: When all the configurations are evaluated, the kernel will run multiple time.
-                    This means that whatever value the kernel updates will be updated multiple times.
-                    To avoid this undesired behavior, you can use the `reset_to_zero` argument, which
-                    reset the value of the provided tensor to `zero` before running any configuration.
-    :param configs: a list of :code:`triton.Config` objects
-    :type configs: list[triton.Config]
-    :param key: a list of argument names whose change in value will trigger the evaluation of all provided configs.
-    :type key: list[str]
-    :param prune_configs_by: a dict of functions that are used to prune configs, fields:
-            'perf_model': performance model used to predicate running time with different configs, returns running time
-            'top_k': number of configs to bench
-            'early_config_prune'(optional): a function used to do early prune (eg, num_stages). It take configs:List[Config] as its input, and returns pruned configs.
-    :param reset_to_zero: a list of argument names whose value will be reset to zero before evaluating any configs.
-    :type reset_to_zero: list[str]
-    """
-
-    def decorator(fn):
-        return Autotuner(
-            fn,
-            fn.arg_names,
-            configs,
-            key,
-            reset_to_zero,
-            prune_configs_by,
-            nearest_power_of_two,
-        )
-
-    return decorator
-
-
-def matmul248_kernel_config_pruner(configs, nargs):
-    """
-    The main purpose of this function is to shrink BLOCK_SIZE_* when the corresponding dimension is smaller.
-    """
-    m = max(2 ** int(math.ceil(math.log2(nargs["M"]))), 16)
-    n = max(2 ** int(math.ceil(math.log2(nargs["N"]))), 16)
-    k = max(2 ** int(math.ceil(math.log2(nargs["K"]))), 16)
-
-    used = set()
-    for config in configs:
-        block_size_m = min(m, config.kwargs["BLOCK_SIZE_M"])
-        block_size_n = min(n, config.kwargs["BLOCK_SIZE_N"])
-        block_size_k = min(k, config.kwargs["BLOCK_SIZE_K"])
-        group_size_m = config.kwargs["GROUP_SIZE_M"]
-
-        if (
-            block_size_m,
-            block_size_n,
-            block_size_k,
-            group_size_m,
-            config.num_stages,
-            config.num_warps,
-        ) in used:
-            continue
-
-        used.add(
-            (
-                block_size_m,
-                block_size_n,
-                block_size_k,
-                group_size_m,
-                config.num_stages,
-                config.num_warps,
-            )
-        )
-        yield triton.Config(
-            {
-                "BLOCK_SIZE_M": block_size_m,
-                "BLOCK_SIZE_N": block_size_n,
-                "BLOCK_SIZE_K": block_size_k,
-                "GROUP_SIZE_M": group_size_m,
-            },
-            num_stages=config.num_stages,
-            num_warps=config.num_warps,
-        )
--- a/server/text_generation_server/utils/gptq/exllama.py
+++ b/server/text_generation_server/utils/gptq/exllama.py
-import torch
-from exllama_kernels import make_q4, q4_matmul, prepare_buffers, set_tuning_params
-
-# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension
-none_tensor = torch.empty((1, 1), device="meta")
-
-
-def ext_make_q4(qweight, qzeros, scales, g_idx, device):
-    """Construct Q4Matrix, return handle"""
-    return make_q4(
-        qweight, qzeros, scales, g_idx if g_idx is not None else none_tensor, device
-    )
-
-
-def ext_q4_matmul(x, q4, q4_width):
-    """Matrix multiplication, returns x @ q4"""
-    outshape = x.shape[:-1] + (q4_width,)
-    x = x.view(-1, x.shape[-1])
-    output = torch.empty((x.shape[0], q4_width), dtype=torch.float16, device=x.device)
-
-    q4_matmul(x, q4, output)
-
-    return output.view(outshape)
-
-
-MAX_DQ = 1
-MAX_INNER = 1
-ACT_ORDER = False
-DEVICE = None
-
-TEMP_STATE = None
-TEMP_DQ = None
-
-
-def set_device(device):
-    global DEVICE
-    DEVICE = device
-
-
-def create_exllama_buffers(max_total_tokens: int):
-    global MAX_DQ, MAX_INNER, ACT_ORDER, DEVICE, TEMP_STATE, TEMP_DQ
-
-    assert DEVICE is not None, "call set_device first"
-
-    if not ACT_ORDER:
-        max_total_tokens = 1
-
-    # This temp_state buffer is required to reorder X in the act-order case.
-    temp_state = torch.zeros(
-        (max_total_tokens, MAX_INNER), dtype=torch.float16, device=DEVICE
-    )
-    temp_dq = torch.zeros((1, MAX_DQ), dtype=torch.float16, device=DEVICE)
-
-    # This temp_dq buffer is required to dequantize weights when using cuBLAS, typically for the prefill.
-    prepare_buffers(DEVICE, temp_state, temp_dq)
-
-    matmul_recons_thd = 8
-    matmul_fused_remap = False
-    matmul_no_half2 = False
-    set_tuning_params(matmul_recons_thd, matmul_fused_remap, matmul_no_half2)
-
-    TEMP_STATE, TEMP_DQ = temp_state, temp_dq
-
-
-class Ex4bitLinear(torch.nn.Module):
-    """Linear layer implementation with per-group 4-bit quantization of the weights"""
-
-    def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize):
-        super().__init__()
-        global MAX_DQ, MAX_INNER, ACT_ORDER, DEVICE
-        assert bits == 4
-
-        self.device = qweight.device
-        self.qweight = qweight
-        self.qzeros = qzeros
-        self.scales = scales
-        self.g_idx = g_idx.cpu() if g_idx is not None else None
-        self.bias = bias if bias is not None else None
-
-        if self.g_idx is not None and (
-            (self.g_idx == 0).all()
-            or torch.equal(
-                g_idx.cpu(),
-                torch.tensor(
-                    [i // groupsize for i in range(g_idx.shape[0])], dtype=torch.int32
-                ),
-            )
-        ):
-            self.empty_g_idx = True
-            self.g_idx = None
-
-        assert self.device.type == "cuda"
-        assert self.device.index is not None
-
-        self.q4 = ext_make_q4(
-            self.qweight, self.qzeros, self.scales, self.g_idx, self.device.index
-        )
-
-        self.height = qweight.shape[0] * 8
-        self.width = qweight.shape[1]
-
-        # Infer groupsize from height of qzeros
-        self.groupsize = None
-        if self.qzeros.shape[0] > 1:
-            self.groupsize = (self.qweight.shape[0] * 8) // (self.qzeros.shape[0])
-
-        if self.groupsize is not None:
-            assert groupsize == self.groupsize
-
-        # Handle act-order matrix
-        if self.g_idx is not None:
-            if self.groupsize is None:
-                raise ValueError("Found group index but no groupsize. What do?")
-            self.act_order = True
-        else:
-            self.act_order = False
-
-        DEVICE = self.qweight.device
-
-        MAX_DQ = max(MAX_DQ, self.qweight.numel() * 8)
-
-        if self.act_order:
-            MAX_INNER = max(MAX_INNER, self.height, self.width)
-
-            ACT_ORDER = True
-
-    def forward(self, x):
-        out = ext_q4_matmul(x, self.q4, self.width)
-
-        if self.bias is not None:
-            out.add_(self.bias)
-        return out
--- a/server/text_generation_server/utils/gptq/exllamav2.py
+++ b/server/text_generation_server/utils/gptq/exllamav2.py
-# Adapted from turboderp exllama: https://github.com/turboderp/exllamav2
-
-import torch
-import torch.nn as nn
-
-from loguru import logger
-
-try:
-    from exllamav2_kernels import make_q_matrix, gemm_half_q_half
-except ImportError:
-    logger.error("exllamav2_kernels not installed.")
-    raise
-
-# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension
-none_tensor = torch.empty((1, 1), device="meta")
-
-
-def ext_gemm_half_q_half(x, q_handle, q4_width, force_cuda):
-    """Matrix multiplication, returns x @ q4"""
-    output_shape = x.shape[:-1] + (q4_width,)
-    x = x.view(-1, x.shape[-1])
-    output = torch.empty((x.shape[0], q4_width), dtype=torch.half, device=x.device)
-    gemm_half_q_half(x, q_handle, output, force_cuda)
-    return output.view(output_shape)
-
-
-# Group map needed for irregular group sizes
-
-
-def make_group_map(q_groups, num_qrows):
-
-    gr = q_groups.tolist()
-    group_map = []
-    num_groups = len(gr) // 2
-
-    for i in range(num_groups):
-        bits = gr[i * 2]
-        if i < num_groups - 1:
-            qrows = gr[i * 2 + 3] - gr[i * 2 + 1]
-        else:
-            qrows = num_qrows - gr[i * 2 + 1]
-        rows = qrows * 32 // bits
-        for j in range(rows):
-            group_map += [i]
-            group_map += [rows - j]
-
-    return torch.tensor(group_map, dtype=torch.short, device=q_groups.device)
-
-
-# Create Q matrix
-
-
-def ext_make_q_matrix(w: dict, temp_dq, key: str = None):
-    """
-    Create Q matrix
-    """
-    # EXL2
-    # won't work as the moment because the tensors are not the same.
-    if "q_weight" in w:
-        w["q_scale_max"] /= 256
-        w["q_perm"] = w["q_perm"].short()
-        w["q_invperm"] = w["q_invperm"].short()
-
-        if "q_group_map" not in w:
-            w["q_group_map"] = make_group_map(w["q_groups"], w["q_weight"].shape[0])
-
-        return make_q_matrix(
-            w["q_weight"],
-            w["q_perm"],
-            w["q_invperm"],
-            w["q_scale"],
-            w["q_scale_max"],
-            w["q_groups"],
-            w["q_group_map"],
-            none_tensor,
-            none_tensor,
-            none_tensor,
-            temp_dq,
-        )
-    # GPTQ
-    elif "qweight" in w:
-        if w["scales"].dtype == torch.float:
-            w["scales"] = w["scales"].half()
-
-        # GPTQ with g_idx (act_order)
-        if w.get("g_idx", None) is not None and not (w["g_idx"] == 0).all().item():
-            w["q_perm"] = torch.empty(
-                (w["qweight"].shape[0] * 8,),
-                dtype=torch.short,
-                device=w["qweight"].device,
-            )
-            w["q_invperm"] = torch.empty_like(w["q_perm"])
-            # make_q4 segfaults if g_idx is not on cpu in the act-order case. In the non act-order case, None needs to be passed for g_idx.
-            return make_q_matrix(
-                w["qweight"],
-                w["q_perm"],
-                w["q_invperm"],
-                none_tensor,
-                none_tensor,
-                none_tensor,
-                none_tensor,
-                w["qzeros"],
-                w["scales"],
-                w["g_idx"].cpu(),
-                temp_dq,
-            )
-        # GPTQ without g_idx
-        else:
-            return make_q_matrix(
-                w["qweight"],
-                none_tensor,
-                none_tensor,
-                none_tensor,
-                none_tensor,
-                none_tensor,
-                none_tensor,
-                w["qzeros"],
-                w["scales"],
-                none_tensor,
-                temp_dq,
-            )
-
-
-DEVICE = None
-FIXED_BYTES = 0
-LAYERS = []
-
-
-def set_device(device):
-    global DEVICE
-    DEVICE = device
-
-
-def create_exllama_buffers(max_total_tokens: int):
-    global FIXED_BYTES, LAYERS, DEVICE
-    temp_dq = ExLlamaV2DeviceTensors(DEVICE, FIXED_BYTES)
-
-    for layer in LAYERS:
-        layer.post_init(temp_dq)
-
-
-class QuantLinear(nn.Module):
-    QUANT_TYPE = "exllamav2"
-
-    """Linear layer implementation with per-group 4-bit quantization of the weights"""
-
-    # def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=False, **kwargs):
-    def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize):
-        super().__init__()
-        if bits != 4:
-            raise ValueError(
-                f"Exllamav2 kernel supports only bits=4, requested bits={bits}. Something is wrong in the model initialization."
-            )
-        self.q_handle = None
-        self.q_tensors = None
-        self.bits = bits
-        self.maxq = 2**self.bits - 1
-        self.infeatures = qweight.shape[0] // self.bits * 32
-        self.outfeatures = qweight.shape[1]
-        self.padding = -self.outfeatures % 32
-        self.outfeatures = self.outfeatures + self.padding
-
-        self.device = qweight.device
-        self.qweight = qweight
-        self.qzeros = qzeros
-        self.scales = scales
-        self.g_idx = g_idx
-        self.bias = bias if bias is not None else None
-        self.group_size = groupsize
-
-        global FIXED_BYTES, LAYERS
-        FIXED_BYTES = max(FIXED_BYTES, self.scratch_space_fixed())
-        LAYERS.append(self)
-
-    def post_init(self, temp_dq):
-        assert self.qweight.device.type == "cuda"
-        assert self.qweight.device.index is not None
-        self.q_tensors = {
-            "qweight": self.qweight,
-            "qzeros": self.qzeros,
-            "scales": self.scales,
-            "g_idx": self.g_idx,
-        }
-        temp_dq = temp_dq.get_scratch_slice(self.temp_dq_size())
-
-        # We NEED to keep a pointer on Python side, otherwise the garbage collector will mess with us,
-        # and `Memory access fault by GPU node-2` will EAT you.
-        self.temp_dq = temp_dq
-        self.q_handle = ext_make_q_matrix(self.q_tensors, temp_dq)
-
-    def forward(self, x, force_cuda=False):
-        output = ext_gemm_half_q_half(x, self.q_handle, self.outfeatures, force_cuda)
-
-        if self.bias is not None:
-            output.add_(self.bias)
-        return output
-
-    def temp_dq_size(self):
-        return self.infeatures * self.outfeatures * 2 + 128
-
-    def temp_fwd_size(self, max_input_len, max_batch_size):
-        return self.outfeatures * max_input_len * max_batch_size * 4 + 128
-
-    def scratch_space_fixed(self, max_input_len=4096, max_batch_size=16):
-        return self.temp_dq_size() + self.temp_fwd_size(max_input_len, max_batch_size)
-
-
-class ExLlamaV2DeviceTensors:
-
-    device_idx: int
-    scratch_bytes: int
-    scratch_idx: int
-    scratch: torch.tensor = None
-
-    def __init__(self, device, scratch_bytes):
-        self.device = device
-        self.scratch_bytes = scratch_bytes
-
-    def prepare(self):
-        self.scratch = torch.empty(
-            (self.scratch_bytes // 2,), dtype=torch.half, device=self.device
-        )
-
-    def get_scratch_slice(self, size_bytes):
-
-        if self.scratch is None:
-            self.prepare()
-
-        size_bytes = ((size_bytes + 127) // 128) * 128
-        size_half = size_bytes // 2
-        scratch_slice = self.scratch.narrow(0, 0, size_half)
-        return scratch_slice
--- a/server/text_generation_server/utils/gptq/quant_linear.py
+++ b/server/text_generation_server/utils/gptq/quant_linear.py
-import math
-import numpy as np
-import torch
-import torch.nn as nn
-from torch.cuda.amp import custom_bwd, custom_fwd
-
-try:
-    import triton
-    import triton.language as tl
-    from . import custom_autotune
-
-    # code based https://github.com/fpgaminer/GPTQ-triton
-    @custom_autotune.autotune(
-        configs=[
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 64,
-                    "BLOCK_SIZE_N": 256,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 128,
-                    "BLOCK_SIZE_N": 128,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 64,
-                    "BLOCK_SIZE_N": 128,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 128,
-                    "BLOCK_SIZE_N": 32,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 64,
-                    "BLOCK_SIZE_N": 64,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 64,
-                    "BLOCK_SIZE_N": 128,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=2,
-                num_warps=8,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 64,
-                    "BLOCK_SIZE_N": 64,
-                    "BLOCK_SIZE_K": 64,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=3,
-                num_warps=8,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 32,
-                    "BLOCK_SIZE_N": 32,
-                    "BLOCK_SIZE_K": 128,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=2,
-                num_warps=4,
-            ),
-        ],
-        key=["M", "N", "K"],
-        nearest_power_of_two=True,
-        prune_configs_by={
-            "early_config_prune": custom_autotune.matmul248_kernel_config_pruner,
-            "perf_model": None,
-            "top_k": None,
-        },
-    )
-    @triton.jit
-    def matmul_248_kernel(
-        a_ptr,
-        b_ptr,
-        c_ptr,
-        scales_ptr,
-        zeros_ptr,
-        g_ptr,
-        M,
-        N,
-        K,
-        bits,
-        maxq,
-        stride_am,
-        stride_ak,
-        stride_bk,
-        stride_bn,
-        stride_cm,
-        stride_cn,
-        stride_scales,
-        stride_zeros,
-        BLOCK_SIZE_M: tl.constexpr,
-        BLOCK_SIZE_N: tl.constexpr,
-        BLOCK_SIZE_K: tl.constexpr,
-        GROUP_SIZE_M: tl.constexpr,
-    ):
-        """
-        Compute the matrix multiplication C = A x B.
-        A is of shape (M, K) float16
-        B is of shape (K//8, N) int32
-        C is of shape (M, N) float16
-        scales is of shape (G, N) float16
-        zeros is of shape (G, N) float16
-        g_ptr is of shape (K) int32
-        """
-        infearure_per_bits = 32 // bits
-
-        pid = tl.program_id(axis=0)
-        num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-        num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
-        num_pid_k = tl.cdiv(K, BLOCK_SIZE_K)
-        num_pid_in_group = GROUP_SIZE_M * num_pid_n
-        group_id = pid // num_pid_in_group
-        first_pid_m = group_id * GROUP_SIZE_M
-        group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-        pid_m = first_pid_m + (pid % group_size_m)
-        pid_n = (pid % num_pid_in_group) // group_size_m
-
-        offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-        offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-        offs_k = tl.arange(0, BLOCK_SIZE_K)
-        a_ptrs = a_ptr + (
-            offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak
-        )  # (BLOCK_SIZE_M, BLOCK_SIZE_K)
-        a_mask = offs_am[:, None] < M
-        # b_ptrs is set up such that it repeats elements along the K axis 8 times
-        b_ptrs = b_ptr + (
-            (offs_k[:, None] // infearure_per_bits) * stride_bk
-            + offs_bn[None, :] * stride_bn
-        )  # (BLOCK_SIZE_K, BLOCK_SIZE_N)
-        g_ptrs = g_ptr + offs_k
-        # shifter is used to extract the N bits of each element in the 32-bit word from B
-        scales_ptrs = scales_ptr + offs_bn[None, :]
-        zeros_ptrs = zeros_ptr + (offs_bn[None, :] // infearure_per_bits)
-
-        shifter = (offs_k % infearure_per_bits) * bits
-        zeros_shifter = (offs_bn % infearure_per_bits) * bits
-        accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-
-        for k in range(0, num_pid_k):
-            g_idx = tl.load(g_ptrs)
-
-            # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop
-            scales = tl.load(
-                scales_ptrs + g_idx[:, None] * stride_scales
-            )  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
-            zeros = tl.load(
-                zeros_ptrs + g_idx[:, None] * stride_zeros
-            )  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
-
-            zeros = (zeros >> zeros_shifter[None, :]) & maxq
-            zeros = (zeros + 1) & maxq  # eventually avoid overflow
-
-            a = tl.load(a_ptrs, mask=a_mask, other=0.0)  # (BLOCK_SIZE_M, BLOCK_SIZE_K)
-            b = tl.load(b_ptrs)  # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated
-
-            # Now we need to unpack b (which is N-bit values) into 32-bit values
-            b = (b >> shifter[:, None]) & maxq  # Extract the N-bit values
-            b = (b - zeros) * scales  # Scale and shift
-
-            accumulator += tl.dot(a, b)
-            a_ptrs += BLOCK_SIZE_K
-            b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk
-            g_ptrs += BLOCK_SIZE_K
-
-        c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
-        c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N)
-        tl.store(c_ptrs, accumulator, mask=c_mask)
-
-except:
-    print("triton not installed.")
-
-
-def matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq):
-    with torch.cuda.device(input.device):
-        output = torch.empty(
-            (input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16
-        )
-        grid = lambda META: (
-            triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"])
-            * triton.cdiv(qweight.shape[1], META["BLOCK_SIZE_N"]),
-        )
-        matmul_248_kernel[grid](
-            input,
-            qweight,
-            output,
-            scales,
-            qzeros,
-            g_idx,
-            input.shape[0],
-            qweight.shape[1],
-            input.shape[1],
-            bits,
-            maxq,
-            input.stride(0),
-            input.stride(1),
-            qweight.stride(0),
-            qweight.stride(1),
-            output.stride(0),
-            output.stride(1),
-            scales.stride(0),
-            qzeros.stride(0),
-        )
-        return output
-
-
-class QuantLinearFunction(torch.autograd.Function):
-    @staticmethod
-    @custom_fwd(cast_inputs=torch.float16)
-    def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
-        output = matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq)
-        return output
-
-
-class QuantLinear(nn.Module):
-    def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize):
-        super().__init__()
-        self.register_buffer("qweight", qweight)
-        self.register_buffer("qzeros", qzeros)
-        self.register_buffer("scales", scales)
-        self.register_buffer("g_idx", g_idx)
-        if bias is not None:
-            self.register_buffer("bias", bias)
-        else:
-            self.bias = None
-        if bits not in [2, 4, 8]:
-            raise NotImplementedError("Only 2,4,8 bits are supported.")
-        self.bits = bits
-        self.maxq = 2**self.bits - 1
-        self.groupsize = groupsize
-
-        self.outfeatures = qweight.shape[1]
-        self.infeatures = qweight.shape[0] * 32 // bits
-
-    @classmethod
-    def new(cls, bits, groupsize, infeatures, outfeatures, bias):
-        if bits not in [2, 4, 8]:
-            raise NotImplementedError("Only 2,4,8 bits are supported.")
-
-        qweight = torch.zeros((infeatures // 32 * bits, outfeatures), dtype=torch.int32)
-        qzeros = torch.zeros(
-            (math.ceil(infeatures / groupsize), outfeatures // 32 * bits),
-            dtype=torch.int32,
-        )
-        scales = torch.zeros(
-            (math.ceil(infeatures / groupsize), outfeatures), dtype=torch.float16
-        )
-        g_idx = torch.tensor(
-            [i // groupsize for i in range(infeatures)], dtype=torch.int32
-        )
-        if bias:
-            bias = torch.zeros((outfeatures), dtype=torch.float16)
-        else:
-            bias = None
-        return cls(qweight, qzeros, scales, g_idx, bias, bits, groupsize)
-
-    def pack(self, linear, scales, zeros, g_idx=None):
-        self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx
-
-        scales = scales.t().contiguous()
-        zeros = zeros.t().contiguous()
-        scale_zeros = zeros * scales
-        self.scales = scales.clone().half()
-        if linear.bias is not None:
-            self.bias = linear.bias.clone().half()
-
-        intweight = []
-        for idx in range(self.infeatures):
-            intweight.append(
-                torch.round(
-                    (linear.weight.data[:, idx] + scale_zeros[self.g_idx[idx]])
-                    / self.scales[self.g_idx[idx]]
-                ).to(torch.int)[:, None]
-            )
-        intweight = torch.cat(intweight, dim=1)
-        intweight = intweight.t().contiguous()
-        intweight = intweight.numpy().astype(np.uint32)
-        qweight = np.zeros(
-            (intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32
-        )
-        i = 0
-        row = 0
-        while row < qweight.shape[0]:
-            if self.bits in [2, 4, 8]:
-                for j in range(i, i + (32 // self.bits)):
-                    qweight[row] |= intweight[j] << (self.bits * (j - i))
-                i += 32 // self.bits
-                row += 1
-            else:
-                raise NotImplementedError("Only 2,4,8 bits are supported.")
-
-        qweight = qweight.astype(np.int32)
-        self.qweight = torch.from_numpy(qweight)
-
-        zeros -= 1
-        zeros = zeros.numpy().astype(np.uint32)
-        qzeros = np.zeros(
-            (zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32
-        )
-        i = 0
-        col = 0
-        while col < qzeros.shape[1]:
-            if self.bits in [2, 4, 8]:
-                for j in range(i, i + (32 // self.bits)):
-                    qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
-                i += 32 // self.bits
-                col += 1
-            else:
-                raise NotImplementedError("Only 2,4,8 bits are supported.")
-
-        qzeros = qzeros.astype(np.int32)
-        self.qzeros = torch.from_numpy(qzeros)
-
-    def forward(self, x):
-        out_shape = x.shape[:-1] + (self.outfeatures,)
-        out = QuantLinearFunction.apply(
-            x.reshape(-1, x.shape[-1]),
-            self.qweight,
-            self.scales,
-            self.qzeros,
-            self.g_idx,
-            self.bits,
-            self.maxq,
-        )
-        out = out + self.bias if self.bias is not None else out
-        return out.reshape(out_shape)
--- a/server/text_generation_server/utils/gptq/quantize.py
+++ b/server/text_generation_server/utils/gptq/quantize.py
-import time
-import torch.nn as nn
-import math
-import json
-import os
-import torch
-import transformers
-
-from texttable import Texttable
-from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer
-from huggingface_hub import HfApi
-from accelerate import init_empty_weights
-from text_generation_server.utils import initialize_torch_distributed, Weights
-from text_generation_server.utils.hub import weight_files
-from text_generation_server.utils.gptq.quant_linear import QuantLinear
-from loguru import logger
-from typing import Optional
-
-DEV = torch.device("cuda:0")
-
-
-class Quantizer(nn.Module):
-    def __init__(self, shape=1):
-        super(Quantizer, self).__init__()
-        self.register_buffer("maxq", torch.tensor(0))
-        self.register_buffer("scale", torch.zeros(shape))
-        self.register_buffer("zero", torch.zeros(shape))
-
-    def configure(
-        self,
-        bits,
-        perchannel=False,
-        sym=True,
-        mse=False,
-        norm=2.4,
-        grid=100,
-        maxshrink=0.8,
-        trits=False,
-    ):
-        self.maxq = torch.tensor(2**bits - 1)
-        self.perchannel = perchannel
-        self.sym = sym
-        self.mse = mse
-        self.norm = norm
-        self.grid = grid
-        self.maxshrink = maxshrink
-        if trits:
-            self.maxq = torch.tensor(-1)
-        self.scale = torch.zeros_like(self.scale)
-
-    def _quantize(self, x, scale, zero, maxq):
-        if maxq < 0:
-            return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero
-        q = torch.clamp(torch.round(x / scale) + zero, 0, maxq)
-        return scale * (q - zero)
-
-    def find_params(self, x, weight=False):
-        dev = x.device
-        self.maxq = self.maxq.to(dev)
-
-        shape = x.shape
-        if self.perchannel:
-            if weight:
-                x = x.flatten(1)
-            else:
-                if len(shape) == 4:
-                    x = x.permute([1, 0, 2, 3])
-                    x = x.flatten(1)
-                if len(shape) == 3:
-                    x = x.reshape((-1, shape[-1])).t()
-                if len(shape) == 2:
-                    x = x.t()
-        else:
-            x = x.flatten().unsqueeze(0)
-
-        tmp = torch.zeros(x.shape[0], device=dev)
-        xmin = torch.minimum(x.min(1)[0], tmp)
-        xmax = torch.maximum(x.max(1)[0], tmp)
-
-        if self.sym:
-            xmax = torch.maximum(torch.abs(xmin), xmax)
-            tmp = xmin < 0
-            if torch.any(tmp):
-                xmin[tmp] = -xmax[tmp]
-        tmp = (xmin == 0) & (xmax == 0)
-        xmin[tmp] = -1
-        xmax[tmp] = +1
-
-        if self.maxq < 0:
-            self.scale = xmax
-            self.zero = xmin
-        else:
-            self.scale = (xmax - xmin) / self.maxq
-            if self.sym:
-                self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2)
-            else:
-                self.zero = torch.round(-xmin / self.scale)
-
-        if self.mse:
-            best = torch.full([x.shape[0]], float("inf"), device=dev)
-            for i in range(int(self.maxshrink * self.grid)):
-                p = 1 - i / self.grid
-                xmin1 = p * xmin
-                xmax1 = p * xmax
-                scale1 = (xmax1 - xmin1) / self.maxq
-                zero1 = torch.round(-xmin1 / scale1) if not self.sym else self.zero
-                q = self._quantize(
-                    x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq
-                )
-                q -= x
-                q.abs_()
-                q.pow_(self.norm)
-                err = torch.sum(q, 1)
-                tmp = err < best
-                if torch.any(tmp):
-                    best[tmp] = err[tmp]
-                    self.scale[tmp] = scale1[tmp]
-                    self.zero[tmp] = zero1[tmp]
-        if not self.perchannel:
-            if weight:
-                tmp = shape[0]
-            else:
-                tmp = shape[1] if len(shape) != 3 else shape[2]
-            self.scale = self.scale.repeat(tmp)
-            self.zero = self.zero.repeat(tmp)
-
-        if weight:
-            shape = [-1] + [1] * (len(shape) - 1)
-            self.scale = self.scale.reshape(shape)
-            self.zero = self.zero.reshape(shape)
-            return
-        if len(shape) == 4:
-            self.scale = self.scale.reshape((1, -1, 1, 1))
-            self.zero = self.zero.reshape((1, -1, 1, 1))
-        if len(shape) == 3:
-            self.scale = self.scale.reshape((1, 1, -1))
-            self.zero = self.zero.reshape((1, 1, -1))
-        if len(shape) == 2:
-            self.scale = self.scale.unsqueeze(0)
-            self.zero = self.zero.unsqueeze(0)
-
-    def quantize(self, x):
-        if self.ready():
-            return self._quantize(x, self.scale, self.zero, self.maxq)
-
-        return x
-
-    def enabled(self):
-        return self.maxq > 0
-
-    def ready(self):
-        return torch.all(self.scale != 0)
-
-
-class GPTQ:
-    def __init__(self, layer, observe=False):
-        self.layer = layer
-        self.dev = self.layer.weight.device
-        W = layer.weight.data.clone()
-        if isinstance(self.layer, nn.Conv2d):
-            W = W.flatten(1)
-        if isinstance(self.layer, transformers.Conv1D):
-            W = W.t()
-        self.rows = W.shape[0]
-        self.columns = W.shape[1]
-        self.H = torch.zeros((self.columns, self.columns), device=self.dev)
-        self.nsamples = 0
-        self.quantizer = Quantizer()
-        self.observe = observe
-
-    def add_batch(self, inp, out):
-        # Hessian H = 2 X XT + λ I
-        if self.observe:
-            self.inp1 = inp
-            self.out1 = out
-        else:
-            self.inp1 = None
-            self.out1 = None
-
-        if len(inp.shape) == 2:
-            inp = inp.unsqueeze(0)
-        tmp = inp.shape[0]
-        if isinstance(self.layer, nn.Linear) or isinstance(
-            self.layer, transformers.Conv1D
-        ):
-            if len(inp.shape) == 3:
-                inp = inp.reshape((-1, inp.shape[-1]))
-            inp = inp.t()
-        if isinstance(self.layer, nn.Conv2d):
-            unfold = nn.Unfold(
-                self.layer.kernel_size,
-                dilation=self.layer.dilation,
-                padding=self.layer.padding,
-                stride=self.layer.stride,
-            )
-            inp = unfold(inp)
-            inp = inp.permute([1, 0, 2])
-            inp = inp.flatten(1)
-        self.H *= self.nsamples / (self.nsamples + tmp)
-        self.nsamples += tmp
-        # inp = inp.float()
-        inp = math.sqrt(2 / self.nsamples) * inp.float()
-        # self.H += 2 / self.nsamples * inp.matmul(inp.t())
-        self.H += inp.matmul(inp.t())
-
-    def print_loss(self, name, q_weight, weight_error, timecost):
-        table = Texttable()
-        length = 28
-        name = (
-            (name + " " * (length - len(name)))
-            if len(name) <= length
-            else name[:length]
-        )
-
-        table.header(["name", "weight_error", "fp_inp_SNR", "q_inp_SNR", "time"])
-
-        # assign weight
-        self.layer.weight.data = q_weight.reshape(self.layer.weight.shape).to(
-            self.layer.weight.data.dtype
-        )
-
-        if self.inp1 is not None:
-            # quantize input to int8
-            quantizer = Quantizer()
-            quantizer.configure(8, perchannel=False, sym=True, mse=False)
-            quantizer.find_params(self.inp1)
-            q_in = quantizer.quantize(self.inp1).type(torch.float16)
-            q_out = self.layer(q_in)
-
-            # get kinds of SNR
-            q_SNR = torch_snr_error(q_out, self.out1).item()
-            fp_SNR = torch_snr_error(self.layer(self.inp1), self.out1).item()
-        else:
-            q_SNR = "-"
-            fp_SNR = "-"
-
-        table.add_row([name, weight_error, fp_SNR, q_SNR, timecost])
-        print(table.draw().split("\n")[-2])
-
-    def fasterquant(
-        self, blocksize=128, percdamp=0.01, groupsize=-1, act_order=False, name=""
-    ):
-        self.layer.to(self.dev)
-
-        W = self.layer.weight.data.clone()
-        if isinstance(self.layer, nn.Conv2d):
-            W = W.flatten(1)
-        if isinstance(self.layer, transformers.Conv1D):
-            W = W.t()
-        W = W.float()
-
-        tick = time.time()
-
-        if not self.quantizer.ready():
-            self.quantizer.find_params(W, weight=True)
-
-        H = self.H
-        if not self.observe:
-            del self.H
-        dead = torch.diag(H) == 0
-        H[dead, dead] = 1
-        W[:, dead] = 0
-
-        if act_order:
-            perm = torch.argsort(torch.diag(H), descending=True)
-            W = W[:, perm]
-            H = H[perm][:, perm]
-
-        Losses = torch.zeros_like(W)
-        Q = torch.zeros_like(W)
-
-        damp = percdamp * torch.mean(torch.diag(H))
-        diag = torch.arange(self.columns, device=self.dev)
-        H[diag, diag] += damp
-        H = torch.linalg.cholesky(H)
-        H = torch.cholesky_inverse(H)
-        try:
-            H = torch.linalg.cholesky(H, upper=True)
-        except Exception:
-            # Addition because Falcon fails on h_to_4h
-            H = torch.linalg.cholesky(
-                H + 1e-5 * torch.eye(H.shape[0]).to(H.device), upper=True
-            )
-        Hinv = H
-
-        g_idx = []
-        scale = []
-        zero = []
-        now_idx = 1
-
-        for i1 in range(0, self.columns, blocksize):
-            i2 = min(i1 + blocksize, self.columns)
-            count = i2 - i1
-
-            W1 = W[:, i1:i2].clone()
-            Q1 = torch.zeros_like(W1)
-            Err1 = torch.zeros_like(W1)
-            Losses1 = torch.zeros_like(W1)
-            Hinv1 = Hinv[i1:i2, i1:i2]
-
-            for i in range(count):
-                w = W1[:, i]
-                d = Hinv1[i, i]
-
-                if groupsize != -1:
-                    if (i1 + i) % groupsize == 0:
-                        self.quantizer.find_params(
-                            W[:, (i1 + i) : (i1 + i + groupsize)], weight=True
-                        )
-
-                    if ((i1 + i) // groupsize) - now_idx == -1:
-                        scale.append(self.quantizer.scale)
-                        zero.append(self.quantizer.zero)
-                        now_idx += 1
-
-                q = self.quantizer.quantize(w.unsqueeze(1)).flatten()
-                Q1[:, i] = q
-                Losses1[:, i] = (w - q) ** 2 / d**2
-
-                err1 = (w - q) / d
-                W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0))
-                Err1[:, i] = err1
-
-            Q[:, i1:i2] = Q1
-            Losses[:, i1:i2] = Losses1 / 2
-
-            W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:])
-
-        torch.cuda.synchronize()
-        error = torch.sum(Losses).item()
-
-        groupsize = groupsize if groupsize != -1 else self.columns
-        g_idx = [i // groupsize for i in range(self.columns)]
-        g_idx = torch.tensor(g_idx, dtype=torch.int32, device=Q.device)
-        if act_order:
-            invperm = torch.argsort(perm)
-            Q = Q[:, invperm]
-            g_idx = g_idx[invperm]
-
-        if isinstance(self.layer, transformers.Conv1D):
-            Q = Q.t()
-
-        self.print_loss(
-            name=name, q_weight=Q, weight_error=error, timecost=(time.time() - tick)
-        )
-
-        if scale == []:
-            scale.append(self.quantizer.scale)
-            zero.append(self.quantizer.zero)
-        scale = torch.cat(scale, dim=1)
-        zero = torch.cat(zero, dim=1)
-        return scale, zero, g_idx, error
-
-    def free(self):
-        self.inp1 = None
-        self.out1 = None
-        self.H = None
-        self.Losses = None
-        self.Trace = None
-        torch.cuda.empty_cache()
-
-
-def get_wikitext2(nsamples, seed, seqlen, model_id, trust_remote_code):
-    from datasets import load_dataset
-
-    traindata = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
-    testdata = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
-
-    try:
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id, use_fast=False, trust_remote_code=trust_remote_code
-        )
-    except:
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id, use_fast=True, trust_remote_code=trust_remote_code
-        )
-
-    trainenc = tokenizer("\n\n".join(traindata["text"]), return_tensors="pt")
-    testenc = tokenizer("\n\n".join(testdata["text"]), return_tensors="pt")
-
-    import random
-
-    random.seed(seed)
-    trainloader = []
-    for _ in range(nsamples):
-        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
-        j = i + seqlen
-        inp = trainenc.input_ids[:, i:j]
-        tar = inp.clone()
-        tar[:, :-1] = -100
-        trainloader.append((inp, tar))
-    return trainloader, testenc
-
-
-def get_ptb(nsamples, seed, seqlen, model_id, trust_remote_code):
-    from datasets import load_dataset
-
-    traindata = load_dataset("ptb_text_only", "penn_treebank", split="train")
-    valdata = load_dataset("ptb_text_only", "penn_treebank", split="validation")
-
-    try:
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id, use_fast=False, trust_remote_code=trust_remote_code
-        )
-    except:
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id, use_fast=True, trust_remote_code=trust_remote_code
-        )
-
-    trainenc = tokenizer("\n\n".join(traindata["sentence"]), return_tensors="pt")
-    testenc = tokenizer("\n\n".join(valdata["sentence"]), return_tensors="pt")
-
-    import random
-
-    random.seed(seed)
-    trainloader = []
-    for _ in range(nsamples):
-        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
-        j = i + seqlen
-        inp = trainenc.input_ids[:, i:j]
-        tar = inp.clone()
-        tar[:, :-1] = -100
-        trainloader.append((inp, tar))
-    return trainloader, testenc
-
-
-def get_c4(nsamples, seed, seqlen, model_id, trust_remote_code):
-    from datasets import load_dataset
-
-    traindata = load_dataset(
-        "allenai/c4",
-        "allenai--c4",
-        data_files={"train": "en/c4-train.00000-of-01024.json.gz"},
-        split="train",
-        use_auth_token=False,
-    )
-    valdata = load_dataset(
-        "allenai/c4",
-        "allenai--c4",
-        data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"},
-        split="validation",
-        use_auth_token=False,
-    )
-
-    try:
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id, use_fast=False, trust_remote_code=trust_remote_code
-        )
-    except:
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id, use_fast=True, trust_remote_code=trust_remote_code
-        )
-
-    import random
-
-    random.seed(seed)
-    trainloader = []
-    for _ in range(nsamples):
-        while True:
-            i = random.randint(0, len(traindata) - 1)
-            trainenc = tokenizer(traindata[i]["text"], return_tensors="pt")
-            if trainenc.input_ids.shape[1] >= seqlen:
-                break
-        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
-        j = i + seqlen
-        inp = trainenc.input_ids[:, i:j]
-        tar = inp.clone()
-        tar[:, :-1] = -100
-        trainloader.append((inp, tar))
-
-    import random
-
-    random.seed(0)
-    valenc = []
-    for _ in range(256):
-        while True:
-            i = random.randint(0, len(valdata) - 1)
-            tmp = tokenizer(valdata[i]["text"], return_tensors="pt")
-            if tmp.input_ids.shape[1] >= seqlen:
-                break
-        i = random.randint(0, tmp.input_ids.shape[1] - seqlen - 1)
-        j = i + seqlen
-        valenc.append(tmp.input_ids[:, i:j])
-    valenc = torch.hstack(valenc)
-
-    class TokenizerWrapper:
-        def __init__(self, input_ids):
-            self.input_ids = input_ids
-
-    valenc = TokenizerWrapper(valenc)
-
-    return trainloader, valenc
-
-
-def get_ptb_new(nsamples, seed, seqlen, model_id, trust_remote_code):
-    from datasets import load_dataset
-
-    traindata = load_dataset("ptb_text_only", "penn_treebank", split="train")
-    testdata = load_dataset("ptb_text_only", "penn_treebank", split="test")
-
-    try:
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id, use_fast=False, trust_remote_code=trust_remote_code
-        )
-    except:
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id, use_fast=True, trust_remote_code=trust_remote_code
-        )
-
-    trainenc = tokenizer(" ".join(traindata["sentence"]), return_tensors="pt")
-    testenc = tokenizer(" ".join(testdata["sentence"]), return_tensors="pt")
-
-    import random
-
-    random.seed(seed)
-    trainloader = []
-    for _ in range(nsamples):
-        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
-        j = i + seqlen
-        inp = trainenc.input_ids[:, i:j]
-        tar = inp.clone()
-        tar[:, :-1] = -100
-        trainloader.append((inp, tar))
-    return trainloader, testenc
-
-
-def get_c4_new(nsamples, seed, seqlen, model_id, trust_remote_code):
-    from datasets import load_dataset
-
-    traindata = load_dataset(
-        "allenai/c4",
-        "allenai--c4",
-        data_files={"train": "en/c4-train.00000-of-01024.json.gz"},
-        split="train",
-    )
-    valdata = load_dataset(
-        "allenai/c4",
-        "allenai--c4",
-        data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"},
-        split="validation",
-    )
-
-    try:
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id, use_fast=False, trust_remote_code=trust_remote_code
-        )
-    except:
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id, use_fast=True, trust_remote_code=trust_remote_code
-        )
-
-    import random
-
-    random.seed(seed)
-    trainloader = []
-    for _ in range(nsamples):
-        while True:
-            i = random.randint(0, len(traindata) - 1)
-            trainenc = tokenizer(traindata[i]["text"], return_tensors="pt")
-            if trainenc.input_ids.shape[1] >= seqlen:
-                break
-        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
-        j = i + seqlen
-        inp = trainenc.input_ids[:, i:j]
-        tar = inp.clone()
-        tar[:, :-1] = -100
-        trainloader.append((inp, tar))
-
-    valenc = tokenizer(" ".join(valdata[:1100]["text"]), return_tensors="pt")
-    valenc = valenc.input_ids[:, : (256 * seqlen)]
-
-    class TokenizerWrapper:
-        def __init__(self, input_ids):
-            self.input_ids = input_ids
-
-    valenc = TokenizerWrapper(valenc)
-
-    return trainloader, valenc
-
-
-def get_loaders(
-    name, nsamples=128, seed=0, seqlen=2048, model_id="", trust_remote_code=False
-):
-    if "wikitext2" in name:
-        return get_wikitext2(nsamples, seed, seqlen, model_id, trust_remote_code)
-    if "ptb" in name:
-        if "new" in name:
-            return get_ptb_new(nsamples, seed, seqlen, model_id, trust_remote_code)
-        return get_ptb(nsamples, seed, seqlen, model_id, trust_remote_code)
-    if "c4" in name:
-        if "new" in name:
-            return get_c4_new(nsamples, seed, seqlen, model_id, trust_remote_code)
-        return get_c4(nsamples, seed, seqlen, model_id, trust_remote_code)
-
-
-def find_layers(module, layers=(nn.Conv2d, nn.Linear), name=""):
-    # Skip last lm_head linear
-    # Need isintance Falcon is inheriting Linear.
-    if isinstance(module, layers) and "lm_head" not in name:
-        return {name: module}
-    res = {}
-    for name1, child in module.named_children():
-        res.update(
-            find_layers(
-                child, layers=layers, name=name + "." + name1 if name != "" else name1
-            )
-        )
-    return res
-
-
-@torch.no_grad()
-def sequential(
-    model,
-    dataloader,
-    dev,
-    nsamples,
-    bits,
-    groupsize,
-    *,
-    hooks,
-    percdamp=0.01,
-    sym: bool = False,
-    act_order: bool = False,
-):
-    """Run GPTQ over the model one layer at a time.
-
-    The inputs reaching the first layer are captured by feeding the
-    calibration batches through the model. Each layer is then loaded, the
-    sub-modules returned by ``find_layers`` are quantized, and the layer's
-    outputs become the inputs of the following layer.
-
-    Args:
-        model: model exposing ``model.layers`` or ``transformer.h``,
-            ``config`` and ``seqlen``.
-        dataloader: iterable of batches; only ``batch[0]`` is used.
-        dev: device on which the activation buffers are allocated.
-        nsamples: number of calibration samples to store.
-        bits: bit width handed to the quantizer.
-        groupsize: group size handed to ``fasterquant``.
-        hooks: handles that are removed once the first-layer inputs have
-            been captured.
-        percdamp: forwarded to ``fasterquant``.
-        sym: forwarded to the quantizer configuration.
-        act_order: forwarded to ``fasterquant``.
-
-    Returns:
-        Dict mapping ``"{prefix}.{i}.{name}"`` to the tuple
-        ``(quantizer, scale, zero, g_idx, bits, groupsize)``.
-    """
-    print("Starting ...")
-
-    # Remember the cache setting so it can be restored before returning.
-    use_cache = model.config.use_cache
-    model.config.use_cache = False
-    # Two layouts are supported: `model.model.layers` and `model.transformer.h`.
-    try:
-        layers = model.model.layers
-        prefix = "model.layers"
-    except Exception:
-        layers = model.transformer.h
-        prefix = "transformer.h"
-
-    dtype = next(iter(model.parameters())).dtype
-    inps = torch.zeros(
-        (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
-    )
-
-    # `cache["i"]` counts captured samples; `extra` collects the keyword
-    # arguments the first layer was called with.
-    cache = {"i": 0}
-    extra = {}
-
-    class Catcher(nn.Module):
-        """Stand-in for the first layer that records its input and aborts."""
-
-        def __init__(self, module):
-            super().__init__()
-            self.module = module
-
-        def forward(self, inp, **kwargs):
-            inps[cache["i"]] = inp
-            cache["i"] += 1
-            extra.update(kwargs.copy())
-            # Abort the forward pass; the caller below catches this.
-            raise ValueError
-
-    layers[0] = Catcher(layers[0])
-    for batch in dataloader:
-        try:
-            # NOTE(review): uses `.cuda()` rather than `dev` — confirm intended.
-            model(batch[0].cuda())
-        except ValueError:
-            pass
-    # Put the real first layer back.
-    layers[0] = layers[0].module
-
-    # layers[0] = layers[0].cpu()
-    # model.model.embed_tokens = model.model.embed_tokens.cpu()
-    # model.model.norm = model.model.norm.cpu()
-    torch.cuda.empty_cache()
-    for hook in hooks:
-        hook.remove()
-
-    outs = torch.zeros_like(inps)
-
-    # Move any tensor keyword arguments onto `dev`.
-    extra = {
-        k: v.to(dev) if isinstance(v, torch.Tensor) else v for k, v in extra.items()
-    }
-
-    print("Ready.")
-
-    quantizers = {}
-    for i in range(len(layers)):
-        print(f"Quantizing layer {i+1}/{len(layers)}..")
-        print("+------------------+--------------+------------+-----------+-------+")
-        print("|       name       | weight_error | fp_inp_SNR | q_inp_SNR | time  |")
-        print("+==================+==============+============+===========+=======+")
-
-        layer = layers[i]
-        layer.load()
-        full = find_layers(layer)
-        # Local list that shadows this function's name: a single group
-        # containing every sub-module found in the layer.
-        sequential = [list(full.keys())]
-
-        for names in sequential:
-            subset = {n: full[n] for n in names}
-            gptq = {}
-            for name in subset:
-                gptq[name] = GPTQ(subset[name])
-                gptq[name].quantizer.configure(
-                    bits, perchannel=True, sym=sym, mse=False
-                )
-                pass
-
-            def add_batch(name):
-                # Returns a forward hook that feeds the sub-module's input
-                # and output to the matching GPTQ instance.
-                def tmp(_, inp, out):
-                    gptq[name].add_batch(inp[0].data, out.data)
-
-                return tmp
-
-            handles = []
-            for name in subset:
-                handles.append(subset[name].register_forward_hook(add_batch(name)))
-            # Forward every sample so the hooks can observe the activations.
-            for j in range(nsamples):
-                outs[j] = layer(inps[j].unsqueeze(0), **extra)[0]
-            for h in handles:
-                h.remove()
-
-            for name in subset:
-                scale, zero, g_idx, error = gptq[name].fasterquant(
-                    percdamp=percdamp,
-                    groupsize=groupsize,
-                    act_order=act_order,
-                    name=name,
-                )
-                quantizers[f"{prefix}.{i}.{name}"] = (
-                    gptq[name].quantizer.cpu(),
-                    scale.cpu(),
-                    zero.cpu(),
-                    g_idx.cpu(),
-                    bits,
-                    groupsize,
-                )
-
-                gptq[name].free()
-
-        # Second pass over the samples, run after `fasterquant` and with the
-        # hooks removed; its outputs feed the next layer.
-        for j in range(nsamples):
-            outs[j] = layer(inps[j].unsqueeze(0), **extra)[0]
-
-        layer.unload()
-        del layer
-        del gptq
-        torch.cuda.empty_cache()
-
-        # This layer's outputs become the next layer's inputs.
-        inps, outs = outs, inps
-        print("+------------------+--------------+------------+-----------+-------+")
-        print("\n")
-
-    model.config.use_cache = use_cache
-
-    return quantizers
-
-
-def make_quant_linear(module, names, bits, groupsize, name=""):
-    if isinstance(module, QuantLinear):
-        return
-    for attr in dir(module):
-        tmp = getattr(module, attr)
-        name1 = name + "." + attr if name != "" else attr
-        if name1 in names:
-            delattr(module, attr)
-            setattr(
-                module,
-                attr,
-                QuantLinear.new(
-                    bits,
-                    groupsize,
-                    tmp.in_features,
-                    tmp.out_features,
-                    tmp.bias is not None,
-                ),
-            )
-    for name1, child in module.named_children():
-        make_quant_linear(
-            child, names, bits, groupsize, name + "." + name1 if name != "" else name1
-        )
-
-
-# TODO: perform packing on GPU
-def pack(model, quantizers, bits, groupsize):
-    layers = find_layers(model)
-    layers = {n: layers[n] for n in quantizers}
-    make_quant_linear(model, quantizers, bits, groupsize)
-    qlayers = find_layers(model, (QuantLinear,))
-    print("Packing ...")
-    for name in qlayers:
-        print(name)
-        quantizers[name], scale, zero, g_idx, _, _ = quantizers[name]
-        qlayers[name].pack(layers[name], scale, zero, g_idx)
-    print("Done.")
-    return model
-
-
-def setdeepattr(module, full_name, tensor):
-    current = module
-    tokens = full_name.split(".")
-    for token in tokens[:-1]:
-        current = getattr(current, token)
-    setattr(current, tokens[-1], tensor)
-
-
-def getdeepattr(module, full_name):
-    current = module
-    tokens = full_name.split(".")
-    for token in tokens:
-        current = getattr(current, token)
-    return current
-
-
-def load_weights_pre_hook(module_name, weights, recursive=False):
-    """Build a hook that materialises a module's tensors before it runs.
-
-    Args:
-        module_name: dotted name of the module, used as a prefix when asking
-            ``weights`` for a tensor; may be empty.
-        weights: object exposing ``get_tensor(name)``.
-        recursive: when False, only parameter/buffer names containing
-            exactly one ``"."`` are handled.
-
-    Returns:
-        ``inner(module, args)``, the hook callable.
-    """
-    def inner(module, args):
-        print(f"Pre hook {module_name}")
-        # Gather parameters and buffers under their names; buffers overwrite
-        # parameters with the same key.
-        local_params = {}
-        for k, v in module.named_parameters():
-            if not recursive and k.count(".") != 1:
-                continue
-            local_params[k] = v
-        for k, v in module.named_buffers():
-            if not recursive and k.count(".") != 1:
-                continue
-            local_params[k] = v
-
-        for local_param in local_params:
-            current_tensor = getdeepattr(module, local_param)
-            if current_tensor.device == torch.device("meta"):
-                # Tensor is still on the meta device: fetch it from `weights`.
-                # print(f"Loading {local_param}")
-                if module_name:
-                    tensor_name = f"{module_name}.{local_param}"
-                else:
-                    tensor_name = local_param
-                tensor = weights.get_tensor(tensor_name)
-                setdeepattr(module, local_param, nn.Parameter(tensor))
-            else:
-                # Tensor already exists: move it to cuda:0, re-wrapping it as
-                # a Parameter only when it required gradients.
-                tensor = current_tensor.to(device=torch.device("cuda:0"))
-                if current_tensor.requires_grad:
-                    tensor = nn.Parameter(tensor)
-                setdeepattr(module, local_param, tensor)
-
-    return inner
-
-
-def load_weights_post_hook(module_name, weights, recursive=False):
-    """Build a hook that moves a module's tensors to the CPU after it runs.
-
-    Args:
-        module_name: name of the module; only used in the log line.
-        weights: unused by the hook body.
-        recursive: when False, only parameter/buffer names containing
-            exactly one ``"."`` are handled.
-
-    Returns:
-        ``inner(module, args, output)``, which returns ``output`` unchanged.
-    """
-    def inner(module, args, output):
-        print(f"Post hook {module_name}")
-        # Gather parameters and buffers under their names; buffers overwrite
-        # parameters with the same key.
-        local_params = {}
-        for k, v in module.named_parameters():
-            if not recursive and k.count(".") != 1:
-                continue
-            local_params[k] = v
-        for k, v in module.named_buffers():
-            if not recursive and k.count(".") != 1:
-                continue
-            local_params[k] = v
-        for local_param in local_params:
-            # print(f"Unloading {local_param}")
-            current_tensor = getdeepattr(module, local_param)
-            # Every entry, buffers included, is re-wrapped as a Parameter
-            # holding the CPU copy.
-            setdeepattr(
-                module,
-                local_param,
-                nn.Parameter(current_tensor.to(device=torch.device("cpu"))),
-            )
-        return output
-
-    return inner
-
-
-def quantize(
-    model_id: str,
-    bits: int,
-    groupsize: int,
-    output_dir: str,
-    revision: str,
-    trust_remote_code: bool,
-    upload_to_model_id: Optional[str],
-    percdamp: float,
-    act_order: bool,
-):
-    """Quantize a model with GPTQ and write the result to ``output_dir``.
-
-    Steps: build the model with empty weights, attach lazy load/unload
-    hooks, run ``sequential`` on wikitext2 calibration data, pack the
-    quantized layers, then save sharded safetensors, the config and the
-    tokenizer. The folder is optionally uploaded afterwards.
-
-    Args:
-        model_id: model identifier used for config, weights and tokenizer.
-        bits: quantization bit width.
-        groupsize: GPTQ group size.
-        output_dir: directory receiving all output files.
-        revision: revision passed to ``weight_files``.
-        trust_remote_code: forwarded to the ``from_pretrained`` /
-            ``from_config`` calls.
-        upload_to_model_id: when truthy, repo id to upload ``output_dir`` to.
-        percdamp: forwarded to ``sequential``.
-        act_order: forwarded to ``sequential``.
-    """
-    print("loading model")
-    config = AutoConfig.from_pretrained(
-        model_id,
-        trust_remote_code=trust_remote_code,
-    )
-
-    # Instantiate the architecture without allocating real weights.
-    with init_empty_weights():
-        model = AutoModelForCausalLM.from_config(
-            config, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
-        )
-    model = model.eval()
-
-    print("LOADED model")
-    files = weight_files(model_id, revision, extension=".safetensors")
-    process_group, _, _ = initialize_torch_distributed()
-    weights = Weights(
-        files,
-        device=torch.device("cuda:0"),
-        dtype=torch.float16,
-        process_group=process_group,
-        aliases={"embed_tokens.weight": ["lm_head.weight"]},
-    )
-    hooks = []
-    for name, module in model.named_modules():
-
-        # The factories bind `module` and `name` per iteration so each
-        # closure refers to its own module.
-        def load(module, name):
-            def _load():
-                load_weights_pre_hook(name, weights, recursive=True)(module, None)
-
-            return _load
-
-        def unload(module, name):
-            def _unload():
-                load_weights_post_hook(name, weights, recursive=True)(
-                    module, None, None
-                )
-
-            return _unload
-
-        # Explicit recursive load/unload entry points, plus non-recursive
-        # hooks that fire around every forward call.
-        module.load = load(module, name)
-        module.unload = unload(module, name)
-        hooks.append(
-            module.register_forward_pre_hook(load_weights_pre_hook(name, weights))
-        )
-        hooks.append(
-            module.register_forward_hook(load_weights_post_hook(name, weights))
-        )
-    model.seqlen = 2048
-
-    # Calibration settings are hard-coded.
-    dataset = "wikitext2"
-    nsamples = 128
-    seed = None
-
-    dataloader, testloader = get_loaders(
-        dataset,
-        nsamples=nsamples,
-        seed=seed,
-        model_id=model_id,
-        seqlen=model.seqlen,
-        trust_remote_code=trust_remote_code,
-    )
-
-    tick = time.time()
-    quantizers = sequential(
-        model,
-        dataloader,
-        DEV,
-        nsamples,
-        bits,
-        groupsize,
-        percdamp=percdamp,
-        act_order=act_order,
-        hooks=hooks,
-    )
-    # Elapsed quantization time in seconds.
-    print(time.time() - tick)
-
-    pack(model, quantizers, bits, groupsize)
-    from safetensors.torch import save_file
-    from transformers.modeling_utils import shard_checkpoint
-
-    state_dict = model.state_dict()
-    state_dict = {k: v.cpu().contiguous() for k, v in state_dict.items()}
-    # Store the quantization parameters alongside the weights.
-    state_dict["gptq_bits"] = torch.LongTensor([bits])
-    state_dict["gptq_groupsize"] = torch.LongTensor([groupsize])
-
-    max_shard_size = "10GB"
-    shards, index = shard_checkpoint(
-        state_dict, max_shard_size=max_shard_size, weights_name="model.safetensors"
-    )
-    os.makedirs(output_dir, exist_ok=True)
-    for shard_file, shard in shards.items():
-        save_file(
-            shard,
-            os.path.join(output_dir, shard_file),
-            metadata={
-                "format": "pt",
-                "quantized": "gptq",
-                "origin": "text-generation-inference",
-            },
-        )
-    if index is None:
-        path_to_weights = os.path.join(output_dir, "model.safetensors")
-        logger.info(f"Model weights saved in {path_to_weights}")
-    else:
-        # An index was produced: write it next to the shards.
-        save_index_file = "model.safetensors.index.json"
-        save_index_file = os.path.join(output_dir, save_index_file)
-        with open(save_index_file, "w", encoding="utf-8") as f:
-            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
-            f.write(content)
-        logger.info(
-            f"The model is bigger than the maximum size per checkpoint ({max_shard_size}) and is going to be "
-            f"split in {len(shards)} checkpoint shards. You can find where each parameters has been saved in the "
-            f"index located at {save_index_file}."
-        )
-    config = AutoConfig.from_pretrained(model_id, trust_remote_code=trust_remote_code)
-    config.save_pretrained(output_dir)
-    logger.info("Saved config")
-    logger.info("Saving tokenizer")
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_id, trust_remote_code=trust_remote_code
-    )
-    tokenizer.save_pretrained(output_dir)
-    logger.info("Saved tokenizer")
-
-    if upload_to_model_id:
-        api = HfApi()
-
-        api.upload_folder(
-            folder_path=output_dir, repo_id=upload_to_model_id, repo_type="model"
-        )
--- a/server/text_generation_server/utils/layers.py
+++ b/server/text_generation_server/utils/layers.py
-import os
-import torch
-import torch.distributed
-
-from torch import nn
-from torch.nn import functional as F
-from typing import List, Tuple, Optional
-from loguru import logger
-from functools import lru_cache
-
-from text_generation_server.utils.speculate import get_speculate
-
-HAS_BITS_AND_BYTES = True
-try:
-    import bitsandbytes as bnb
-    from bitsandbytes.nn import Int8Params, Params4bit
-except ImportError:
-    HAS_BITS_AND_BYTES = False
-
-from accelerate import init_empty_weights
-
-from text_generation_server.utils.gptq.quant_linear import QuantLinear
-from text_generation_server.utils.import_utils import (
-    IS_CUDA_SYSTEM,
-    IS_ROCM_SYSTEM,
-    IS_XPU_SYSTEM,
-)
-
-if IS_XPU_SYSTEM:
-    import intel_extension_for_pytorch as ipex
-
-HAS_AWQ = True
-try:
-    from text_generation_server.utils.awq.quantize.qmodule import WQLinear
-except ImportError:
-    HAS_AWQ = False
-
-# Query the CUDA compute capability; if the query fails for any reason,
-# fall back to a major version of 1.
-try:
-    major, _minor = torch.cuda.get_device_capability()
-except Exception:
-    major = 1
-
-# HAS_EXLLAMA stays False until one of the exllama imports below succeeds,
-# after which it holds the kernel version as a string ("1" or "2").
-HAS_EXLLAMA = False
-# Exllama is attempted on compute capability >= 8 or on ROCm systems.
-CAN_EXLLAMA = major >= 8 or IS_ROCM_SYSTEM
-# EXLLAMA_VERSION defaults to "2"; any other value selects the v1 kernels.
-V2 = os.getenv("EXLLAMA_VERSION", "2") == "2"
-
-# Only the exact string "True" in DISABLE_EXLLAMA disables the kernels.
-if os.getenv("DISABLE_EXLLAMA") == "True":
-    HAS_EXLLAMA = False
-elif CAN_EXLLAMA:
-    try:
-        if V2:
-            from text_generation_server.utils.gptq.exllamav2 import (
-                QuantLinear as ExllamaQuantLinear,
-                create_exllama_buffers,
-                set_device,
-            )
-
-            HAS_EXLLAMA = "2"
-        else:
-            from text_generation_server.utils.gptq.exllama import (
-                Ex4bitLinear as ExllamaQuantLinear,
-                create_exllama_buffers,
-                set_device,
-            )
-
-            HAS_EXLLAMA = "1"
-
-    except ImportError:
-        # Kernels are not installed: leave HAS_EXLLAMA as False.
-        pass
-
-# HAS_EETQ becomes True only when the EETQ package can be imported.
-HAS_EETQ = False
-try:
-    from EETQ import quant_weights, w8_a16_gemm
-
-    HAS_EETQ = True
-except ImportError:
-    pass
-
-
-# Monkey patching
-@classmethod
-def load_layer_norm(cls, prefix, weights, eps):
-    weight = weights.get_tensor(f"{prefix}.weight")
-    bias = weights.get_tensor(f"{prefix}.bias")
-    with init_empty_weights():
-        ln = cls(weight.shape, eps=eps)
-
-    ln.weight = nn.Parameter(weight)
-    ln.bias = nn.Parameter(bias)
-    return ln
-
-
-@classmethod
-def load_layer_norm_no_bias(cls, prefix, weights, eps):
-    weight = weights.get_tensor(f"{prefix}.weight")
-    with init_empty_weights():
-        ln = cls(weight.shape, eps=eps)
-
-    ln.weight = nn.Parameter(weight)
-    ln.bias = None
-    return ln
-
-
-@classmethod
-def load_conv2d(cls, prefix, weights, in_channels, out_channels, kernel_size, stride):
-    weight = weights.get_tensor(f"{prefix}.weight")
-    bias = weights.get_tensor(f"{prefix}.bias")
-    with init_empty_weights():
-        conv2d = cls(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=kernel_size,
-            stride=stride,
-        )
-
-    conv2d.weight = nn.Parameter(weight)
-    conv2d.bias = nn.Parameter(bias)
-    return conv2d
-
-
-@classmethod
-def load_conv2d_no_bias(
-    cls, prefix, weights, in_channels, out_channels, kernel_size, stride
-):
-    weight = weights.get_tensor(f"{prefix}.weight")
-    with init_empty_weights():
-        conv2d = cls(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=kernel_size,
-            stride=stride,
-        )
-
-    conv2d.weight = nn.Parameter(weight)
-    conv2d.bias = None
-    return conv2d
-
-
-# Attach the loader classmethods defined above to the stock torch layers so
-# they can be called as `torch.nn.Conv2d.load(...)` and
-# `torch.nn.LayerNorm.load(...)`, plus the `load_no_bias` variants.
-torch.nn.Conv2d.load = load_conv2d
-torch.nn.Conv2d.load_no_bias = load_conv2d_no_bias
-torch.nn.LayerNorm.load = load_layer_norm
-torch.nn.LayerNorm.load_no_bias = load_layer_norm_no_bias
-
-
-class FastLinear(nn.Module):
-    def __init__(
-        self,
-        weight,
-        bias,
-    ) -> None:
-        super().__init__()
-        self.weight = nn.Parameter(weight)
-        if bias is not None:
-            self.bias = nn.Parameter(bias)
-        else:
-            self.bias = None
-
-    @classmethod
-    def load(cls, config, prefix: str, weights, bias: bool):
-        weight = weights.get_tensor(f"{prefix}.weight")
-        if bias:
-            bias = weights.get_tensor(f"{prefix}.bias")
-        else:
-            bias = None
-        return cls(weight, bias)
-
-    def forward(self, input: torch.Tensor) -> torch.Tensor:
-        return F.linear(input, self.weight, self.bias)
-
-
-class EETQLinear(nn.Module):
-    def __init__(
-        self,
-        weight,
-        bias,
-    ) -> None:
-        super().__init__()
-        device = weight.device
-        if weight.dtype != torch.float16:
-            weight = weight.to(dtype=torch.float16)
-        weight = torch.t(weight).contiguous().cpu()
-        weight, scale = quant_weights(weight, torch.int8, False)
-
-        self.weight = weight.cuda(device)
-        self.scale = scale.cuda(device)
-        self.bias = bias.cuda(device) if bias is not None else None
-
-    def forward(self, input: torch.Tensor) -> torch.Tensor:
-        output = w8_a16_gemm(input, self.weight, self.scale)
-        output = output + self.bias if self.bias is not None else output
-        return output
-
-
-def fp8_quantize(weight, qdtype=torch.float8_e4m3fn):
-    device = weight.device
-    # weight, scale = quant_weights(weight, torch.int8, False)
-    finfo = torch.finfo(qdtype)
-    # Calculate the scale as dtype max divided by absmax
-    scale = finfo.max / weight.abs().max().clamp(min=1e-12)
-    # scale and clamp the tensor to bring it to
-    # the representative range of float8 data type
-    # (as default cast is unsaturated)
-    qweight = (weight * scale).clamp(min=finfo.min, max=finfo.max)
-    # Return both float8 data and the inverse scale (as float),
-    # as both required as inputs to torch._scaled_mm
-    qweight = qweight.to(qdtype)
-    scale = scale.float().reciprocal()
-    return qweight, scale
-
-
-class Fp8Linear(nn.Module):
-    def __init__(
-        self,
-        weight,
-        bias,
-    ) -> None:
-        super().__init__()
-        self.dtype = weight.dtype
-        self.qweight, self.scale = fp8_quantize(weight)
-
-        self.bias = bias if bias is not None else None
-
-    def forward(self, input: torch.Tensor) -> torch.Tensor:
-        qinput, scale = fp8_quantize(input)
-        output, _ = torch._scaled_mm(
-            qinput,
-            self.qweight.t(),
-            out_dtype=self.dtype,
-            scale_a=scale,
-            scale_b=self.scale,
-            bias=self.bias,
-        )
-        return output
-
-
-class Linear8bitLt(nn.Module):
-    """Linear layer backed by bitsandbytes 8-bit matmul (``bnb.matmul``)."""
-
-    def __init__(
-        self,
-        weight,
-        bias,
-        has_fp16_weights=True,
-        memory_efficient_backward=False,
-        threshold=0.0,
-        index=None,
-    ):
-        """Wrap ``weight`` in ``Int8Params`` and set up the matmul state.
-
-        ``memory_efficient_backward`` must be falsy; the assertion below
-        rejects it.
-        """
-        super().__init__()
-        assert (
-            not memory_efficient_backward
-        ), "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0"
-        self.state = bnb.MatmulLtState()
-        self.index = index
-
-        # Necessary for stacked layers
-        self.state.threshold = threshold
-        self.state.has_fp16_weights = has_fp16_weights
-        self.state.memory_efficient_backward = memory_efficient_backward
-        if threshold > 0.0 and not has_fp16_weights:
-            self.state.use_pool = True
-
-        self.weight = Int8Params(
-            weight.data,
-            has_fp16_weights=has_fp16_weights,
-            requires_grad=has_fp16_weights,
-        )
-        self.weight.cuda(weight.device)
-        self.bias = bias
-
-    def init_8bit_state(self):
-        """Move ``CB``/``SCB`` from the weight onto the matmul state."""
-        self.state.CB = self.weight.CB
-        self.state.SCB = self.weight.SCB
-        self.weight.CB = None
-        self.weight.SCB = None
-
-    def forward(self, x: torch.Tensor):
-        """Run the 8-bit matmul, initialising the state on first use."""
-        self.state.is_training = self.training
-        # `CB` is still on the weight only until init_8bit_state has run.
-        if self.weight.CB is not None:
-            self.init_8bit_state()
-
-        # weights are cast automatically as Int8Params, but the bias has to be cast manually
-        if self.bias is not None and self.bias.dtype != x.dtype:
-            self.bias.data = self.bias.data.to(x.dtype)
-
-        out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state)
-
-        if not self.state.has_fp16_weights:
-            if self.state.CB is not None and self.state.CxB is not None:
-                # we converted 8-bit row major to turing/ampere format in the first inference pass
-                # we no longer need the row-major weight
-                del self.state.CB
-                self.weight.data = self.state.CxB
-        return out
-
-
-class Linear4bit(nn.Module):
-    """Linear layer backed by bitsandbytes 4-bit matmul (``bnb.matmul_4bit``)."""
-
-    def __init__(self, weight, bias, quant_type):
-        """Wrap ``weight`` in ``Params4bit`` using the given ``quant_type``."""
-        super().__init__()
-        self.weight = Params4bit(
-            weight.data,
-            requires_grad=False,
-            compress_statistics=True,
-            quant_type=quant_type,
-        )
-        # Never set to anything else in this class, so the casts to
-        # `compute_dtype` in forward are skipped or receive None.
-        self.compute_dtype = None
-        self.weight.cuda(weight.device)
-        self.bias = bias
-
-    def forward(self, x: torch.Tensor):
-        """Run the 4-bit matmul and return the result in the input's dtype."""
-        # weights are cast automatically as Int8Params, but the bias has to be cast manually
-        if self.bias is not None and self.bias.dtype != x.dtype:
-            self.bias.data = self.bias.data.to(x.dtype)
-
-        # Only a message is printed when the quant state is missing;
-        # execution continues.
-        if getattr(self.weight, "quant_state", None) is None:
-            print(
-                "FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first."
-            )
-        inp_dtype = x.dtype
-        if self.compute_dtype is not None:
-            x = x.to(self.compute_dtype)
-
-        bias = None if self.bias is None else self.bias.to(self.compute_dtype)
-        out = bnb.matmul_4bit(
-            x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state
-        )
-
-        out = out.to(inp_dtype)
-
-        return out
-
-
-@lru_cache(1)
-def warn_deprecate_bnb():
-    logger.warning(
-        "Bitsandbytes 8bit is deprecated, using `eetq` is a drop-in replacement, and has much better performnce"
-    )
-
-
-def get_linear(weight, bias, quantize):
-    if quantize is None:
-        linear = FastLinear(weight, bias)
-    elif quantize == "eetq":
-        if HAS_EETQ:
-            linear = EETQLinear(weight, bias)
-        else:
-            raise ImportError(
-                "Please install EETQ from https://github.com/NetEase-FuXi/EETQ"
-            )
-    elif quantize == "fp8":
-        linear = Fp8Linear(weight, bias)
-    elif quantize == "bitsandbytes":
-        warn_deprecate_bnb()
-        linear = Linear8bitLt(
-            weight,
-            bias,
-            has_fp16_weights=False,
-            threshold=6.0,
-        )
-        if bias is not None:
-            linear.bias = nn.Parameter(bias)
-    elif quantize == "bitsandbytes-fp4":
-        linear = Linear4bit(
-            weight,
-            bias,
-            quant_type="fp4",
-        )
-    elif quantize == "bitsandbytes-nf4":
-        linear = Linear4bit(
-            weight,
-            bias,
-            quant_type="nf4",
-        )
-    elif quantize == "gptq":
-        try:
-            qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama = weight
-        except Exception:
-            raise NotImplementedError(
-                f"The passed weight is not `gptq` compatible, loader needs to be updated."
-            )
-
-        if use_exllama:
-            linear = ExllamaQuantLinear(
-                qweight, qzeros, scales, g_idx, bias, bits, groupsize
-            )
-        else:
-            linear = QuantLinear(
-                qweight,
-                qzeros,
-                scales,
-                g_idx,
-                bias,
-                bits,
-                groupsize,
-            )
-    elif quantize == "awq":
-        try:
-            qweight, qzeros, scales, _, bits, groupsize, _ = weight
-        except Exception:
-            raise NotImplementedError(
-                f"The passed weight is not `awq` compatible, loader needs to be updated."
-            )
-        if IS_ROCM_SYSTEM:
-            raise NotImplementedError(
-                "AWQ GEMM kernel can't be used on ROCm systems, please use `--quantize gptq` instead "
-                "to use Exllama/GPTQ kernels for AWQ inference."
-            )
-        if not HAS_AWQ:
-            raise NotImplementedError(
-                "You do not seem to have awq installed, either install it (cd server &&  make install-awq), or try using GPTQ `---quantize gptq` a conversion AWQ->GPTQ will happen on the fly"
-            )
-        linear = WQLinear(
-            w_bit=bits,
-            group_size=groupsize,
-            qweight=qweight,
-            qzeros=qzeros,
-            scales=scales,
-            bias=bias is not None,
-        )
-    else:
-        raise NotImplementedError(f"Quantization `{quantize}` is not implemented yet.")
-    return linear
-
-
-class SuperLayer(nn.Module):
-    def __init__(self, linear):
-        super().__init__()
-        self.linear = linear
-
-    def forward(self, x):
-        return self.linear.forward(x)
-
-
-class ResBlock(torch.nn.Module):
-    def __init__(self, config, prefix, weights):
-        super().__init__()
-        self.linear = FastLinear.load(
-            config, prefix=f"{prefix}.linear", weights=weights, bias=True
-        )
-        self.act = torch.nn.SiLU()
-
-    def forward(self, x):
-        return x + self.act(self.linear(x))
-
-
-class MedusaModel(torch.nn.Module):
-    def __init__(self, config, medusa_config, weights):
-        super().__init__()
-        self.heads = torch.nn.ModuleList(
-            [
-                MedusaHead(config, medusa_config, prefix=f"{i}", weights=weights)
-                for i in range(get_speculate())
-            ]
-        )
-
-    def forward(self, x):
-        speculative_logits = torch.stack([head(x) for head in self.heads], dim=1)
-        return speculative_logits
-
-
-class MedusaHead(torch.nn.Module):
-    def __init__(self, config, medusa_config, prefix, weights):
-        super().__init__()
-        self.blocks = torch.nn.ModuleList(
-            [
-                ResBlock(config, prefix=f"{prefix}.{i}", weights=weights)
-                for i in range(medusa_config["medusa_num_layers"])
-            ]
-        )
-        n = len(self.blocks)
-        self.out = FastLinear.load(
-            config, prefix=f"{prefix}.{n}", weights=weights, bias=False
-        )
-
-    def forward(self, x):
-        for block in self.blocks:
-            x = block(x)
-        x = self.out(x)
-        return x
-
-
-class MedusaHeadV1(nn.Module):
-    def __init__(self, lm_head, medusa):
-        super().__init__()
-        self.lm_head = lm_head
-        self.medusa = medusa
-
-    @staticmethod
-    def load(config, prefix: str, weights):
-        from pathlib import Path
-        from safetensors import safe_open
-        import json
-
-        use_medusa = config.use_medusa
-
-        medusa_config = str(Path(use_medusa) / "config.json")
-        filename = str(Path(use_medusa) / "medusa_lm_head.safetensors")
-
-        with open(medusa_config, "r") as f:
-            medusa_config = json.load(f)
-        routing = weights.routing
-        with safe_open(filename, framework="pytorch") as f:
-            for k in f.keys():
-                if k in routing and routing[k] != filename:
-                    raise RuntimeError(
-                        f"Key {k} was found in multiple files: {filename} and {routing[k]}"
-                    )
-                routing[k] = filename
-
-        medusa = MedusaModel(config, medusa_config, weights)
-        lm_head = TensorParallelHead.load(config, prefix, weights)
-        return MedusaHeadV1(lm_head, medusa)
-
-    def forward(
-        self, input: torch.Tensor
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
-        logits = self.lm_head(input)
-        # If we have too many tokens, we skip speculative logits
-        if input.shape[0] > 128:
-            return logits, None
-
-        speculative_logits = self.medusa(input)
-        return logits, speculative_logits
-
-
-class MedusaHeadV2(nn.Module):
-    def __init__(self, config, prefix, weights):
-        super().__init__()
-        from pathlib import Path
-        from safetensors import safe_open
-        import json
-
-        use_medusa = config.use_medusa
-
-        medusa_config = str(Path(use_medusa) / "config.json")
-        filename = str(Path(use_medusa) / "medusa_lm_head.safetensors")
-
-        with open(medusa_config, "r") as f:
-            medusa_config = json.load(f)
-        routing = weights.routing
-        with safe_open(filename, framework="pytorch") as f:
-            for k in f.keys():
-                if k in routing and routing[k] != filename:
-                    raise RuntimeError(
-                        f"Key {k} was found in multiple files: {filename} and {routing[k]}"
-                    )
-                routing[k] = filename
-
-        self.n_medusa_heads = get_speculate()
-
-        assert medusa_config["medusa_num_layers"] == 1
-        self.linear = TensorParallelColumnLinear.load_multi(
-            config,
-            prefixes=[f"{i}.0.linear" for i in range(self.n_medusa_heads)],
-            dim=0,
-            weights=weights,
-            bias=True,
-        )
-        self.process_group = weights.process_group
-        self.world_size = self.process_group.size()
-        self.rank = self.process_group.rank()
-
-        self.act = torch.nn.SiLU()
-
-        self.lm_head = TensorParallelHead.load(config, prefix, weights)
-
-    def forward(self, x):
-        # If we have too many tokens, we skip speculative logits
-        if x.shape[0] > 128:
-            logits = self.lm_head(x)
-            return logits, None
-
-        size = x.shape[-1]
-        block_size = (size + self.world_size - 1) // self.world_size
-        start = self.rank * block_size
-        stop = (self.rank + 1) * block_size
-
-        x_block = x[:, start:stop]
-
-        # Compute all medusa heads at the same time, then reshape and move the n_medusa_heads dim to dim 1
-        medusa_res = self.act(self.linear(x)).reshape(
-            *x_block.shape[:-1], self.n_medusa_heads, x_block.shape[-1]
-        )
-
-        # Apply all residual medusa heads
-        output = x[:, start:stop].unsqueeze(-2) + medusa_res
-
-        # Gather medusa heads
-        world_output = [
-            torch.empty_like(output) for _ in range(self.process_group.size())
-        ]
-        torch.distributed.all_gather(world_output, output, group=self.process_group)
-        world_output = torch.cat(world_output, dim=-1)
-
-        # Stack x and medusa residual x
-        stacked_x = torch.cat([x.unsqueeze(-2), world_output], dim=-2)
-
-        # Compute lm head on x + medusa residual x
-        logits = self.lm_head(stacked_x)
-
-        # Finally, split logits from speculative logits
-        logits, speculative_logits = torch.split(
-            logits, [1, self.n_medusa_heads], dim=-2
-        )
-        # Squeeze added dimension
-        logits = logits.squeeze(-2)
-
-        return logits, speculative_logits
-
-
-class SpeculativeHead(nn.Module):
-    def __init__(self, lm_head, medusa):
-        super().__init__()
-        self.head = lm_head
-        self.medusa = medusa
-
-    @staticmethod
-    def load(config, prefix: str, weights):
-        use_medusa = config.use_medusa
-        if use_medusa:
-            lm_head = None
-            try:
-                medusa = MedusaHeadV1.load(config, prefix, weights)
-            except:
-                medusa = MedusaHeadV2(config, prefix, weights)
-        else:
-            lm_head = TensorParallelHead.load(config, prefix, weights)
-            medusa = None
-        return SpeculativeHead(lm_head, medusa)
-
-    def forward(
-        self, input: torch.Tensor
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
-        if self.medusa is not None:
-            return self.medusa(input)
-
-        assert self.head is not None
-        logits = self.head(input)
-        return logits, None
-
-
-class TensorParallelHead(SuperLayer):
-    def __init__(self, linear, process_group, should_gather: bool):
-        super().__init__(linear)
-        self.process_group = process_group
-        self.should_gather = should_gather
-
-    @staticmethod
-    def load(config, prefix: str, weights):
-        if weights.process_group.size() > 1:
-            try:
-                weight = weights.get_sharded(f"{prefix}.weight", dim=0)
-                should_gather = True
-            except AssertionError:
-                # If the vocab size is not divisible by number of shards
-                # just load the entire thing.
-                weight = weights.get_tensor(f"{prefix}.weight")
-                should_gather = False
-        else:
-            weight = weights.get_tensor(f"{prefix}.weight")
-            should_gather = False
-
-        # GPTQ,AWQ,EETQ don't quantize heads (nor embeddings)
-        if config.quantize in ["gptq", "awq", "eetq"]:
-            quantize = None
-        else:
-            quantize = config.quantize
-        return TensorParallelHead(
-            get_linear(weight, bias=None, quantize=quantize),
-            process_group=weights.process_group,
-            should_gather=should_gather,
-        )
-
-    def forward(self, input: torch.Tensor) -> torch.Tensor:
-        if not self.should_gather:
-            return super().forward(input)
-
-        world_size = self.process_group.size()
-        if len(input.shape) == 2 and isinstance(self.linear, FastLinear):
-            out_dim = self.linear.weight.shape[0]
-
-            if input.shape[0] == 1:
-                world_out = input.new_empty(1, out_dim * world_size)
-                local_out = input.new_empty(1, out_dim)
-                gather_input = local_out
-            else:
-                world_out = input.new_empty(out_dim * world_size, input.shape[0])
-                gather_input = input.new_empty(out_dim, input.shape[0])
-                local_out = gather_input.T
-
-            torch.mm(input, self.linear.weight.T, out=local_out)
-
-            torch.distributed.all_gather_into_tensor(
-                world_out, gather_input, group=self.process_group
-            )
-
-            if input.shape[0] == 1:
-                return world_out
-            return world_out.T
-
-        output = super().forward(input)
-        world_output = [
-            torch.empty_like(output) for _ in range(self.process_group.size())
-        ]
-        torch.distributed.all_gather(world_output, output, group=self.process_group)
-        world_output = torch.cat(world_output, dim=-1)
-        return world_output
-
-
-class TensorParallelColumnLinear(SuperLayer):
-    @classmethod
-    def load_gate_up(cls, config, prefix: str, weights, bias: bool):
-        """Specific method when the QKV was joined after the fact"""
-        weight = weights.get_weights_col_packed_gate_up(
-            prefix, quantize=config.quantize
-        )
-        if bias:
-            raise NotImplementedError("packed_gate_up only implemented without bias")
-        else:
-            bias = None
-        linear = get_linear(weight, bias, config.quantize)
-        return cls(linear)
-
-    @classmethod
-    def load_qkv(cls, config, prefix: str, weights, bias: bool):
-        """Specific method when the QKV was joined after the fact"""
-        weight = weights.get_weights_col_packed_qkv(prefix, quantize=config.quantize)
-        if bias:
-            raise NotImplementedError("packed_qkv only implemented for baichuan")
-        else:
-            bias = None
-        linear = get_linear(weight, bias, config.quantize)
-        return cls(linear)
-
-    @classmethod
-    def load(cls, config, prefix: str, weights, bias: bool):
-        return cls.load_multi(config, [prefix], weights, bias, dim=0)
-
-    @classmethod
-    def load_multi(cls, config, prefixes: List[str], weights, bias: bool, dim: int):
-        weight = weights.get_multi_weights_col(
-            prefixes, quantize=config.quantize, dim=dim
-        )
-
-        if bias:
-            b = [weights.get_sharded(f"{p}.bias", dim=0) for p in prefixes]
-            bias = torch.cat(b, dim=dim)
-        else:
-            bias = None
-        linear = get_linear(weight, bias, config.quantize)
-        return cls(linear)
-
-
-class TensorParallelRowLinear(SuperLayer):
-    def __init__(self, linear, process_group):
-        super().__init__(linear)
-        self.process_group = process_group
-
-    @classmethod
-    def load(cls, config, prefix: str, weights, bias: bool):
-        weight = weights.get_multi_weights_row(prefix, quantize=config.quantize)
-
-        if bias and weights.process_group.rank() == 0:
-            # Rank is only on the first rank process
-            bias = weights.get_tensor(f"{prefix}.bias")
-        else:
-            bias = None
-        return cls(
-            get_linear(weight, bias, config.quantize),
-            process_group=weights.process_group,
-        )
-
-    def forward(self, input: torch.Tensor, reduce: bool = True) -> torch.Tensor:
-        out = super().forward(input)
-        if self.process_group.size() > 1 and reduce:
-            torch.distributed.all_reduce(out, group=self.process_group)
-        return out
-
-
-class TensorParallelEmbedding(nn.Module):
-    def __init__(self, prefix: str, weights, reduce=True):
-        super().__init__()
-        weight = weights.get_partial_sharded(f"{prefix}.weight", dim=0)
-        num_embeddings = weights.get_shape(f"{prefix}.weight")[0]
-
-        process_group = weights.process_group
-
-        world_size = process_group.size()
-        rank = process_group.rank()
-
-        block_size = (num_embeddings + world_size - 1) // world_size
-        self.min_id = rank * block_size
-        self.max_id = min(num_embeddings, (rank + 1) * block_size)
-        self.null_idx = weight.shape[
-            0
-        ]  # Usually block_size, might be less in non even vocab_size.
-        self.process_group = weights.process_group
-        self.reduce = reduce
-
-        """Additional 0 entry used for masking"""
-        self.weight = nn.Parameter(F.pad(weight, (0, 0, 0, 1)))
-
-    def forward(self, input: torch.Tensor) -> torch.Tensor:
-        # default all out of bounds values to `self.null_idx` that will then be mapped to 0
-        # translate for [0, self.max_id - self.min_id[
-        input = torch.where(
-            (self.min_id > input) | (input >= self.max_id),
-            self.null_idx,
-            input - self.min_id,
-        )
-        out = torch.nn.functional.embedding(input, self.weight)
-        if self.reduce and self.process_group.size() > 1:
-            torch.distributed.all_reduce(out, group=self.process_group)
-        return out
-
-
-try:
-    if IS_CUDA_SYSTEM:
-        import dropout_layer_norm
-    elif IS_ROCM_SYSTEM:
-        from vllm import layernorm_ops
-    else:
-        dropout_layer_norm = None
-
-    class FastLayerNorm(nn.LayerNorm):
-        def forward(self, hidden_states, residual=None):
-            if IS_XPU_SYSTEM:
-                res_out = hidden_states
-                out = ipex.llm.functional.add_layer_norm(
-                    residual, hidden_states, self.weight, self.bias, self.eps, True
-                )
-                if residual is not None:
-                    res_out = residual
-                return out, res_out
-            elif hidden_states.shape[-1] > 8192 or IS_ROCM_SYSTEM:
-                if residual is not None:
-                    hidden_states += residual
-                residual = hidden_states
-
-                return super(FastLayerNorm, self).forward(hidden_states), residual
-            else:
-                (
-                    normed_hidden_states,
-                    residual,
-                    *rest,
-                ) = dropout_layer_norm.dropout_add_ln_fwd(
-                    hidden_states,
-                    residual,
-                    self.weight,
-                    self.bias,
-                    None,
-                    None,
-                    None,
-                    None,
-                    0.0,
-                    self.eps,
-                    1.0,
-                    0,
-                    None,
-                    False,
-                    False,
-                )
-                if residual is None:
-                    residual = hidden_states
-
-                return normed_hidden_states, residual
-
-    class FastRMSNorm(nn.Module):
-        def __init__(self, weight: torch.Tensor, eps: float):
-            super().__init__()
-
-            self.weight = nn.Parameter(weight)
-            self.variance_epsilon = eps
-
-        @classmethod
-        def load(cls, prefix, weights, eps=1e-6):
-            weight = weights.get_tensor(f"{prefix}.weight")
-            return cls(weight, eps)
-
-        def forward(self, hidden_states, residual=None):
-            if IS_XPU_SYSTEM:
-                residual_out = hidden_states
-                out = ipex.llm.functional.add_rms_norm(
-                    residual,
-                    hidden_states,
-                    self.weight,
-                    None,
-                    self.variance_epsilon,
-                    True,
-                )
-                if residual is not None:
-                    residual_out = residual
-                return out, residual_out
-            elif hidden_states.shape[-1] > 8192:
-                if residual is not None:
-                    hidden_states += residual
-                residual = hidden_states
-
-                hidden_states = hidden_states.to(torch.float32)
-                variance = hidden_states.pow(2).mean(-1, keepdim=True)
-                hidden_states = hidden_states * torch.rsqrt(
-                    variance + self.variance_epsilon
-                )
-
-                # convert into half-precision if necessary
-                if self.weight.dtype in [torch.float16, torch.bfloat16]:
-                    hidden_states = hidden_states.to(self.weight.dtype)
-
-                return self.weight * hidden_states, residual
-            elif IS_CUDA_SYSTEM:
-                # faster post attention rms norm
-                (
-                    normed_hidden_states,
-                    res,
-                    *rest,
-                ) = dropout_layer_norm.dropout_add_ln_fwd(
-                    hidden_states,
-                    residual,
-                    self.weight,
-                    None,
-                    None,
-                    None,
-                    None,
-                    None,
-                    0.0,
-                    self.variance_epsilon,
-                    1.0,
-                    0,
-                    None,
-                    False,
-                    True,  # Activate RMSNorm
-                )
-                if res is None:
-                    res = hidden_states
-
-                return normed_hidden_states, res
-            elif IS_ROCM_SYSTEM:
-                # We use VLLM RMSNorm kernel that can be compiled for RoCm, instead of Flash Attention ones that can not.
-                if residual is not None:
-                    hidden_states += residual
-                residual = hidden_states
-
-                out = torch.empty_like(hidden_states)
-                layernorm_ops.rms_norm(
-                    out,
-                    hidden_states,
-                    self.weight.data,
-                    self.variance_epsilon,
-                )
-                return out, residual
-            else:
-                raise ValueError(
-                    "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
-                )
-
-except ImportError:
-    pass
-
-try:
-    if IS_CUDA_SYSTEM:
-        from flash_attn.layers.rotary import RotaryEmbedding
-        import rotary_emb
-    elif IS_ROCM_SYSTEM:
-        from vllm import pos_encoding_ops
-
-    def _create_inv_freq(dim, base, device):
-        inv_freq = 1.0 / (
-            base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim)
-        )
-        return inv_freq
-
-    def _get_rope_config(config):
-        if os.getenv("ROPE_SCALING", None) is not None:
-            rope_scaling = {
-                "type": os.environ["ROPE_SCALING"],
-                "factor": float(os.environ["ROPE_FACTOR"]),
-            }
-            return rope_scaling
-        return getattr(config, "rope_scaling", None)
-
-    class PositionRotaryEmbedding(nn.Module):
-        def __init__(self, inv_freq, scaling_factor):
-            super().__init__()
-            self.inv_freq = inv_freq
-            self._seq_len_cached = 0
-            self._cos_cached = None
-            self._sin_cached = None
-            self._cos_k_cached = None
-            self._sin_k_cached = None
-            self.scaling_factor = scaling_factor
-            self.dynamic_args = None
-
-        def forward(
-            self,
-            query: torch.Tensor,
-            key: torch.Tensor,
-            cos: torch.Tensor,
-            sin: torch.Tensor,
-        ):
-            # Such controlflows may add some overhead.
-            if IS_CUDA_SYSTEM:
-                rotary_dim = cos.shape[-1]
-                q1 = query[..., :rotary_dim]
-                q2 = query[..., rotary_dim : 2 * rotary_dim]
-
-                rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False)
-
-                k1 = key[..., :rotary_dim]
-                k2 = key[..., rotary_dim : 2 * rotary_dim]
-
-                rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False)
-            elif IS_ROCM_SYSTEM:
-                # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems.
-                # Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773
-
-                head_size = query.shape[-1]
-
-                # Inplace operation, updating query and key.
-                pos_encoding_ops.rotary_embedding(query, key, head_size, cos, sin, True)
-            elif IS_XPU_SYSTEM:
-                ipex.llm.functional.rotary_embedding(
-                    query, key, sin, cos, query.size(-1), True
-                )
-            else:
-                raise ValueError(
-                    "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
-                )
-
-        @classmethod
-        def static(cls, config, dim, base, device):
-            inv_freq = _create_inv_freq(dim, base, device)
-            scaling_factor = None
-            rope_scaling = _get_rope_config(config)
-            if rope_scaling is not None:
-                scaling_factor = rope_scaling["factor"]
-                if rope_scaling["type"] == "linear":
-                    pass
-                elif rope_scaling["type"] == "dynamic":
-                    return DynamicPositionRotaryEmbedding(
-                        dim=dim,
-                        max_position_embeddings=config.max_position_embeddings,
-                        base=base,
-                        device=inv_freq.device,
-                        scaling_factor=scaling_factor,
-                    )
-                elif rope_scaling["type"] == "yarn":
-                    return YarnPositionRotaryEmbedding(
-                        dim=2 * inv_freq.shape[0],
-                        max_position_embeddings=rope_scaling[
-                            "original_max_position_embeddings"
-                        ],
-                        base=10000.0,
-                        device=inv_freq.device,
-                        scaling_factor=scaling_factor,
-                        extrapolation_factor=1,
-                        attn_factor=1,
-                        beta_fast=32,
-                        beta_slow=1,
-                    )
-                else:
-                    raise NotImplementedError(
-                        f"rope scaling type {rope_scaling['type']} is not implemented or invalid"
-                    )
-            return cls(inv_freq, scaling_factor)
-
-        @classmethod
-        def load(cls, config, prefix, weights):
-            # XXX: Always load this in float32 !
-            dtype = weights.dtype
-            weights.dtype = torch.float32
-            inv_freq = weights.get_tensor(f"{prefix}.inv_freq")
-            weights.dtype = dtype
-
-            scaling_factor = None
-            rope_scaling = _get_rope_config(config)
-            if rope_scaling is not None:
-                scaling_factor = rope_scaling["factor"]
-                if rope_scaling["type"] == "linear":
-                    pass
-                elif rope_scaling["type"] == "dynamic":
-                    return DynamicPositionRotaryEmbedding(
-                        dim=2 * inv_freq.shape[0],
-                        max_position_embeddings=config.max_position_embeddings,
-                        base=10000.0,
-                        device=inv_freq.device,
-                        scaling_factor=scaling_factor,
-                    )
-                elif rope_scaling["type"] == "yarn":
-                    return YarnPositionRotaryEmbedding(
-                        dim=2 * inv_freq.shape[0],
-                        max_position_embeddings=rope_scaling[
-                            "original_max_position_embeddings"
-                        ],
-                        base=10000.0,
-                        device=inv_freq.device,
-                        scaling_factor=scaling_factor,
-                        extrapolation_factor=1,
-                        attn_factor=1,
-                        beta_fast=32,
-                        beta_slow=1,
-                    )
-                else:
-                    raise NotImplementedError(
-                        f"rope scaling type {rope_scaling['type']} is not implemented or invalid"
-                    )
-            return cls(inv_freq, scaling_factor)
-
-        def _update_cos_sin_cache(self, dtype, device, seqlen):
-            # Reset the tables if the sequence length has changed,
-            # or if we're on a new device (possibly due to tracing for instance)
-            if (
-                seqlen > self._seq_len_cached
-                or self._cos_cached.device != device
-                or self._cos_cached.dtype != dtype
-            ):
-                self._seq_len_cached = seqlen
-                t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
-                if self.scaling_factor is not None:
-                    t /= self.scaling_factor
-                # Don't do einsum, it converts fp32 to fp16
-                # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
-
-                freqs = torch.outer(t, self.inv_freq.to(device=t.device))
-                self._cos_cached = torch.cos(freqs).to(dtype)
-                self._sin_cached = torch.sin(freqs).to(dtype)
-
-        def get_cos_sin(
-            self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype
-        ):
-            """
-            Return cos and sin for the asked position ids
-            """
-            if IS_ROCM_SYSTEM:
-                # For RoCm, we always use float cos/sin to avoid a cast.
-                # For NVIDIA, for some reason, the flash-attn rotary kernel requires cos/sin and query/key to be of same dtype: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary.cpp#L26
-                # But later on goes and cast cos/sin to float anyway: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary_cuda.cu#L29, which looks suboptimal.
-                dtype = torch.float32
-
-            self._update_cos_sin_cache(dtype, position_ids.device, max_s)
-
-            cos = torch.index_select(self._cos_cached, 0, position_ids)
-            sin = torch.index_select(self._sin_cached, 0, position_ids)
-
-            # Note: this unsqueeze is not necessary on RoCm + VLLM ROPE implementation, but we leave it as is to avoid yet an other controlflow.
-            return cos.unsqueeze(1), sin.unsqueeze(1)
-
-    class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding):
-        def __init__(self, dim, max_position_embeddings, base, device, scaling_factor):
-            inv_freq = _create_inv_freq(dim, base, device)
-            super().__init__(inv_freq, scaling_factor)
-            self.dim = dim
-            self.max_position_embeddings = max_position_embeddings
-            self.base = base
-
-        def _update_cos_sin_cache(self, dtype, device, seqlen):
-            # Reset the tables if the sequence length has changed,
-            # or if we're on a new device (possibly due to tracing for instance)
-            if (
-                seqlen > self._seq_len_cached
-                or self._cos_cached.device != device
-                or self._cos_cached.dtype != dtype
-            ):
-                if seqlen > self.max_position_embeddings:
-                    newbase = self.base * (
-                        (self.scaling_factor * seqlen / self.max_position_embeddings)
-                        - (self.scaling_factor - 1)
-                    ) ** (self.dim / (self.dim - 2))
-                    self.inv_freq = _create_inv_freq(
-                        self.dim, newbase, self.inv_freq.device
-                    )
-                self._seq_len_cached = seqlen
-                t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
-                # Don't do einsum, it converts fp32 to fp16
-                # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
-
-                freqs = torch.outer(t, self.inv_freq.to(device=t.device))
-                self._cos_cached = torch.cos(freqs).to(dtype)
-                self._sin_cached = torch.sin(freqs).to(dtype)
-
-    # Inverse dim formula to find dim based on number of rotations
-    import math
-
-    def find_correction_dim(
-        num_rotations, dim, base=10000, max_position_embeddings=2048
-    ):
-        return (
-            dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))
-        ) / (2 * math.log(base))
-
-    # Find dim range bounds based on rotations
-    def find_correction_range(
-        low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
-    ):
-        low = math.floor(
-            find_correction_dim(low_rot, dim, base, max_position_embeddings)
-        )
-        high = math.ceil(
-            find_correction_dim(high_rot, dim, base, max_position_embeddings)
-        )
-        return max(low, 0), min(high, dim - 1)  # Clamp values just in case
-
-    def linear_ramp_mask(min, max, dim):
-        if min == max:
-            max += 0.001  # Prevent singularity
-
-        linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
-        ramp_func = torch.clamp(linear_func, 0, 1)
-        return ramp_func
-
-    def get_mscale(scale=1):
-        if scale <= 1:
-            return 1.0
-        return 0.1 * math.log(scale) + 1.0
-
-    class YarnPositionRotaryEmbedding(PositionRotaryEmbedding):
-        def __init__(
-            self,
-            dim,
-            max_position_embeddings,
-            base,
-            device,
-            scaling_factor,
-            *,
-            extrapolation_factor,
-            attn_factor,
-            beta_fast,
-            beta_slow,
-        ):
-            inv_freq = _create_inv_freq(dim, base, device)
-            super().__init__(inv_freq, scaling_factor)
-            self.dim = dim
-            self.max_position_embeddings = max_position_embeddings
-            self.base = base
-            self.extrapolation_factor = extrapolation_factor
-            self.attn_factor = attn_factor
-            self.beta_fast = beta_fast
-            self.beta_slow = beta_slow
-            self.mscale = float(
-                get_mscale(self.scaling_factor) * self.attn_factor
-            )  # Get n-d magnitude scaling corrected for interpolation
-
-        def _update_cos_sin_cache(self, dtype, device, seqlen):
-            # Reset the tables if the sequence length has changed,
-            # or if we're on a new device (possibly due to tracing for instance)
-            if (
-                seqlen > self._seq_len_cached
-                or self._cos_cached.device != device
-                or self._cos_cached.dtype != dtype
-            ):
-                if seqlen > self.max_position_embeddings:
-                    inv_freq_extrapolation = _create_inv_freq(
-                        self.dim, self.base, self.inv_freq.device
-                    )
-                    freqs = 1.0 / inv_freq_extrapolation
-                    inv_freq_interpolation = 1.0 / (self.scaling_factor * freqs)
-                    low, high = find_correction_range(
-                        self.beta_fast,
-                        self.beta_slow,
-                        self.dim,
-                        self.base,
-                        self.max_position_embeddings,
-                    )
-                    inv_freq_mask = (
-                        1
-                        - linear_ramp_mask(low, high, self.dim // 2).float().to(device)
-                    ) * self.extrapolation_factor  # Get n-d rotational scaling corrected for extrapolation
-                    inv_freq = (
-                        inv_freq_interpolation * (1 - inv_freq_mask)
-                        + inv_freq_extrapolation * inv_freq_mask
-                    )
-
-                    self.inv_freq = inv_freq
-                    self.mscale = float(
-                        get_mscale(self.scaling_factor) * self.attn_factor
-                    )  # Get n-d magnitude scaling corrected for interpolation
-
-                self._seq_len_cached = seqlen
-                t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
-                # Don't do einsum, it converts fp32 to fp16
-                # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
-
-                freqs = torch.outer(t, self.inv_freq.to(device=t.device))
-                self._cos_cached = (torch.cos(freqs) * self.mscale).to(dtype)
-                self._sin_cached = (torch.sin(freqs) * self.mscale).to(dtype)
-
-except ImportError:
-    pass
--- a/server/text_generation_server/utils/paged_attention.py
+++ b/server/text_generation_server/utils/paged_attention.py
-import torch
-from text_generation_server.utils.import_utils import (
-    IS_CUDA_SYSTEM,
-    IS_ROCM_SYSTEM,
-    IS_XPU_SYSTEM,
-)
-
-_PARTITION_SIZE = 512
-
-if IS_XPU_SYSTEM:
-    import intel_extension_for_pytorch as ipex
-
-
-def reshape_and_cache(
-    key: torch.Tensor,
-    value: torch.Tensor,
-    key_cache: torch.Tensor,
-    value_cache: torch.Tensor,
-    slots: torch.Tensor,
-):
-    """Forward a `reshape_and_cache` call to the kernel of the detected backend.
-
-    The backend is chosen from the `IS_CUDA_SYSTEM`, `IS_ROCM_SYSTEM` and
-    `IS_XPU_SYSTEM` flags, checked in that order. All five tensors are passed
-    through to the selected kernel; nothing is returned by this wrapper.
-
-    NOTE(review): presumably the kernel writes `key`/`value` into
-    `key_cache`/`value_cache` at the positions given by `slots` -- that is
-    implied by the names only, confirm against the kernel documentation.
-
-    Raises:
-        ValueError: if none of the three backend flags is set.
-    """
-    if IS_CUDA_SYSTEM:
-        # Imported lazily so that the module can be loaded on non-CUDA systems.
-        from vllm._C import cache_ops
-
-        # The CUDA binding takes two extra trailing arguments ("auto", 1.0).
-        # NOTE(review): these look like a KV-cache dtype selector and a scale
-        # factor -- confirm against the vllm version in use.
-        cache_ops.reshape_and_cache(
-            key, value, key_cache, value_cache, slots, "auto", 1.0
-        )
-    elif IS_ROCM_SYSTEM:
-        from vllm import cache_ops
-
-        # cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots)
-        # The ROCm binding is called with `slots` converted via `.int()`
-        # (int32) instead of the tensor as received.
-        cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots.int())
-    elif IS_XPU_SYSTEM:
-        # `ipex` is only imported at module level when IS_XPU_SYSTEM is true.
-        ipex.llm.modules.PagedAttention.reshape_and_cache(
-            key, value, key_cache, value_cache, slots
-        )
-    else:
-        raise ValueError("vllm is not supported on your system")
-
-
-def attention(
-    out: torch.Tensor,
-    query: torch.Tensor,
-    key_cache: torch.Tensor,
-    value_cache: torch.Tensor,
-    kv_head_mapping: torch.Tensor,
-    softmax_scale: float,
-    block_tables: torch.Tensor,
-    input_lengths: torch.Tensor,
-    max_s: int,
-):
-    """Run paged attention through the kernel of the detected backend.
-
-    Shapes read by this function:
-        * `query` is unpacked as `(num_seqs, num_heads, head_size)`.
-        * `value_cache.shape[3]` is used as the block size (see the layout
-          comment below).
-
-    Dispatch:
-        * XPU: `query` is made contiguous and the IPEX
-          `single_query_cached_kv_attention` result is returned directly.
-        * CUDA / ROCm: PagedAttention V1 or V2 is chosen with the heuristic
-          below. These paths return nothing from this wrapper.
-          NOTE(review): presumably the kernels write their result into
-          `out` -- confirm against the kernel documentation.
-
-    Raises:
-        ValueError: if the system is neither XPU, CUDA nor ROCm.
-        AssertionError: on the V2 path, if `_PARTITION_SIZE` is not a
-            multiple of the block size.
-    """
-    # Adapted from: https://github.com/vllm-project/vllm/blob/f8a1e39fae05ca610be8d5a78be9d40f5274e5fc/vllm/model_executor/layers/attention.py
-    # Copyright 2023 The vLLM team. All rights
-    # reserved.
-    #
-    # Licensed under the Apache License, Version 2.0 (the "License");
-    # you may not use this file except in compliance with the License.
-    # You may obtain a copy of the License at
-    #
-    #     http://www.apache.org/licenses/LICENSE-2.0
-    #
-    # Unless required by applicable law or agreed to in writing, software
-    # distributed under the License is distributed on an "AS IS" BASIS,
-    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-    # See the License for the specific language governing permissions and
-    # limitations under the License.
-    #
-
-    # value_cache => [num_blocks, num_heads, head_size, block_size]
-    block_size = value_cache.shape[3]
-    num_seqs, num_heads, head_size = query.shape
-    # Ceiling division: number of _PARTITION_SIZE-sized partitions that cover
-    # max_s. Only used by the CUDA/ROCm paths below.
-    max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE
-    if IS_XPU_SYSTEM:
-        # The XPU path bypasses the V1/V2 selection entirely.
-        query = query.contiguous()
-        return ipex.llm.modules.PagedAttention.single_query_cached_kv_attention(
-            out,
-            query,
-            key_cache,
-            value_cache,
-            kv_head_mapping,
-            softmax_scale,
-            block_tables,
-            input_lengths,
-            block_size,
-            max_s,
-            None,
-        )
-
-    # NOTE(woosuk): We use a simple heuristic to decide whether to use
-    # PagedAttention V1 or V2. If the number of partitions is 1, we use
-    # V1 to avoid the overhead of reduction. Also, if the number of
-    # sequences or heads is large, we use V1 since there is enough work
-    # to parallelize.
-    # V1 is additionally restricted to max_s <= 8192; longer sequences always
-    # take the V2 path.
-    use_v1 = max_s <= 8192 and (max_num_partitions == 1 or num_seqs * num_heads > 512)
-    if use_v1:
-        if IS_CUDA_SYSTEM:
-            from vllm._C import ops
-
-            # The CUDA binding takes two more trailing arguments ("auto", 1.0)
-            # than the ROCm binding below.
-            ops.paged_attention_v1(
-                out,
-                query,
-                key_cache,
-                value_cache,
-                kv_head_mapping,
-                softmax_scale,
-                block_tables,
-                input_lengths,
-                block_size,
-                max_s,
-                None,
-                "auto",
-                1.0,
-            )
-        elif IS_ROCM_SYSTEM:
-            from vllm import attention_ops
-
-            attention_ops.paged_attention_v1(
-                out,
-                query,
-                key_cache,
-                value_cache,
-                kv_head_mapping,
-                softmax_scale,
-                block_tables,
-                input_lengths,
-                block_size,
-                max_s,
-                None,
-            )
-        else:
-            raise ValueError("vllm is not supported on your system")
-
-    else:
-        # Run PagedAttention V2.
-        assert _PARTITION_SIZE % block_size == 0
-        # Scratch buffers handed to the V2 kernel: one slot per
-        # (sequence, head, partition). They are allocated uninitialised.
-        tmp_output = torch.empty(
-            size=(num_seqs, num_heads, max_num_partitions, head_size),
-            dtype=out.dtype,
-            device=out.device,
-        )
-        exp_sums = torch.empty(
-            size=(num_seqs, num_heads, max_num_partitions),
-            dtype=torch.float32,
-            device=out.device,
-        )
-        max_logits = torch.empty_like(exp_sums)
-
-        if IS_CUDA_SYSTEM:
-            from vllm._C import ops
-
-            ops.paged_attention_v2(
-                out,
-                exp_sums,
-                max_logits,
-                tmp_output,
-                query,
-                key_cache,
-                value_cache,
-                kv_head_mapping,
-                softmax_scale,
-                block_tables,
-                input_lengths,
-                block_size,
-                max_s,
-                None,
-                "auto",
-                1.0,
-            )
-        elif IS_ROCM_SYSTEM:
-            from vllm import attention_ops
-
-            attention_ops.paged_attention_v2(
-                out,
-                exp_sums,
-                max_logits,
-                tmp_output,
-                query,
-                key_cache,
-                value_cache,
-                kv_head_mapping,
-                softmax_scale,
-                block_tables,
-                input_lengths,
-                block_size,
-                max_s,
-                None,
-            )
-        else:
-            raise ValueError("vllm is not supported on your system")
--- a/server/vllm/.pylintrc
+++ b/server/vllm/.pylintrc
-# This Pylint rcfile contains a best-effort configuration to uphold the
-# best-practices and style described in the Google Python style guide:
-#   https://google.github.io/styleguide/pyguide.html
-#
-# Its canonical open-source location is:
-#   https://google.github.io/styleguide/pylintrc
-
-[MASTER]
-
-# Files or directories to be skipped. They should be base names, not paths.
-ignore=docs
-
-# Files or directories matching the regex patterns are skipped. The regex
-# matches against base names, not paths.
-ignore-patterns=
-
-# Pickle collected data for later comparisons.
-persistent=no
-
-# List of plugins (as comma separated values of python modules names) to load,
-# usually to register additional checkers.
-load-plugins=
-
-# Use multiple processes to speed up Pylint.
-jobs=4
-
-# Allow loading of arbitrary C extensions. Extensions are imported into the
-# active Python interpreter and may run arbitrary code.
-unsafe-load-any-extension=no
-
-
-[MESSAGES CONTROL]
-
-# Only show warnings with the listed confidence levels. Leave empty to show
-# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
-confidence=
-
-# Enable the message, report, category or checker with the given id(s). You can
-# either give multiple identifiers separated by comma (,) or put this option
-# multiple times (only on the command line, not in the configuration file where
-# it should appear only once). See also the "--disable" option for examples.
-#enable=
-
-# Disable the message, report, category or checker with the given id(s). You
-# can either give multiple identifiers separated by comma (,) or put this
-# option multiple times (only on the command line, not in the configuration
-# file where it should appear only once). You can also use "--disable=all" to
-# disable everything first and then re-enable specific checks. For example, if
-# you want to run only the similarities checker, you can use "--disable=all
-# --enable=similarities". If you want to run only the classes checker, but have
-# no Warning level messages displayed, use "--disable=all --enable=classes
-# --disable=W"
-disable=abstract-method,
-        apply-builtin,
-        arguments-differ,
-        attribute-defined-outside-init,
-        backtick,
-        bad-option-value,
-        basestring-builtin,
-        buffer-builtin,
-        c-extension-no-member,
-        consider-using-enumerate,
-        cmp-builtin,
-        cmp-method,
-        coerce-builtin,
-        coerce-method,
-        delslice-method,
-        div-method,
-        duplicate-code,
-        eq-without-hash,
-        execfile-builtin,
-        file-builtin,
-        filter-builtin-not-iterating,
-        fixme,
-        getslice-method,
-        global-statement,
-        hex-method,
-        idiv-method,
-        implicit-str-concat-in-sequence,
-        import-error,
-        import-self,
-        import-star-module-level,
-        inconsistent-return-statements,
-        input-builtin,
-        intern-builtin,
-        invalid-str-codec,
-        locally-disabled,
-        logging-fstring-interpolation,  # added by vLLM
-        logging-not-lazy,  # added by vLLM
-        long-builtin,
-        long-suffix,
-        map-builtin-not-iterating,
-        misplaced-comparison-constant,
-        missing-class-docstring,  # TODO (vLLM): enable
-        missing-function-docstring,
-        missing-module-docstring,  # TODO (vLLM): enable
-        metaclass-assignment,
-        next-method-called,
-        next-method-defined,
-        no-absolute-import,
-        no-else-break,
-        no-else-continue,
-        no-else-raise,
-        no-else-return,
-        no-init,  # added
-        no-member,
-        no-name-in-module,
-        no-self-use,
-        nonzero-method,
-        oct-method,
-        old-division,
-        old-ne-operator,
-        old-octal-literal,
-        old-raise-syntax,
-        parameter-unpacking,
-        print-statement,
-        raising-string,
-        range-builtin-not-iterating,
-        raw_input-builtin,
-        rdiv-method,
-        reduce-builtin,
-        relative-import,
-        reload-builtin,
-        round-builtin,
-        setslice-method,
-        signature-differs,
-        standarderror-builtin,
-        suppressed-message,
-        sys-max-int,
-        too-few-public-methods,
-        too-many-ancestors,
-        too-many-arguments,
-        too-many-boolean-expressions,
-        too-many-branches,
-        too-many-instance-attributes,
-        too-many-locals,
-        too-many-nested-blocks,
-        too-many-public-methods,
-        too-many-return-statements,
-        too-many-statements,
-        trailing-newlines,
-        unichr-builtin,
-        unicode-builtin,
-        unnecessary-pass,
-        unpacking-in-except,
-        unspecified-encoding,
-        useless-else-on-loop,
-        useless-object-inheritance,
-        useless-suppression,
-        using-cmp-argument,
-        wrong-import-order,
-        xrange-builtin,
-        zip-builtin-not-iterating,
-
-
-[REPORTS]
-
-# Set the output format. Available formats are text, parseable, colorized, msvs
-# (visual studio) and html. You can also give a reporter class, eg
-# mypackage.mymodule.MyReporterClass.
-output-format=text
-
-# Tells whether to display a full report or only the messages
-reports=no
-
-# Python expression which should return a note less than 10 (10 is the highest
-# note). You have access to the variables errors warning, statement which
-# respectively contain the number of errors / warnings messages and the total
-# number of statements analyzed. This is used by the global evaluation report
-# (RP0004).
-evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
-
-# Template used to display messages. This is a python new-style format string
-# used to format the message information. See doc for all details
-#msg-template=
-
-
-[BASIC]
-
-# Good variable names which should always be accepted, separated by a comma
-good-names=main,_
-
-# Bad variable names which should always be refused, separated by a comma
-bad-names=
-
-# Colon-delimited sets of names that determine each other's naming style when
-# the name regexes allow several styles.
-name-group=
-
-# Include a hint for the correct naming format with invalid-name
-include-naming-hint=no
-
-# List of decorators that produce properties, such as abc.abstractproperty. Add
-# to this list to register other decorators that produce valid properties.
-property-classes=abc.abstractproperty,cached_property.cached_property,cached_property.threaded_cached_property,cached_property.cached_property_with_ttl,cached_property.threaded_cached_property_with_ttl
-
-# Regular expression matching correct function names
-function-rgx=^(?:(?P<exempt>setUp|tearDown|setUpModule|tearDownModule)|(?P<camel_case>_?[A-Z][a-zA-Z0-9]*)|(?P<snake_case>_?[a-z][a-z0-9_]*))$
-
-# Regular expression matching correct variable names
-variable-rgx=^[a-z][a-z0-9_]*$
-
-# Regular expression matching correct constant names
-const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
-
-# Regular expression matching correct attribute names
-attr-rgx=^_{0,2}[a-z][a-z0-9_]*$
-
-# Regular expression matching correct argument names
-argument-rgx=^[a-z][a-z0-9_]*$
-
-# Regular expression matching correct class attribute names
-class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
-
-# Regular expression matching correct inline iteration names
-inlinevar-rgx=^[a-z][a-z0-9_]*$
-
-# Regular expression matching correct class names
-class-rgx=^_?[A-Z][a-zA-Z0-9]*$
-
-# Regular expression matching correct module names
-module-rgx=^(_?[a-z][a-z0-9_]*|__init__)$
-
-# Regular expression matching correct method names
-method-rgx=(?x)^(?:(?P<exempt>_[a-z0-9_]+__|runTest|setUp|tearDown|setUpTestCase|tearDownTestCase|setupSelf|tearDownClass|setUpClass|(test|assert)_*[A-Z0-9][a-zA-Z0-9_]*|next)|(?P<camel_case>_{0,2}[A-Z][a-zA-Z0-9_]*)|(?P<snake_case>_{0,2}[a-z][a-z0-9_]*))$
-
-# Regular expression which should only match function or class names that do
-# not require a docstring.
-no-docstring-rgx=(__.*__|main|test.*|.*test|.*Test)$
-
-# Minimum line length for functions/classes that require docstrings, shorter
-# ones are exempt.
-docstring-min-length=10
-
-
-[TYPECHECK]
-
-# List of decorators that produce context managers, such as
-# contextlib.contextmanager. Add to this list to register other decorators that
-# produce valid context managers.
-contextmanager-decorators=contextlib.contextmanager,contextlib2.contextmanager
-
-# Tells whether missing members accessed in mixin class should be ignored. A
-# mixin class is detected if its name ends with "mixin" (case insensitive).
-ignore-mixin-members=yes
-
-# List of module names for which member attributes should not be checked
-# (useful for modules/projects where namespaces are manipulated during runtime
-# and thus existing member attributes cannot be deduced by static analysis. It
-# supports qualified module names, as well as Unix pattern matching.
-ignored-modules=
-
-# List of class names for which member attributes should not be checked (useful
-# for classes with dynamically set attributes). This supports the use of
-# qualified names.
-ignored-classes=optparse.Values,thread._local,_thread._local
-
-# List of members which are set dynamically and missed by pylint inference
-# system, and so shouldn't trigger E1101 when accessed. Python regular
-# expressions are accepted.
-generated-members=
-
-
-[FORMAT]
-
-# Maximum number of characters on a single line.
-max-line-length=80
-
-# TODO(https://github.com/PyCQA/pylint/issues/3352): Direct pylint to exempt
-# lines made too long by directives to pytype.
-
-# Regexp for a line that is allowed to be longer than the limit.
-ignore-long-lines=(?x)(
-  ^\s*(\#\ )?<?https?://\S+>?$|
-  ^\s*(from\s+\S+\s+)?import\s+.+$)
-
-# Allow the body of an if to be on the same line as the test if there is no
-# else.
-single-line-if-stmt=yes
-
-# Maximum number of lines in a module
-max-module-lines=99999
-
-# String used as indentation unit.  The internal Google style guide mandates 2
-# spaces.  Google's externally-published style guide says 4, consistent with
-# PEP 8.  Here, we use 4 spaces (see the value below), rather than the 2 spaces
-# used by many open-sourced Google projects (like TensorFlow).
-indent-string='    '
-
-# Number of spaces of indent required inside a hanging or continued line.
-indent-after-paren=4
-
-# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
-expected-line-ending-format=
-
-
-[MISCELLANEOUS]
-
-# List of note tags to take in consideration, separated by a comma.
-notes=TODO
-
-
-[STRING]
-
-# This flag controls whether inconsistent-quotes generates a warning when the
-# character used as a quote delimiter is used inconsistently within a module.
-check-quote-consistency=yes
-
-
-[VARIABLES]
-
-# Tells whether we should check for unused import in __init__ files.
-init-import=no
-
-# A regular expression matching the name of dummy variables (i.e. expectedly
-# not used).
-dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_)
-
-# List of additional names supposed to be defined in builtins. Remember that
-# you should avoid defining new builtins when possible.
-additional-builtins=
-
-# List of strings which can identify a callback function by name. A callback
-# name must start or end with one of those strings.
-callbacks=cb_,_cb
-
-# List of qualified module names which can have objects that can redefine
-# builtins.
-redefining-builtins-modules=six,six.moves,past.builtins,future.builtins,functools
-
-
-[LOGGING]
-
-# Logging modules to check that the string format arguments are in logging
-# function parameter format
-logging-modules=logging,absl.logging,tensorflow.io.logging
-
-
-[SIMILARITIES]
-
-# Minimum lines number of a similarity.
-min-similarity-lines=4
-
-# Ignore comments when computing similarities.
-ignore-comments=yes
-
-# Ignore docstrings when computing similarities.
-ignore-docstrings=yes
-
-# Ignore imports when computing similarities.
-ignore-imports=no
-
-
-[SPELLING]
-
-# Spelling dictionary name. Available dictionaries: none. To make it work,
-# install the python-enchant package.
-spelling-dict=
-
-# List of comma separated words that should not be checked.
-spelling-ignore-words=
-
-# A path to a file that contains private dictionary; one word per line.
-spelling-private-dict-file=
-
-# Tells whether to store unknown words to indicated private dictionary in
-# --spelling-private-dict-file option instead of raising a message.
-spelling-store-unknown-words=no
-
-
-[IMPORTS]
-
-# Deprecated modules which should not be used, separated by a comma
-deprecated-modules=regsub,
-                   TERMIOS,
-                   Bastion,
-                   rexec,
-                   sets
-
-# Create a graph of every (i.e. internal and external) dependencies in the
-# given file (report RP0402 must not be disabled)
-import-graph=
-
-# Create a graph of external dependencies in the given file (report RP0402 must
-# not be disabled)
-ext-import-graph=
-
-# Create a graph of internal dependencies in the given file (report RP0402 must
-# not be disabled)
-int-import-graph=
-
-# Force import order to recognize a module as part of the standard
-# compatibility libraries.
-known-standard-library=
-
-# Force import order to recognize a module as part of a third party library.
-known-third-party=enchant, absl
-
-# Analyse import fallback blocks. This can be used to support both Python 2 and
-# 3 compatible code, which means that the block might have code that exists
-# only in one or another interpreter, leading to false positives when analysed.
-analyse-fallback-blocks=no
-
-
-[CLASSES]
-
-# List of method names used to declare (i.e. assign) instance attributes.
-defining-attr-methods=__init__,
-                      __new__,
-                      setUp
-
-# List of member names, which should be excluded from the protected access
-# warning.
-exclude-protected=_asdict,
-                  _fields,
-                  _replace,
-                  _source,
-                  _make
-
-# List of valid names for the first argument in a class method.
-valid-classmethod-first-arg=cls,
-                            class_
-
-# List of valid names for the first argument in a metaclass class method.
-valid-metaclass-classmethod-first-arg=mcs
-
-
-[EXCEPTIONS]
-
-# Exceptions that will emit a warning when being caught. Defaults to
-# "Exception"
-overgeneral-exceptions=StandardError,
-                       Exception,
-                       BaseException
--- a/server/vllm/.readthedocs.yaml
+++ b/server/vllm/.readthedocs.yaml
-# Read the Docs configuration file
-# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
-
-version: 2
-
-build:
-  os: ubuntu-22.04
-  tools:
-    python: "3.8"
-
-sphinx:
-   configuration: docs/source/conf.py
-
-# If using Sphinx, optionally build your docs in additional formats such as PDF
-formats:
-   - pdf
-
-# Optionally declare the Python requirements required to build your docs
-python:
-   install:
-   - requirements: docs/requirements-docs.txt