Commit e019635f authored by xuxzh1

update

parent 64def8e2
import {check} from 'k6';
import http from 'k6/http';
import {Trend} from 'k6/metrics';
const host = __ENV.HOST || '127.0.0.1:3000';
const totalTime = new Trend('total_time', true);
const validationTime = new Trend('validation_time', true);
const queueTime = new Trend('queue_time', true);
const inferenceTime = new Trend('inference_time', true);
const timePerToken = new Trend('time_per_token', true);
const example = {
payload: JSON.stringify({
inputs: '# This is a fibonacci function written in the Python programming language.' +
'def fibonacci',
parameters: {
details: true,
max_new_tokens: 60,
temperature: 0.2,
top_p: 0.95,
seed: 0,
},
}),
generated_tokens: 60
};
export const options = {
thresholds: {
http_req_failed: ['rate==0'],
time_per_token: ['p(95)<90'],
queue_time: ['p(95)<1500'],
},
scenarios: {
load_test: {
executor: 'constant-arrival-rate',
duration: '60s',
preAllocatedVUs: 100,
rate: 10,
timeUnit: '1s',
},
},
};
export default function () {
const headers = {'Content-Type': 'application/json'};
const res = http.post(`http://${host}/generate`, example.payload, {
headers,
});
check(res, {
        'Post status is 200': (r) => r.status === 200,
        'Post response generated tokens': (r) => r.status === 200 && r.json().details.generated_tokens === example.generated_tokens,
});
if (res.status === 200) {
totalTime.add(res.headers["X-Total-Time"]);
validationTime.add(res.headers["X-Validation-Time"]);
queueTime.add(res.headers["X-Queue-Time"]);
inferenceTime.add(res.headers["X-Inference-Time"]);
timePerToken.add(res.headers["X-Time-Per-Token"]);
}
}
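// The Trend metrics above are fed from the timing headers read off each /generate response
// (X-Total-Time, X-Validation-Time, X-Queue-Time, X-Inference-Time, X-Time-Per-Token).
// A typical invocation, assuming a router listening on the HOST default above, would be
// something like: k6 run -e HOST=127.0.0.1:3000 <this script>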
/// Single shard Client
use crate::pb::generate::v2::text_generation_service_client::TextGenerationServiceClient;
use crate::pb::generate::v2::*;
use crate::Result;
use grpc_metadata::InjectTelemetryContext;
use std::cmp::min;
use std::time::Duration;
use tonic::transport::{Channel, Uri};
use tracing::instrument;
/// Text Generation Inference gRPC client
#[derive(Debug, Clone)]
pub struct Client {
stub: TextGenerationServiceClient<Channel>,
}
impl Client {
/// Returns a client connected to the given url
pub async fn connect(uri: Uri) -> Result<Self> {
let channel = Channel::builder(uri).connect().await?;
Ok(Self {
stub: TextGenerationServiceClient::new(channel),
})
}
/// Returns a client connected to the given unix socket
pub async fn connect_uds(path: String) -> Result<Self> {
let channel = Channel::from_shared("http://[::]:50051".to_string())
.unwrap()
.connect_with_connector(tower::service_fn(move |_: Uri| {
tokio::net::UnixStream::connect(path.clone())
}))
.await?;
Ok(Self {
stub: TextGenerationServiceClient::new(channel),
})
}
/// Returns a list of uris or unix sockets of all shards
#[instrument(skip(self))]
pub async fn service_discovery(&mut self) -> Result<Vec<String>> {
let request = tonic::Request::new(ServiceDiscoveryRequest {}).inject_context();
let response = self.stub.service_discovery(request).await?;
let urls = response
.into_inner()
.urls
.into_iter()
// Remove unix socket prefix
.map(|url| match url.strip_prefix("unix://") {
None => url,
Some(stripped_url) => stripped_url.to_string(),
})
.collect();
Ok(urls)
}
/// Get model info
#[instrument(skip(self))]
pub async fn info(&mut self) -> Result<InfoResponse> {
let request = tonic::Request::new(InfoRequest {}).inject_context();
let response = self.stub.info(request).await?.into_inner();
Ok(response)
}
/// Get model health
#[instrument(skip(self))]
pub async fn health(&mut self) -> Result<HealthResponse> {
let request = tonic::Request::new(HealthRequest {}).inject_context();
let response = self.stub.health(request).await?.into_inner();
Ok(response)
}
/// Clear the past generations cache
#[instrument(skip(self))]
pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
let request = tonic::Request::new(ClearCacheRequest { id: batch_id }).inject_context();
self.stub.clear_cache(request).await?;
Ok(())
}
/// Filter a cached batch
#[instrument(skip(self))]
pub async fn filter_batch(
&mut self,
batch_id: u64,
request_ids: Vec<u64>,
) -> Result<Option<CachedBatch>> {
let request = tonic::Request::new(FilterBatchRequest {
batch_id,
request_ids,
})
.inject_context();
let filtered_batch = self.stub.filter_batch(request).await?.into_inner();
Ok(filtered_batch.batch)
}
/// Warmup on a max size batch
///
    /// Returns the maximum number of tokens supported by the hardware
#[instrument(skip_all)]
pub async fn warmup(
&mut self,
max_input_length: u32,
max_prefill_tokens: u32,
max_total_tokens: u32,
max_batch_size: Option<usize>,
) -> Result<Option<u32>> {
let mut n_tokens = 0;
let mut requests = Vec::new();
// Create requests
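        // Each request repeats a filler token `max_input_length` times (the server truncates it
        // to `truncate`); requests are added until the batch covers `max_prefill_tokens`, or
        // `max_batch_size` is reached if set.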
while n_tokens < max_prefill_tokens {
let truncate = min(max_input_length, max_prefill_tokens - n_tokens);
let mut inputs = String::new();
inputs.push_str(&"_test ".to_string().repeat(max_input_length as usize));
if n_tokens == 0 {
// 1 request is enough to test vision heads.
// Sending images on other queries messes up easily with truncation.
inputs.push_str("![](data:image/jpeg;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII=)");
}
requests.push(Request {
id: 0,
// We truncate the input on the server side to be sure that it has the correct size
inputs,
truncate,
// Set sampling parameters to also take these ops into account in the max memory
parameters: Some(NextTokenChooserParameters {
temperature: 0.9,
top_k: 10,
top_p: 0.9,
typical_p: 0.9,
do_sample: false,
seed: 0,
repetition_penalty: 1.2,
frequency_penalty: 0.1,
watermark: true,
grammar: String::new(),
grammar_type: GrammarType::None as i32,
}),
stopping_parameters: Some(StoppingCriteriaParameters {
max_new_tokens: max_total_tokens - truncate,
stop_sequences: vec![],
ignore_eos_token: true,
}),
prefill_logprobs: true,
top_n_tokens: 20,
});
n_tokens += max_input_length;
// Check max_batch_size
if Some(requests.len()) == max_batch_size {
break;
}
}
let batch = Batch {
id: 0,
size: requests.len() as u32,
requests,
max_tokens: 0,
};
let request = tonic::Request::new(WarmupRequest {
batch: Some(batch),
max_input_length,
max_prefill_tokens,
max_total_tokens,
})
.inject_context();
let response = self.stub.warmup(request).await?.into_inner();
Ok(response.max_supported_total_tokens)
}
/// Generate one token for each request in the given batch
///
/// Returns Generation for each request in batch
/// and the next cached batch
#[instrument(skip_all, fields(id = &batch.id, size = &batch.size))]
pub async fn prefill(
&mut self,
batch: Batch,
) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
let request = tonic::Request::new(PrefillRequest { batch: Some(batch) }).inject_context();
let response = self.stub.prefill(request).await?.into_inner();
Ok((
response.generations,
response.batch,
PrefillTimings::new(response.forward_ns, response.decode_ns, response.total_ns),
))
}
/// Generate one token for each request in the given cached batches
///
/// Returns Generation for each request in batches
/// and the next cached batch
#[instrument(skip_all, fields(size = batches.iter().map(|batch|{batch.size}).sum::<u32>()))]
pub async fn decode(
&mut self,
batches: Vec<CachedBatch>,
) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
let request = tonic::Request::new(DecodeRequest { batches }).inject_context();
let response = self.stub.decode(request).await?.into_inner();
Ok((
response.generations,
response.batch,
DecodeTimings::new(
response.concat_ns,
response.forward_ns,
response.decode_ns,
response.total_ns,
),
))
}
}
pub struct PrefillTimings {
pub forward: Duration,
pub decode: Duration,
pub total: Duration,
}
impl PrefillTimings {
fn new(forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
Self {
forward: Duration::from_nanos(forward_ns),
decode: Duration::from_nanos(decode_ns),
total: Duration::from_nanos(total_ns),
}
}
}
pub struct DecodeTimings {
pub concat: Option<Duration>,
pub forward: Duration,
pub decode: Duration,
pub total: Duration,
}
impl DecodeTimings {
fn new(concat_ns: Option<u64>, forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
Self {
concat: concat_ns.map(Duration::from_nanos),
forward: Duration::from_nanos(forward_ns),
decode: Duration::from_nanos(decode_ns),
total: Duration::from_nanos(total_ns),
}
}
}
use crate::client::{DecodeTimings, PrefillTimings};
/// Multi shard Client
use crate::{Batch, CachedBatch, Client, Generation, HealthResponse, ShardInfo};
use crate::{ClientError, Result};
use futures::future::join_all;
use tonic::transport::Uri;
use tracing::instrument;
#[derive(Debug, Clone)]
/// Text Generation Inference gRPC multi client
pub struct ShardedClient {
clients: Vec<Client>,
}
impl ShardedClient {
fn new(clients: Vec<Client>) -> Self {
Self { clients }
}
/// Create a new ShardedClient from a master client. The master client will communicate with
/// the other shards and return all uris/unix sockets with the `service_discovery` gRPC method.
async fn from_master_client(mut master_client: Client) -> Result<Self> {
// Get all uris/unix sockets from the master client
let uris = master_client.service_discovery().await?;
let futures = uris.into_iter().map(Client::connect_uds);
let clients: Result<Vec<Client>> = join_all(futures).await.into_iter().collect();
Ok(Self::new(clients?))
}
/// Returns a client connected to the given uri
pub async fn connect(uri: Uri) -> Result<Self> {
let master_client = Client::connect(uri).await?;
Self::from_master_client(master_client).await
}
/// Returns a client connected to the given unix socket
pub async fn connect_uds(path: String) -> Result<Self> {
let master_client = Client::connect_uds(path).await?;
Self::from_master_client(master_client).await
}
/// Get the model info
#[instrument(skip(self))]
pub async fn info(&mut self) -> Result<ShardInfo> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| client.info())
.collect();
join_all(futures).await.pop().unwrap()
}
/// GRPC health check
#[instrument(skip(self))]
pub async fn health(&mut self) -> Result<HealthResponse> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| client.health())
.collect();
join_all(futures).await.pop().unwrap()
}
/// Clear the past generations cache
#[instrument(skip(self))]
pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| client.clear_cache(batch_id))
.collect();
join_all(futures).await.into_iter().collect()
}
/// Filter a cached batch
#[instrument(skip(self))]
pub async fn filter_batch(
&mut self,
batch_id: u64,
request_ids: Vec<u64>,
) -> Result<Option<CachedBatch>> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| Box::pin(client.filter_batch(batch_id, request_ids.clone())))
.collect();
// all shards return the same message
join_all(futures).await.pop().unwrap()
}
/// Warmup on a max size batch
///
    /// Returns the maximum number of tokens supported by the hardware
#[instrument(skip(self))]
pub async fn warmup(
&mut self,
max_input_length: u32,
max_prefill_tokens: u32,
max_total_tokens: u32,
max_batch_size: Option<usize>,
) -> Result<Option<u32>> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| {
Box::pin(client.warmup(
max_input_length,
max_prefill_tokens,
max_total_tokens,
max_batch_size,
))
})
.collect();
// Take the minimum value
let results = join_all(futures)
.await
.into_iter()
.collect::<Result<Vec<Option<u32>>>>()?;
Ok(results.into_iter().flatten().min())
}
/// Generate one token for each request in the given batch
///
/// Returns Generation for each request in batch
/// and the next cached batch
    #[instrument(skip_all, fields(id = &batch.id, size = &batch.size))]
pub async fn prefill(
&mut self,
batch: Batch,
) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| Box::pin(client.prefill(batch.clone())))
.collect();
#[allow(clippy::type_complexity)]
let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)>> =
join_all(futures).await.into_iter().collect();
let mut results = results?;
let (mut generations, next_batch, mut timings) =
results.pop().ok_or(ClientError::EmptyResults)?;
// Merge generations from different model shards
for (mut shard_generations, _, shard_timings) in results.into_iter() {
generations.append(&mut shard_generations);
// Return the timings of the slowest shard
if shard_timings.total > timings.total {
timings = shard_timings;
}
}
Ok((generations, next_batch, timings))
}
/// Generate one token for each request in the given cached batches
///
/// Returns Generation for each request in batches
/// and the next cached batch
    #[instrument(skip_all, fields(size = batches.iter().map(|batch| batch.size).sum::<u32>()))]
pub async fn decode(
&mut self,
batches: Vec<CachedBatch>,
) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| Box::pin(client.decode(batches.clone())))
.collect();
#[allow(clippy::type_complexity)]
let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)>> =
join_all(futures).await.into_iter().collect();
let mut results = results?;
let (mut generations, next_batch, mut timings) =
results.pop().ok_or(ClientError::EmptyResults)?;
// Merge generations from different model shards
for (mut shard_generations, _, shard_timings) in results.into_iter() {
generations.append(&mut shard_generations);
// Return the timings of the slowest shard
if shard_timings.total > timings.total {
timings = shard_timings;
}
}
Ok((generations, next_batch, timings))
}
}
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use text_generation_client::GrammarType as ProtoGrammarType;
use text_generation_client::{
Batch, NextTokenChooserParameters, Request, ShardedClient, StoppingCriteriaParameters,
};
// Note: Request ids and batch ids cannot collide.
const LIVENESS_ID: u64 = u64::MAX;
const BATCH_ID: u64 = u64::MAX;
#[derive(Clone, Debug)]
pub(crate) struct Health {
client: ShardedClient,
generation_health: Arc<AtomicBool>,
}
impl Health {
pub(crate) fn new(client: ShardedClient, generation_health: Arc<AtomicBool>) -> Self {
Self {
client,
generation_health,
}
}
pub(crate) async fn check(&mut self) -> bool {
if self.generation_health.load(Ordering::SeqCst) {
// Generation is healthy, we only check that the shards are answering gRPC calls
self.client.health().await.is_ok()
} else {
            // Generation is unhealthy or we have not sent any generation request yet
// Dummy batch of 1 token and 1 generated token
let liveness_request = Request {
id: LIVENESS_ID,
inputs: "liveness".to_string(),
truncate: 10,
prefill_logprobs: false,
parameters: Some(NextTokenChooserParameters {
temperature: 1.0,
top_k: 0,
top_p: 1.0,
typical_p: 1.0,
do_sample: false,
seed: 0,
repetition_penalty: 1.0,
frequency_penalty: 0.0,
watermark: false,
grammar: String::new(),
grammar_type: ProtoGrammarType::None as i32,
}),
stopping_parameters: Some(StoppingCriteriaParameters {
max_new_tokens: 1,
stop_sequences: vec![],
ignore_eos_token: false,
}),
top_n_tokens: 0,
};
let batch = Batch {
id: BATCH_ID,
requests: vec![liveness_request],
size: 1,
max_tokens: 2,
};
// Skips the queue
let value = self.client.prefill(batch).await.is_ok();
// Update generation health
self.generation_health.store(value, Ordering::SeqCst);
value
}
}
}
import math
import torch
from typing import Optional, List, Tuple
from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
BLOCK_SIZE: int = 16
# Will be set in warmup
CACHE_MANAGER: Optional["CacheManager"] = None
class CacheManager:
def __init__(
self,
num_blocks: int,
num_layers: int,
num_heads: int,
head_size: int,
repeat_slots: bool,
dtype: torch.dtype,
device: torch.device,
):
self.block_size = BLOCK_SIZE
self.num_blocks = num_blocks
self.repeat_slots = repeat_slots
element_size = torch.tensor([], dtype=dtype).element_size()
if IS_XPU_SYSTEM:
x = 1
else:
x = self.block_size // element_size
self.kv_cache = [
(
torch.empty(
(num_blocks, num_heads, head_size // x, self.block_size, x),
dtype=dtype,
device=device,
),
torch.empty(
(num_blocks, num_heads, head_size, self.block_size),
dtype=dtype,
device=device,
),
)
for _ in range(num_layers)
]
self.free_block_mask = torch.ones(num_blocks, dtype=torch.int32, device="cpu")
self.slots = torch.arange(
0, num_blocks * self.block_size, dtype=torch.int64
).view(num_blocks, self.block_size)
def allocate(
self,
needed_blocks_slots: List[Tuple[int, int]],
blocks: int,
max_blocks: int,
device: torch.device,
):
        # Get free block indices by finding values in the mask that are not set to 0
free_block_indices = self.free_block_mask.nonzero()
if blocks > len(free_block_indices):
raise RuntimeError(
f"Out of available cache blocks: asked {blocks}, only {len(free_block_indices)} free blocks"
)
# Slice by the number of required blocks
block_indices = free_block_indices[:blocks]
block_indices = block_indices.flatten()
# Padded block tables
block_tables_tensor = torch.zeros(
(len(needed_blocks_slots), max_blocks), dtype=torch.int32
)
# Allocate paged attention blocks
cumulative_blocks = 0
slots = []
block_tables = []
for i, (needed_blocks, needed_slots) in enumerate(needed_blocks_slots):
# Get allocated blocks for this sequence
allocated_blocks = block_indices[
cumulative_blocks : cumulative_blocks + needed_blocks
]
# Get slots for the allocated blocks
all_slots = self.slots[allocated_blocks].flatten()
# Repeat slots in the case of context sliding window
if needed_slots > len(all_slots) and self.repeat_slots:
repeats = math.ceil(needed_slots / len(all_slots))
all_slots = all_slots.repeat(repeats)
allocated_slots = all_slots[:needed_slots]
slots.append(allocated_slots)
block_tables.append(allocated_blocks.tolist())
block_tables_tensor[i, :needed_blocks] = allocated_blocks
cumulative_blocks += needed_blocks
block_tables_tensor = block_tables_tensor.to(device)
slots = torch.concat(slots).to(device)
# Allocate the required number of blocks by setting the mask to 0
self.free_block_mask[block_indices] = 0
return block_tables, block_tables_tensor, slots
def free(self, block_indices: Optional[List[int]]):
if block_indices is not None and block_indices:
# Reset mask
self.free_block_mask[block_indices] = 1
def set_cache_manager(
num_blocks: int,
num_layers: int,
num_heads: int,
head_size: int,
repeat_slots: bool,
dtype: torch.dtype,
device: torch.device,
) -> CacheManager:
global CACHE_MANAGER
if CACHE_MANAGER is not None:
del CACHE_MANAGER
torch.cuda.empty_cache()
CACHE_MANAGER = CacheManager(
num_blocks, num_layers, num_heads, head_size, repeat_slots, dtype, device
)
return CACHE_MANAGER
def get_cache_manager() -> CacheManager:
global CACHE_MANAGER
if CACHE_MANAGER is None:
raise RuntimeError("cache manager was not initialized")
return CACHE_MANAGER
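
# A minimal sketch of how the cache manager hands out paged-attention blocks, runnable on
# CPU for illustration only; the sizes below are arbitrary assumptions, not server defaults
# (the real values are chosen during warmup).
if __name__ == "__main__":
    manager = set_cache_manager(
        num_blocks=8,
        num_layers=2,
        num_heads=4,
        head_size=64,
        repeat_slots=False,
        dtype=torch.float16,
        device=torch.device("cpu"),
    )
    # A 20-token sequence needs ceil(20 / BLOCK_SIZE) = 2 blocks and 20 slots.
    block_tables, block_tables_tensor, slots = manager.allocate(
        needed_blocks_slots=[(2, 20)], blocks=2, max_blocks=2, device=torch.device("cpu")
    )
    assert len(block_tables[0]) == 2 and slots.shape[0] == 20
    manager.free(block_tables[0])  # return the blocks to the free pool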
import torch
from typing import List
AWQ_PACK_ORDER = [0, 2, 4, 6, 1, 3, 5, 7]
REVERSE_AWQ_PACK_ORDER = [0, 4, 1, 5, 2, 6, 3, 7]
def pack(imatrix: torch.Tensor, direction: str = "column"):
"""
Packs a 4-bit integer matrix into a packed 32-bit integer matrix.
Args:
imatrix (torch.Tensor): matrix of integers
direction (str): direction of packing, either "column" or "row"
Returns:
qmatrix (torch.Tensor): packed matrix of integers
"""
shifts = torch.arange(0, 32, 4, dtype=torch.int32, device=imatrix.device)
    imatrix = imatrix.to(torch.int8) & 0x0F  # mask to 4 bits to correct any overflow
if direction == "column":
imatrix = imatrix.view(-1, imatrix.shape[1] // (32 // 4), (32 // 4))
qmatrix = torch.bitwise_left_shift(imatrix, shifts[None, None, :]).sum(dim=-1)
elif direction == "row":
imatrix = imatrix.view(imatrix.shape[0] // (32 // 4), (32 // 4), -1)
qmatrix = torch.bitwise_left_shift(imatrix, shifts[None, :, None]).sum(dim=1)
qmatrix = qmatrix.to(torch.int32)
return qmatrix
def unpack(qmatrix: torch.Tensor, direction: str = "column"):
"""
Unpacks a 32-bit packed integer matrix into a 4-bit integer matrix.
Args:
qmatrix (torch.Tensor): matrix of packed integers
direction (str): direction of unpacking, either "column" or "row"
Returns:
imatrix (torch.Tensor): matrix of integers
"""
shifts = torch.arange(0, 32, 4, device=qmatrix.device)
if direction == "column":
imatrix = torch.bitwise_right_shift(
qmatrix[:, :, None], shifts[None, None, :]
).view(qmatrix.shape[0], -1)
elif direction == "row":
imatrix = torch.bitwise_right_shift(
qmatrix[:, None, :], shifts[None, :, None]
).view(-1, qmatrix.shape[-1])
    imatrix = imatrix.to(torch.int8) & 0x0F  # mask to 4 bits to correct any overflow
return imatrix
def apply_order(
imatrix: torch.Tensor,
direction: str = "column",
order: List[int] = AWQ_PACK_ORDER,
):
"""
Applies the order to a 4-bit integer matrix.
Args:
imatrix (torch.Tensor): matrix of integers
direction (str): direction of applying order, either "column" or "row"
order (List[int]): order to apply, default is AWQ_PACK_ORDER
Returns:
imatrix (torch.Tensor): matrix of integers
"""
if direction == "column":
imatrix = imatrix.view(-1, (32 // 4))[:, order].view(imatrix.shape)
elif direction == "row":
imatrix = imatrix.view((32 // 4), -1)[order, :].view(imatrix.shape)
return imatrix
def fast_awq_to_gptq(qweight, qzeros):
# awq uses column packing for both weights and zeros
izeros = unpack(qzeros, direction="column")
iweights = unpack(qweight, direction="column")
# Reverse the order of the iweight and izeros tensors
izeros = apply_order(izeros, direction="column", order=REVERSE_AWQ_PACK_ORDER)
iweights = apply_order(iweights, direction="column", order=REVERSE_AWQ_PACK_ORDER)
# Subtract 1 from the izeros tensor (gptq adds 1 to the zeros)
izeros = izeros - 1
# exllama uses row packing for weights and column packing for zeros
qzeros = pack(izeros, direction="column")
qweight = pack(iweights, direction="row")
return qweight, qzeros
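
# A small self-check sketch (illustrative only, not part of the conversion path): pack()
# folds eight 4-bit values into one int32 along the chosen axis and unpack() reverses it,
# so a column pack/unpack round trip recovers the original nibbles. The shape is an
# arbitrary assumption; any width divisible by 8 works.
if __name__ == "__main__":
    nibbles = torch.randint(0, 16, (4, 16), dtype=torch.int32)
    packed = pack(nibbles, direction="column")  # (4, 16) 4-bit values -> (4, 2) int32 words
    restored = unpack(packed, direction="column")  # back to (4, 16)
    assert torch.equal(restored.to(torch.int32), nibbles)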
# Copied logic from https://github.com/mit-han-lab/llm-awq/blob/f084f40bd996f3cf3a0633c1ad7d9d476c318aaa/awq/quantize/qmodule.py
import math
import torch
import torch.nn as nn
import awq_inference_engine # with CUDA kernels
# class ScaledActivation(nn.Module):
# def __init__(self, module, scales):
# super().__init__()
# self.act = module
# self.scales = nn.Parameter(scales.data)
#
# def forward(self, x):
# return self.act(x) / self.scales.view(1, 1, -1).to(x.device)
class WQLinear(nn.Module):
def __init__(self, w_bit, group_size, qweight, qzeros, scales, bias):
super().__init__()
if w_bit not in [4]:
raise NotImplementedError("Only 4-bit are supported for now.")
self.in_features = qweight.shape[0]
self.out_features = qweight.shape[1] * 32 // w_bit
self.w_bit = w_bit
self.group_size = group_size if group_size != -1 else self.in_features
        # quick sanity check (make sure alignment)
assert self.in_features % self.group_size == 0
assert self.out_features % (32 // self.w_bit) == 0
self.qweight = qweight
self.qzeros = qzeros
self.scales = scales
if bias:
self.bias = bias
else:
self.bias = None
@torch.no_grad()
def forward(self, x):
out_shape = x.shape[:-1] + (self.out_features,)
out = awq_inference_engine.gemm_forward_cuda(
x.reshape(-1, x.shape[-1]), self.qweight, self.scales, self.qzeros, 8
)
out = out + self.bias if self.bias is not None else out
return out.reshape(out_shape)
import os
import torch
from loguru import logger
import math
from text_generation_server.utils.import_utils import (
IS_CUDA_SYSTEM,
IS_ROCM_SYSTEM,
IS_XPU_SYSTEM,
)
if os.getenv("USE_FLASH_ATTENTION", "").lower() == "false":
raise ImportError("`USE_FLASH_ATTENTION` is false.")
HAS_FLASH_ATTN = True
HAS_FLASH_ATTN_V2_CUDA = False
HAS_FLASH_ATTN_V2_ROCM = False
if IS_XPU_SYSTEM:
import intel_extension_for_pytorch as ipex
if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM:
if not torch.cuda.is_available():
raise ImportError("CUDA is not available")
major, minor = torch.cuda.get_device_capability()
is_sm75 = major == 7 and minor == 5
is_sm8x = major == 8 and minor >= 0
is_sm90 = major == 9 and minor == 0
HAS_FLASH_ATTN = False
HAS_FLASH_ATTN_V2_CUDA = False
HAS_FLASH_ATTN_V2_ROCM = False
try:
try:
import flash_attn_2_cuda
except ImportError:
architecture_suffix = ""
if IS_CUDA_SYSTEM:
architecture_suffix = "-cuda"
elif IS_ROCM_SYSTEM:
architecture_suffix = "-rocm"
raise ImportError(
"Flash Attention V2 is not installed.\n"
"Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
f"or install flash attention v2 with `cd server && make install install-flash-attention-v2{architecture_suffix}`"
)
if not (is_sm8x or is_sm90) and IS_CUDA_SYSTEM:
raise ImportError(
f"GPU with CUDA capability {major} {minor} is not supported for "
"Flash Attention V2"
)
HAS_FLASH_ATTN_V2_CUDA = IS_CUDA_SYSTEM
HAS_FLASH_ATTN_V2_ROCM = IS_ROCM_SYSTEM
except ImportError as e:
try:
import flash_attn_cuda
except ImportError:
raise ImportError(
"Flash Attention is not installed.\n"
"Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
"or install flash attention with `cd server && make install install-flash-attention`"
) from e
if IS_CUDA_SYSTEM and not (is_sm75 or is_sm8x or is_sm90):
raise ImportError(
f"GPU with CUDA capability {major} {minor} is not supported"
) from e
elif IS_ROCM_SYSTEM:
for idx in range(torch.cuda.device_count()):
if "MI210" not in torch.cuda.get_device_name(
idx
) and "MI250" not in torch.cuda.get_device_name(idx):
raise ImportError(
f"AMD GPU {torch.cuda.get_device_name(idx)} does not support flash-attention"
)
logger.warning(f"Unable to use Flash Attention V2: {e}")
HAS_FLASH_ATTN = True
def attention(
q,
k,
v,
out,
cu_seqlens,
max_s,
softmax_scale,
window_size_left=-1,
):
if window_size_left <= 0 and window_size_left != -1:
raise ValueError("`window_size_left` must be > 0 or -1")
if IS_XPU_SYSTEM:
if window_size_left != -1:
raise ValueError(
f"XPU version of Flash Attention does not support window attention (window_size_left != -1, got window_size_left={window_size_left})."
)
return ipex.llm.functional.varlen_attention(
q,
k,
v,
out,
cu_seqlens,
cu_seqlens,
max_s,
max_s,
0.0,
softmax_scale,
False,
True,
False,
None,
)
if HAS_FLASH_ATTN_V2_CUDA:
return flash_attn_2_cuda.varlen_fwd(
q,
k,
v,
out,
cu_seqlens,
cu_seqlens,
None,
None,
None,
max_s,
max_s,
0.0,
softmax_scale,
False,
True,
window_size_left,
0,
False,
None,
)
elif HAS_FLASH_ATTN_V2_ROCM:
if window_size_left != -1:
raise ValueError(
f"RoCm version of Flash Attention v2 does not support window attention (window_size_left != -1, got window_size_left={window_size_left})."
)
# RoCm flash API does not take the window_size_left and window_size_right arguments.
return flash_attn_2_cuda.varlen_fwd(
q,
k,
v,
out,
cu_seqlens,
cu_seqlens,
max_s,
max_s,
0.0,
softmax_scale,
False,
True,
False,
None,
)
elif HAS_FLASH_ATTN:
if window_size_left != -1:
raise NotImplementedError(
"window_size_left is only available with flash attn v2"
)
# Flash attention v1 requires q, k and v to have the same number of heads
if k.shape[1] != q.shape[1]:
# MQA expand
if k.shape[1] == 1:
k = k.expand(-1, q.shape[1], -1)
# Grouped attention reshape
else:
original_shape = k.shape
k = (
k.unsqueeze(2)
.expand(-1, -1, q.shape[1] // k.shape[1], -1)
.reshape(original_shape[0], -1, original_shape[2])
)
if v.shape[1] != q.shape[1]:
# MQA expand
if v.shape[1] == 1:
v = v.expand(-1, q.shape[1], -1)
# Grouped attention reshape
else:
original_shape = v.shape
v = (
v.unsqueeze(2)
.expand(-1, -1, q.shape[1] // v.shape[1], -1)
.reshape(original_shape[0], -1, original_shape[2])
)
return flash_attn_cuda.fwd(
q,
k,
v,
out,
cu_seqlens,
cu_seqlens,
max_s,
max_s,
0.0,
softmax_scale,
False,
True,
False,
0,
None,
)
raise NotImplementedError("flash attention is not installed")
# https://github.com/fpgaminer/GPTQ-triton
"""
Mostly the same as the autotuner in Triton, but with a few changes like using 40 runs instead of 100.
"""
import builtins
import math
import time
from typing import Dict
import triton
class Autotuner(triton.KernelInterface):
def __init__(
self,
fn,
arg_names,
configs,
key,
reset_to_zero,
prune_configs_by: Dict = None,
nearest_power_of_two: bool = False,
):
"""
:param prune_configs_by: a dict of functions that are used to prune configs, fields:
        'perf_model': performance model used to predict running time with different configs, returns running time
        'top_k': number of configs to bench
        'early_config_prune'(optional): a function used to prune configs early (e.g. num_stages). It takes configs: List[Config] as its input, and returns pruned configs.
        'nearest_power_of_two'(optional): whether to round key arguments to the nearest power of two when caching tuning results
"""
if not configs:
self.configs = [triton.Config({}, num_warps=4, num_stages=2)]
else:
self.configs = configs
self.key_idx = [arg_names.index(k) for k in key]
self.nearest_power_of_two = nearest_power_of_two
self.cache = {}
        # hook to reset all required tensors to zero before relaunching a kernel
self.hook = lambda args: 0
if reset_to_zero is not None:
self.reset_idx = [arg_names.index(k) for k in reset_to_zero]
def _hook(args):
for i in self.reset_idx:
args[i].zero_()
self.hook = _hook
self.arg_names = arg_names
# prune configs
if prune_configs_by:
perf_model, top_k = (
prune_configs_by["perf_model"],
prune_configs_by["top_k"],
)
if "early_config_prune" in prune_configs_by:
early_config_prune = prune_configs_by["early_config_prune"]
else:
perf_model, top_k, early_config_prune = None, None, None
self.perf_model, self.configs_top_k = perf_model, top_k
self.early_config_prune = early_config_prune
self.fn = fn
def _bench(self, *args, config, **meta):
# check for conflicts, i.e. meta-parameters both provided
# as kwargs and by the autotuner
conflicts = meta.keys() & config.kwargs.keys()
if conflicts:
raise ValueError(
f"Conflicting meta-parameters: {', '.join(conflicts)}."
" Make sure that you don't re-define auto-tuned symbols."
)
# augment meta-parameters with tunable ones
current = dict(meta, **config.kwargs)
def kernel_call():
if config.pre_hook:
config.pre_hook(self.nargs)
self.hook(args)
self.fn.run(
*args,
num_warps=config.num_warps,
num_stages=config.num_stages,
**current,
)
try:
            # In testing, using only 40 reps seems to be close enough and it appears to be what PyTorch uses
# PyTorch also sets fast_flush to True, but I didn't see any speedup so I'll leave the default
return triton.testing.do_bench(
kernel_call, quantiles=(0.5, 0.2, 0.8), rep=40
)
except triton.OutOfResources:
return (float("inf"), float("inf"), float("inf"))
def run(self, *args, **kwargs):
self.nargs = dict(zip(self.arg_names, args))
if len(self.configs) > 1:
key = tuple(args[i] for i in self.key_idx)
# This reduces the amount of autotuning by rounding the keys to the nearest power of two
# In my testing this gives decent results, and greatly reduces the amount of tuning required
if self.nearest_power_of_two:
key = tuple([2 ** int(math.log2(x) + 0.5) for x in key])
if key not in self.cache:
# prune configs
pruned_configs = self.prune_configs(kwargs)
bench_start = time.time()
timings = {
config: self._bench(*args, config=config, **kwargs)
for config in pruned_configs
}
bench_end = time.time()
self.bench_time = bench_end - bench_start
self.cache[key] = builtins.min(timings, key=timings.get)
self.hook(args)
self.configs_timings = timings
config = self.cache[key]
else:
config = self.configs[0]
self.best_config = config
if config.pre_hook is not None:
config.pre_hook(self.nargs)
return self.fn.run(
*args,
num_warps=config.num_warps,
num_stages=config.num_stages,
**kwargs,
**config.kwargs,
)
def prune_configs(self, kwargs):
pruned_configs = self.configs
if self.early_config_prune:
pruned_configs = self.early_config_prune(self.configs, self.nargs)
if self.perf_model:
top_k = self.configs_top_k
if isinstance(top_k, float) and top_k <= 1.0:
top_k = int(len(self.configs) * top_k)
if len(pruned_configs) > top_k:
est_timing = {
config: self.perf_model(
**self.nargs,
**kwargs,
**config.kwargs,
num_stages=config.num_stages,
num_warps=config.num_warps,
)
for config in pruned_configs
}
pruned_configs = sorted(est_timing.keys(), key=lambda x: est_timing[x])[
:top_k
]
return pruned_configs
def warmup(self, *args, **kwargs):
self.nargs = dict(zip(self.arg_names, args))
for config in self.prune_configs(kwargs):
self.fn.warmup(
*args,
num_warps=config.num_warps,
num_stages=config.num_stages,
**kwargs,
**config.kwargs,
)
self.nargs = None
def autotune(
configs, key, prune_configs_by=None, reset_to_zero=None, nearest_power_of_two=False
):
"""
Decorator for auto-tuning a :code:`triton.jit`'d function.
.. highlight:: python
.. code-block:: python
@triton.autotune(configs=[
triton.Config(meta={'BLOCK_SIZE': 128}, num_warps=4),
triton.Config(meta={'BLOCK_SIZE': 1024}, num_warps=8),
],
key=['x_size'] # the two above configs will be evaluated anytime
# the value of x_size changes
)
@triton.jit
def kernel(x_ptr, x_size, **META):
BLOCK_SIZE = META['BLOCK_SIZE']
    :note: When all the configurations are evaluated, the kernel will run multiple times.
    This means that whatever value the kernel updates will be updated multiple times.
    To avoid this undesired behavior, you can use the `reset_to_zero` argument, which
    resets the value of the provided tensors to zero before running any configuration.
:param configs: a list of :code:`triton.Config` objects
:type configs: list[triton.Config]
:param key: a list of argument names whose change in value will trigger the evaluation of all provided configs.
:type key: list[str]
:param prune_configs_by: a dict of functions that are used to prune configs, fields:
    'perf_model': performance model used to predict running time with different configs, returns running time
'top_k': number of configs to bench
    'early_config_prune'(optional): a function used to prune configs early (e.g. num_stages). It takes configs: List[Config] as its input, and returns pruned configs.
:param reset_to_zero: a list of argument names whose value will be reset to zero before evaluating any configs.
:type reset_to_zero: list[str]
"""
def decorator(fn):
return Autotuner(
fn,
fn.arg_names,
configs,
key,
reset_to_zero,
prune_configs_by,
nearest_power_of_two,
)
return decorator
def matmul248_kernel_config_pruner(configs, nargs):
"""
The main purpose of this function is to shrink BLOCK_SIZE_* when the corresponding dimension is smaller.
"""
m = max(2 ** int(math.ceil(math.log2(nargs["M"]))), 16)
n = max(2 ** int(math.ceil(math.log2(nargs["N"]))), 16)
k = max(2 ** int(math.ceil(math.log2(nargs["K"]))), 16)
used = set()
for config in configs:
block_size_m = min(m, config.kwargs["BLOCK_SIZE_M"])
block_size_n = min(n, config.kwargs["BLOCK_SIZE_N"])
block_size_k = min(k, config.kwargs["BLOCK_SIZE_K"])
group_size_m = config.kwargs["GROUP_SIZE_M"]
if (
block_size_m,
block_size_n,
block_size_k,
group_size_m,
config.num_stages,
config.num_warps,
) in used:
continue
used.add(
(
block_size_m,
block_size_n,
block_size_k,
group_size_m,
config.num_stages,
config.num_warps,
)
)
yield triton.Config(
{
"BLOCK_SIZE_M": block_size_m,
"BLOCK_SIZE_N": block_size_n,
"BLOCK_SIZE_K": block_size_k,
"GROUP_SIZE_M": group_size_m,
},
num_stages=config.num_stages,
num_warps=config.num_warps,
)
import torch
from exllama_kernels import make_q4, q4_matmul, prepare_buffers, set_tuning_params
# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension
none_tensor = torch.empty((1, 1), device="meta")
def ext_make_q4(qweight, qzeros, scales, g_idx, device):
"""Construct Q4Matrix, return handle"""
return make_q4(
qweight, qzeros, scales, g_idx if g_idx is not None else none_tensor, device
)
def ext_q4_matmul(x, q4, q4_width):
"""Matrix multiplication, returns x @ q4"""
outshape = x.shape[:-1] + (q4_width,)
x = x.view(-1, x.shape[-1])
output = torch.empty((x.shape[0], q4_width), dtype=torch.float16, device=x.device)
q4_matmul(x, q4, output)
return output.view(outshape)
MAX_DQ = 1
MAX_INNER = 1
ACT_ORDER = False
DEVICE = None
TEMP_STATE = None
TEMP_DQ = None
def set_device(device):
global DEVICE
DEVICE = device
def create_exllama_buffers(max_total_tokens: int):
global MAX_DQ, MAX_INNER, ACT_ORDER, DEVICE, TEMP_STATE, TEMP_DQ
assert DEVICE is not None, "call set_device first"
if not ACT_ORDER:
max_total_tokens = 1
# This temp_state buffer is required to reorder X in the act-order case.
temp_state = torch.zeros(
(max_total_tokens, MAX_INNER), dtype=torch.float16, device=DEVICE
)
temp_dq = torch.zeros((1, MAX_DQ), dtype=torch.float16, device=DEVICE)
# This temp_dq buffer is required to dequantize weights when using cuBLAS, typically for the prefill.
prepare_buffers(DEVICE, temp_state, temp_dq)
matmul_recons_thd = 8
matmul_fused_remap = False
matmul_no_half2 = False
set_tuning_params(matmul_recons_thd, matmul_fused_remap, matmul_no_half2)
TEMP_STATE, TEMP_DQ = temp_state, temp_dq
class Ex4bitLinear(torch.nn.Module):
"""Linear layer implementation with per-group 4-bit quantization of the weights"""
def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize):
super().__init__()
global MAX_DQ, MAX_INNER, ACT_ORDER, DEVICE
assert bits == 4
self.device = qweight.device
self.qweight = qweight
self.qzeros = qzeros
self.scales = scales
self.g_idx = g_idx.cpu() if g_idx is not None else None
self.bias = bias if bias is not None else None
if self.g_idx is not None and (
(self.g_idx == 0).all()
or torch.equal(
g_idx.cpu(),
torch.tensor(
[i // groupsize for i in range(g_idx.shape[0])], dtype=torch.int32
),
)
):
self.empty_g_idx = True
self.g_idx = None
assert self.device.type == "cuda"
assert self.device.index is not None
self.q4 = ext_make_q4(
self.qweight, self.qzeros, self.scales, self.g_idx, self.device.index
)
self.height = qweight.shape[0] * 8
self.width = qweight.shape[1]
# Infer groupsize from height of qzeros
self.groupsize = None
if self.qzeros.shape[0] > 1:
self.groupsize = (self.qweight.shape[0] * 8) // (self.qzeros.shape[0])
if self.groupsize is not None:
assert groupsize == self.groupsize
# Handle act-order matrix
if self.g_idx is not None:
if self.groupsize is None:
raise ValueError("Found group index but no groupsize. What do?")
self.act_order = True
else:
self.act_order = False
DEVICE = self.qweight.device
MAX_DQ = max(MAX_DQ, self.qweight.numel() * 8)
if self.act_order:
MAX_INNER = max(MAX_INNER, self.height, self.width)
ACT_ORDER = True
def forward(self, x):
out = ext_q4_matmul(x, self.q4, self.width)
if self.bias is not None:
out.add_(self.bias)
return out
# Adapted from turboderp exllama: https://github.com/turboderp/exllamav2
import torch
import torch.nn as nn
from loguru import logger
try:
from exllamav2_kernels import make_q_matrix, gemm_half_q_half
except ImportError:
logger.error("exllamav2_kernels not installed.")
raise
# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension
none_tensor = torch.empty((1, 1), device="meta")
def ext_gemm_half_q_half(x, q_handle, q4_width, force_cuda):
"""Matrix multiplication, returns x @ q4"""
output_shape = x.shape[:-1] + (q4_width,)
x = x.view(-1, x.shape[-1])
output = torch.empty((x.shape[0], q4_width), dtype=torch.half, device=x.device)
gemm_half_q_half(x, q_handle, output, force_cuda)
return output.view(output_shape)
# Group map needed for irregular group sizes
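# q_groups stores one (bits, first packed row) pair per group; for every weight row the map
# records its group index followed by the number of rows remaining in that group.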
def make_group_map(q_groups, num_qrows):
gr = q_groups.tolist()
group_map = []
num_groups = len(gr) // 2
for i in range(num_groups):
bits = gr[i * 2]
if i < num_groups - 1:
qrows = gr[i * 2 + 3] - gr[i * 2 + 1]
else:
qrows = num_qrows - gr[i * 2 + 1]
rows = qrows * 32 // bits
for j in range(rows):
group_map += [i]
group_map += [rows - j]
return torch.tensor(group_map, dtype=torch.short, device=q_groups.device)
# Create Q matrix
def ext_make_q_matrix(w: dict, temp_dq, key: str = None):
"""
Create Q matrix
"""
# EXL2
    # won't work at the moment because the tensors are not the same.
if "q_weight" in w:
w["q_scale_max"] /= 256
w["q_perm"] = w["q_perm"].short()
w["q_invperm"] = w["q_invperm"].short()
if "q_group_map" not in w:
w["q_group_map"] = make_group_map(w["q_groups"], w["q_weight"].shape[0])
return make_q_matrix(
w["q_weight"],
w["q_perm"],
w["q_invperm"],
w["q_scale"],
w["q_scale_max"],
w["q_groups"],
w["q_group_map"],
none_tensor,
none_tensor,
none_tensor,
temp_dq,
)
# GPTQ
elif "qweight" in w:
if w["scales"].dtype == torch.float:
w["scales"] = w["scales"].half()
# GPTQ with g_idx (act_order)
if w.get("g_idx", None) is not None and not (w["g_idx"] == 0).all().item():
w["q_perm"] = torch.empty(
(w["qweight"].shape[0] * 8,),
dtype=torch.short,
device=w["qweight"].device,
)
w["q_invperm"] = torch.empty_like(w["q_perm"])
# make_q4 segfaults if g_idx is not on cpu in the act-order case. In the non act-order case, None needs to be passed for g_idx.
return make_q_matrix(
w["qweight"],
w["q_perm"],
w["q_invperm"],
none_tensor,
none_tensor,
none_tensor,
none_tensor,
w["qzeros"],
w["scales"],
w["g_idx"].cpu(),
temp_dq,
)
# GPTQ without g_idx
else:
return make_q_matrix(
w["qweight"],
none_tensor,
none_tensor,
none_tensor,
none_tensor,
none_tensor,
none_tensor,
w["qzeros"],
w["scales"],
none_tensor,
temp_dq,
)
DEVICE = None
FIXED_BYTES = 0
LAYERS = []
def set_device(device):
global DEVICE
DEVICE = device
def create_exllama_buffers(max_total_tokens: int):
global FIXED_BYTES, LAYERS, DEVICE
temp_dq = ExLlamaV2DeviceTensors(DEVICE, FIXED_BYTES)
for layer in LAYERS:
layer.post_init(temp_dq)
class QuantLinear(nn.Module):
QUANT_TYPE = "exllamav2"
"""Linear layer implementation with per-group 4-bit quantization of the weights"""
# def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=False, **kwargs):
def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize):
super().__init__()
if bits != 4:
raise ValueError(
f"Exllamav2 kernel supports only bits=4, requested bits={bits}. Something is wrong in the model initialization."
)
self.q_handle = None
self.q_tensors = None
self.bits = bits
self.maxq = 2**self.bits - 1
self.infeatures = qweight.shape[0] // self.bits * 32
self.outfeatures = qweight.shape[1]
self.padding = -self.outfeatures % 32
self.outfeatures = self.outfeatures + self.padding
self.device = qweight.device
self.qweight = qweight
self.qzeros = qzeros
self.scales = scales
self.g_idx = g_idx
self.bias = bias if bias is not None else None
self.group_size = groupsize
global FIXED_BYTES, LAYERS
FIXED_BYTES = max(FIXED_BYTES, self.scratch_space_fixed())
LAYERS.append(self)
def post_init(self, temp_dq):
assert self.qweight.device.type == "cuda"
assert self.qweight.device.index is not None
self.q_tensors = {
"qweight": self.qweight,
"qzeros": self.qzeros,
"scales": self.scales,
"g_idx": self.g_idx,
}
temp_dq = temp_dq.get_scratch_slice(self.temp_dq_size())
# We NEED to keep a pointer on Python side, otherwise the garbage collector will mess with us,
# and `Memory access fault by GPU node-2` will EAT you.
self.temp_dq = temp_dq
self.q_handle = ext_make_q_matrix(self.q_tensors, temp_dq)
def forward(self, x, force_cuda=False):
output = ext_gemm_half_q_half(x, self.q_handle, self.outfeatures, force_cuda)
if self.bias is not None:
output.add_(self.bias)
return output
def temp_dq_size(self):
return self.infeatures * self.outfeatures * 2 + 128
def temp_fwd_size(self, max_input_len, max_batch_size):
return self.outfeatures * max_input_len * max_batch_size * 4 + 128
def scratch_space_fixed(self, max_input_len=4096, max_batch_size=16):
return self.temp_dq_size() + self.temp_fwd_size(max_input_len, max_batch_size)
class ExLlamaV2DeviceTensors:
device_idx: int
scratch_bytes: int
scratch_idx: int
    scratch: torch.Tensor = None
def __init__(self, device, scratch_bytes):
self.device = device
self.scratch_bytes = scratch_bytes
def prepare(self):
self.scratch = torch.empty(
(self.scratch_bytes // 2,), dtype=torch.half, device=self.device
)
def get_scratch_slice(self, size_bytes):
if self.scratch is None:
self.prepare()
size_bytes = ((size_bytes + 127) // 128) * 128
size_half = size_bytes // 2
scratch_slice = self.scratch.narrow(0, 0, size_half)
return scratch_slice
import math
import numpy as np
import torch
import torch.nn as nn
from torch.cuda.amp import custom_bwd, custom_fwd
try:
import triton
import triton.language as tl
from . import custom_autotune
    # code based on https://github.com/fpgaminer/GPTQ-triton
@custom_autotune.autotune(
configs=[
triton.Config(
{
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
},
num_stages=4,
num_warps=4,
),
triton.Config(
{
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
},
num_stages=4,
num_warps=4,
),
triton.Config(
{
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
},
num_stages=4,
num_warps=4,
),
triton.Config(
{
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
},
num_stages=4,
num_warps=4,
),
triton.Config(
{
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
},
num_stages=4,
num_warps=4,
),
triton.Config(
{
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
},
num_stages=2,
num_warps=8,
),
triton.Config(
{
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
},
num_stages=3,
num_warps=8,
),
triton.Config(
{
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 8,
},
num_stages=2,
num_warps=4,
),
],
key=["M", "N", "K"],
nearest_power_of_two=True,
prune_configs_by={
"early_config_prune": custom_autotune.matmul248_kernel_config_pruner,
"perf_model": None,
"top_k": None,
},
)
@triton.jit
def matmul_248_kernel(
a_ptr,
b_ptr,
c_ptr,
scales_ptr,
zeros_ptr,
g_ptr,
M,
N,
K,
bits,
maxq,
stride_am,
stride_ak,
stride_bk,
stride_bn,
stride_cm,
stride_cn,
stride_scales,
stride_zeros,
BLOCK_SIZE_M: tl.constexpr,
BLOCK_SIZE_N: tl.constexpr,
BLOCK_SIZE_K: tl.constexpr,
GROUP_SIZE_M: tl.constexpr,
):
"""
Compute the matrix multiplication C = A x B.
A is of shape (M, K) float16
B is of shape (K//8, N) int32
C is of shape (M, N) float16
scales is of shape (G, N) float16
zeros is of shape (G, N) float16
g_ptr is of shape (K) int32
"""
infearure_per_bits = 32 // bits
pid = tl.program_id(axis=0)
num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
num_pid_k = tl.cdiv(K, BLOCK_SIZE_K)
num_pid_in_group = GROUP_SIZE_M * num_pid_n
group_id = pid // num_pid_in_group
first_pid_m = group_id * GROUP_SIZE_M
group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
pid_m = first_pid_m + (pid % group_size_m)
pid_n = (pid % num_pid_in_group) // group_size_m
offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
offs_k = tl.arange(0, BLOCK_SIZE_K)
a_ptrs = a_ptr + (
offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak
) # (BLOCK_SIZE_M, BLOCK_SIZE_K)
a_mask = offs_am[:, None] < M
# b_ptrs is set up such that it repeats elements along the K axis 8 times
b_ptrs = b_ptr + (
(offs_k[:, None] // infearure_per_bits) * stride_bk
+ offs_bn[None, :] * stride_bn
) # (BLOCK_SIZE_K, BLOCK_SIZE_N)
g_ptrs = g_ptr + offs_k
# shifter is used to extract the N bits of each element in the 32-bit word from B
scales_ptrs = scales_ptr + offs_bn[None, :]
zeros_ptrs = zeros_ptr + (offs_bn[None, :] // infearure_per_bits)
shifter = (offs_k % infearure_per_bits) * bits
zeros_shifter = (offs_bn % infearure_per_bits) * bits
accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
for k in range(0, num_pid_k):
g_idx = tl.load(g_ptrs)
# Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop
scales = tl.load(
scales_ptrs + g_idx[:, None] * stride_scales
) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
zeros = tl.load(
zeros_ptrs + g_idx[:, None] * stride_zeros
) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
zeros = (zeros >> zeros_shifter[None, :]) & maxq
zeros = (zeros + 1) & maxq # eventually avoid overflow
a = tl.load(a_ptrs, mask=a_mask, other=0.0) # (BLOCK_SIZE_M, BLOCK_SIZE_K)
b = tl.load(b_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated
# Now we need to unpack b (which is N-bit values) into 32-bit values
b = (b >> shifter[:, None]) & maxq # Extract the N-bit values
b = (b - zeros) * scales # Scale and shift
accumulator += tl.dot(a, b)
a_ptrs += BLOCK_SIZE_K
b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk
g_ptrs += BLOCK_SIZE_K
c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N)
tl.store(c_ptrs, accumulator, mask=c_mask)
except ImportError:
print("triton not installed.")
def matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq):
with torch.cuda.device(input.device):
output = torch.empty(
(input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16
)
grid = lambda META: (
triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"])
* triton.cdiv(qweight.shape[1], META["BLOCK_SIZE_N"]),
)
matmul_248_kernel[grid](
input,
qweight,
output,
scales,
qzeros,
g_idx,
input.shape[0],
qweight.shape[1],
input.shape[1],
bits,
maxq,
input.stride(0),
input.stride(1),
qweight.stride(0),
qweight.stride(1),
output.stride(0),
output.stride(1),
scales.stride(0),
qzeros.stride(0),
)
return output
class QuantLinearFunction(torch.autograd.Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
output = matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq)
return output
class QuantLinear(nn.Module):
def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize):
super().__init__()
self.register_buffer("qweight", qweight)
self.register_buffer("qzeros", qzeros)
self.register_buffer("scales", scales)
self.register_buffer("g_idx", g_idx)
if bias is not None:
self.register_buffer("bias", bias)
else:
self.bias = None
if bits not in [2, 4, 8]:
raise NotImplementedError("Only 2,4,8 bits are supported.")
self.bits = bits
self.maxq = 2**self.bits - 1
self.groupsize = groupsize
self.outfeatures = qweight.shape[1]
self.infeatures = qweight.shape[0] * 32 // bits
@classmethod
def new(cls, bits, groupsize, infeatures, outfeatures, bias):
if bits not in [2, 4, 8]:
raise NotImplementedError("Only 2,4,8 bits are supported.")
qweight = torch.zeros((infeatures // 32 * bits, outfeatures), dtype=torch.int32)
qzeros = torch.zeros(
(math.ceil(infeatures / groupsize), outfeatures // 32 * bits),
dtype=torch.int32,
)
scales = torch.zeros(
(math.ceil(infeatures / groupsize), outfeatures), dtype=torch.float16
)
g_idx = torch.tensor(
[i // groupsize for i in range(infeatures)], dtype=torch.int32
)
if bias:
bias = torch.zeros((outfeatures), dtype=torch.float16)
else:
bias = None
return cls(qweight, qzeros, scales, g_idx, bias, bits, groupsize)
def pack(self, linear, scales, zeros, g_idx=None):
self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx
scales = scales.t().contiguous()
zeros = zeros.t().contiguous()
scale_zeros = zeros * scales
self.scales = scales.clone().half()
if linear.bias is not None:
self.bias = linear.bias.clone().half()
intweight = []
for idx in range(self.infeatures):
intweight.append(
torch.round(
(linear.weight.data[:, idx] + scale_zeros[self.g_idx[idx]])
/ self.scales[self.g_idx[idx]]
).to(torch.int)[:, None]
)
intweight = torch.cat(intweight, dim=1)
intweight = intweight.t().contiguous()
intweight = intweight.numpy().astype(np.uint32)
qweight = np.zeros(
(intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32
)
i = 0
row = 0
while row < qweight.shape[0]:
if self.bits in [2, 4, 8]:
for j in range(i, i + (32 // self.bits)):
qweight[row] |= intweight[j] << (self.bits * (j - i))
i += 32 // self.bits
row += 1
else:
raise NotImplementedError("Only 2,4,8 bits are supported.")
qweight = qweight.astype(np.int32)
self.qweight = torch.from_numpy(qweight)
zeros -= 1
zeros = zeros.numpy().astype(np.uint32)
qzeros = np.zeros(
(zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32
)
i = 0
col = 0
while col < qzeros.shape[1]:
if self.bits in [2, 4, 8]:
for j in range(i, i + (32 // self.bits)):
qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
i += 32 // self.bits
col += 1
else:
raise NotImplementedError("Only 2,4,8 bits are supported.")
qzeros = qzeros.astype(np.int32)
self.qzeros = torch.from_numpy(qzeros)
def forward(self, x):
out_shape = x.shape[:-1] + (self.outfeatures,)
out = QuantLinearFunction.apply(
x.reshape(-1, x.shape[-1]),
self.qweight,
self.scales,
self.qzeros,
self.g_idx,
self.bits,
self.maxq,
)
out = out + self.bias if self.bias is not None else out
return out.reshape(out_shape)
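
# A quick shape sketch (an illustrative assumption, not part of the module): with 4-bit
# quantization each int32 in qweight packs 32 // 4 = 8 weights along the row axis, which
# is why the kernel above treats B as (K // 8, N) int32.
if __name__ == "__main__":
    layer = QuantLinear.new(bits=4, groupsize=128, infeatures=4096, outfeatures=4096, bias=False)
    assert layer.qweight.shape == (4096 // 32 * 4, 4096)  # (512, 4096)
    assert layer.qzeros.shape == (4096 // 128, 4096 // 32 * 4)  # one packed zeros row per group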
import torch
from text_generation_server.utils.import_utils import (
IS_CUDA_SYSTEM,
IS_ROCM_SYSTEM,
IS_XPU_SYSTEM,
)
_PARTITION_SIZE = 512
if IS_XPU_SYSTEM:
import intel_extension_for_pytorch as ipex
def reshape_and_cache(
key: torch.Tensor,
value: torch.Tensor,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
slots: torch.Tensor,
):
if IS_CUDA_SYSTEM:
from vllm._C import cache_ops
cache_ops.reshape_and_cache(
key, value, key_cache, value_cache, slots, "auto", 1.0
)
elif IS_ROCM_SYSTEM:
from vllm import cache_ops
# cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots)
cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots.int())
elif IS_XPU_SYSTEM:
ipex.llm.modules.PagedAttention.reshape_and_cache(
key, value, key_cache, value_cache, slots
)
else:
raise ValueError("vllm is not supported on your system")
def attention(
out: torch.Tensor,
query: torch.Tensor,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
kv_head_mapping: torch.Tensor,
softmax_scale: float,
block_tables: torch.Tensor,
input_lengths: torch.Tensor,
max_s: int,
):
# Adapted from: https://github.com/vllm-project/vllm/blob/f8a1e39fae05ca610be8d5a78be9d40f5274e5fc/vllm/model_executor/layers/attention.py
# Copyright 2023 The vLLM team. All rights
# reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# value_cache => [num_blocks, num_heads, head_size, block_size]
block_size = value_cache.shape[3]
num_seqs, num_heads, head_size = query.shape
max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE
if IS_XPU_SYSTEM:
query = query.contiguous()
return ipex.llm.modules.PagedAttention.single_query_cached_kv_attention(
out,
query,
key_cache,
value_cache,
kv_head_mapping,
softmax_scale,
block_tables,
input_lengths,
block_size,
max_s,
None,
)
# NOTE(woosuk): We use a simple heuristic to decide whether to use
# PagedAttention V1 or V2. If the number of partitions is 1, we use
# V1 to avoid the overhead of reduction. Also, if the number of
# sequences or heads is large, we use V1 since there is enough work
# to parallelize.
use_v1 = max_s <= 8192 and (max_num_partitions == 1 or num_seqs * num_heads > 512)
if use_v1:
if IS_CUDA_SYSTEM:
from vllm._C import ops
ops.paged_attention_v1(
out,
query,
key_cache,
value_cache,
kv_head_mapping,
softmax_scale,
block_tables,
input_lengths,
block_size,
max_s,
None,
"auto",
1.0,
)
elif IS_ROCM_SYSTEM:
from vllm import attention_ops
attention_ops.paged_attention_v1(
out,
query,
key_cache,
value_cache,
kv_head_mapping,
softmax_scale,
block_tables,
input_lengths,
block_size,
max_s,
None,
)
else:
raise ValueError("vllm is not supported on your system")
else:
# Run PagedAttention V2.
assert _PARTITION_SIZE % block_size == 0
tmp_output = torch.empty(
size=(num_seqs, num_heads, max_num_partitions, head_size),
dtype=out.dtype,
device=out.device,
)
exp_sums = torch.empty(
size=(num_seqs, num_heads, max_num_partitions),
dtype=torch.float32,
device=out.device,
)
max_logits = torch.empty_like(exp_sums)
if IS_CUDA_SYSTEM:
from vllm._C import ops
ops.paged_attention_v2(
out,
exp_sums,
max_logits,
tmp_output,
query,
key_cache,
value_cache,
kv_head_mapping,
softmax_scale,
block_tables,
input_lengths,
block_size,
max_s,
None,
"auto",
1.0,
)
elif IS_ROCM_SYSTEM:
from vllm import attention_ops
attention_ops.paged_attention_v2(
out,
exp_sums,
max_logits,
tmp_output,
query,
key_cache,
value_cache,
kv_head_mapping,
softmax_scale,
block_tables,
input_lengths,
block_size,
max_s,
None,
)
else:
raise ValueError("vllm is not supported on your system")
# This Pylint rcfile contains a best-effort configuration to uphold the
# best-practices and style described in the Google Python style guide:
# https://google.github.io/styleguide/pyguide.html
#
# Its canonical open-source location is:
# https://google.github.io/styleguide/pylintrc
[MASTER]
# Files or directories to be skipped. They should be base names, not paths.
ignore=docs
# Files or directories matching the regex patterns are skipped. The regex
# matches against base names, not paths.
ignore-patterns=
# Pickle collected data for later comparisons.
persistent=no
# List of plugins (as comma separated values of python modules names) to load,
# usually to register additional checkers.
load-plugins=
# Use multiple processes to speed up Pylint.
jobs=4
# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no
[MESSAGES CONTROL]
# Only show warnings with the listed confidence levels. Leave empty to show
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
confidence=
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
# multiple time (only on the command line, not in the configuration file where
# it should appear only once). See also the "--disable" option for examples.
#enable=
# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once). You can also use "--disable=all" to
# disable everything first and then reenable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W"
disable=abstract-method,
apply-builtin,
arguments-differ,
attribute-defined-outside-init,
backtick,
bad-option-value,
basestring-builtin,
buffer-builtin,
c-extension-no-member,
consider-using-enumerate,
cmp-builtin,
cmp-method,
coerce-builtin,
coerce-method,
delslice-method,
div-method,
duplicate-code,
eq-without-hash,
execfile-builtin,
file-builtin,
filter-builtin-not-iterating,
fixme,
getslice-method,
global-statement,
hex-method,
idiv-method,
implicit-str-concat-in-sequence,
import-error,
import-self,
import-star-module-level,
inconsistent-return-statements,
input-builtin,
intern-builtin,
invalid-str-codec,
locally-disabled,
logging-fstring-interpolation, # added by vLLM
logging-not-lazy, # added by vLLM
long-builtin,
long-suffix,
map-builtin-not-iterating,
misplaced-comparison-constant,
missing-class-docstring, # TODO (vLLM): enable
missing-function-docstring,
missing-module-docstring, # TODO (vLLM): enable
metaclass-assignment,
next-method-called,
next-method-defined,
no-absolute-import,
no-else-break,
no-else-continue,
no-else-raise,
no-else-return,
no-init, # added
no-member,
no-name-in-module,
no-self-use,
nonzero-method,
oct-method,
old-division,
old-ne-operator,
old-octal-literal,
old-raise-syntax,
parameter-unpacking,
print-statement,
raising-string,
range-builtin-not-iterating,
raw_input-builtin,
rdiv-method,
reduce-builtin,
relative-import,
reload-builtin,
round-builtin,
setslice-method,
signature-differs,
standarderror-builtin,
suppressed-message,
sys-max-int,
too-few-public-methods,
too-many-ancestors,
too-many-arguments,
too-many-boolean-expressions,
too-many-branches,
too-many-instance-attributes,
too-many-locals,
too-many-nested-blocks,
too-many-public-methods,
too-many-return-statements,
too-many-statements,
trailing-newlines,
unichr-builtin,
unicode-builtin,
unnecessary-pass,
unpacking-in-except,
unspecified-encoding,
useless-else-on-loop,
useless-object-inheritance,
useless-suppression,
using-cmp-argument,
wrong-import-order,
xrange-builtin,
zip-builtin-not-iterating,
[REPORTS]
# Set the output format. Available formats are text, parseable, colorized, msvs
# (visual studio) and html. You can also give a reporter class, e.g.
# mypackage.mymodule.MyReporterClass.
output-format=text
# Tells whether to display a full report or only the messages
reports=no
# Python expression which should return a note less than 10 (10 is the highest
# note). You have access to the variables errors, warning, statement, which
# respectively contain the number of errors / warnings messages and the total
# number of statements analyzed. This is used by the global evaluation report
# (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details
#msg-template=
[BASIC]
# Good variable names which should always be accepted, separated by a comma
good-names=main,_
# Bad variable names which should always be refused, separated by a comma
bad-names=
# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=
# Include a hint for the correct naming format with invalid-name
include-naming-hint=no
# List of decorators that produce properties, such as abc.abstractproperty. Add
# to this list to register other decorators that produce valid properties.
property-classes=abc.abstractproperty,cached_property.cached_property,cached_property.threaded_cached_property,cached_property.cached_property_with_ttl,cached_property.threaded_cached_property_with_ttl
# Regular expression matching correct function names
function-rgx=^(?:(?P<exempt>setUp|tearDown|setUpModule|tearDownModule)|(?P<camel_case>_?[A-Z][a-zA-Z0-9]*)|(?P<snake_case>_?[a-z][a-z0-9_]*))$
# Regular expression matching correct variable names
variable-rgx=^[a-z][a-z0-9_]*$
# Regular expression matching correct constant names
const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
# Regular expression matching correct attribute names
attr-rgx=^_{0,2}[a-z][a-z0-9_]*$
# Regular expression matching correct argument names
argument-rgx=^[a-z][a-z0-9_]*$
# Regular expression matching correct class attribute names
class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
# Regular expression matching correct inline iteration names
inlinevar-rgx=^[a-z][a-z0-9_]*$
# Regular expression matching correct class names
class-rgx=^_?[A-Z][a-zA-Z0-9]*$
# Regular expression matching correct module names
module-rgx=^(_?[a-z][a-z0-9_]*|__init__)$
# Regular expression matching correct method names
method-rgx=(?x)^(?:(?P<exempt>_[a-z0-9_]+__|runTest|setUp|tearDown|setUpTestCase|tearDownTestCase|setupSelf|tearDownClass|setUpClass|(test|assert)_*[A-Z0-9][a-zA-Z0-9_]*|next)|(?P<camel_case>_{0,2}[A-Z][a-zA-Z0-9_]*)|(?P<snake_case>_{0,2}[a-z][a-z0-9_]*))$
# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=(__.*__|main|test.*|.*test|.*Test)$
# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=10
[TYPECHECK]
# List of decorators that produce context managers, such as
# contextlib.contextmanager. Add to this list to register other decorators that
# produce valid context managers.
contextmanager-decorators=contextlib.contextmanager,contextlib2.contextmanager
# Tells whether missing members accessed in mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive).
ignore-mixin-members=yes
# List of module names for which member attributes should not be checked
# (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis). It
# supports qualified module names, as well as Unix pattern matching.
ignored-modules=
# List of class names for which member attributes should not be checked (useful
# for classes with dynamically set attributes). This supports the use of
# qualified names.
ignored-classes=optparse.Values,thread._local,_thread._local
# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted.
generated-members=
[FORMAT]
# Maximum number of characters on a single line.
max-line-length=80
# TODO(https://github.com/PyCQA/pylint/issues/3352): Direct pylint to exempt
# lines made too long by directives to pytype.
# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=(?x)(
^\s*(\#\ )?<?https?://\S+>?$|
^\s*(from\s+\S+\s+)?import\s+.+$)
# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=yes
# Maximum number of lines in a module
max-module-lines=99999
# String used as indentation unit. The internal Google style guide mandates 2
# spaces. Google's externally-published style guide says 4, consistent with
# PEP 8. Here, we use 2 spaces, for conformity with many open-sourced Google
# projects (like TensorFlow).
indent-string='  '
# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=4
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=
[MISCELLANEOUS]
# List of note tags to take in consideration, separated by a comma.
notes=TODO
[STRING]
# This flag controls whether inconsistent-quotes generates a warning when the
# character used as a quote delimiter is used inconsistently within a module.
check-quote-consistency=yes
[VARIABLES]
# Tells whether we should check for unused import in __init__ files.
init-import=no
# A regular expression matching the name of dummy variables (i.e. expectedly
# not used).
dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_)
# List of additional names supposed to be defined in builtins. Remember that
# you should avoid to define new builtins when possible.
additional-builtins=
# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,_cb
# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six,six.moves,past.builtins,future.builtins,functools
[LOGGING]
# Logging modules to check that the string format arguments are in logging
# function parameter format
logging-modules=logging,absl.logging,tensorflow.io.logging
[SIMILARITIES]
# Minimum lines number of a similarity.
min-similarity-lines=4
# Ignore comments when computing similarities.
ignore-comments=yes
# Ignore docstrings when computing similarities.
ignore-docstrings=yes
# Ignore imports when computing similarities.
ignore-imports=no
[SPELLING]
# Spelling dictionary name. Available dictionaries: none. To make it work,
# install the python-enchant package.
spelling-dict=
# List of comma separated words that should not be checked.
spelling-ignore-words=
# A path to a file that contains private dictionary; one word per line.
spelling-private-dict-file=
# Tells whether to store unknown words to indicated private dictionary in
# --spelling-private-dict-file option instead of raising a message.
spelling-store-unknown-words=no
[IMPORTS]
# Deprecated modules which should not be used, separated by a comma
deprecated-modules=regsub,
TERMIOS,
Bastion,
rexec,
sets
# Create a graph of every (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled)
import-graph=
# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled)
ext-import-graph=
# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled)
int-import-graph=
# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=
# Force import order to recognize a module as part of a third party library.
known-third-party=enchant, absl
# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no
[CLASSES]
# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,
__new__,
setUp
# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,
_fields,
_replace,
_source,
_make
# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls,
class_
# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=mcs
[EXCEPTIONS]
# Exceptions that will emit a warning when being caught. Defaults to
# "Exception"
overgeneral-exceptions=StandardError,
Exception,
BaseException
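The rcfile above is typically consumed by pointing pylint at it from the command line, roughly "pylint --rcfile=<path> <package>". As a hedged sketch, the equivalent programmatic call through pylint's Run entry point is shown below; the rcfile name "pylintrc" and the "vllm" target package are assumptions for illustration, not taken from this repository.

from pylint.lint import Run

# Apply the configuration above to the (assumed) "vllm" package; exit=False
# returns control to the caller instead of terminating the interpreter.
Run(["--rcfile=pylintrc", "vllm"], exit=False)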
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
version: 2
build:
  os: ubuntu-22.04
  tools:
    python: "3.8"
sphinx:
  configuration: docs/source/conf.py
# If using Sphinx, optionally build your docs in additional formats such as PDF
formats:
  - pdf
# Optionally declare the Python requirements required to build your docs
python:
  install:
    - requirements: docs/requirements-docs.txt
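In effect, the configuration above asks Read the Docs to install docs/requirements-docs.txt into a Python 3.8 environment on Ubuntu 22.04 and run Sphinx against docs/source/conf.py, additionally producing a PDF. A hedged local approximation is sketched below; the HTML output directory is an assumption, not part of the config.

import subprocess
import sys

# Install the documented requirements, then build the HTML docs locally with
# Sphinx. "docs/_build/html" is an assumed output path for illustration.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-r", "docs/requirements-docs.txt"],
    check=True,
)
subprocess.run(
    [sys.executable, "-m", "sphinx", "-b", "html", "docs/source", "docs/_build/html"],
    check=True,
)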