feat(src): add kv cache int8 quantization (#22)
* feat(src): add int8 and pass compilation
* feat(kernels): fix
* feat(llama): update kernel
* feat(src): add debug
* fix(kernel): use int8_t pointer for k_cache
* style(llama): clean code
* feat(deploy.py): revert to enable fmha
* style(LlamaV2): clean code
* feat(deploy.py): add default quant policy
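The commits above switch the K/V cache to int8_t storage behind a selectable quant policy. As a rough illustration of the arithmetic involved, below is a minimal sketch of asymmetric (scale, zero-point) int8 quantization and dequantization of cached K/V values; the function names, per-tensor granularity, and parameters are assumptions for illustration only and do not reflect the actual TurboMind kernels.

```cpp
// Minimal sketch of int8 KV-cache quantization (illustrative only; the real
// kernels run on the GPU and may use a different scale/zero-point layout).
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Quantize float K/V values to int8 with an asymmetric (scale, zero_point) pair.
void quantize_kv(const std::vector<float>& src, std::vector<int8_t>& dst,
                 float scale, float zero_point)
{
    dst.resize(src.size());
    for (size_t i = 0; i < src.size(); ++i) {
        float q = std::round((src[i] - zero_point) / scale);
        dst[i]  = static_cast<int8_t>(std::clamp(q, -128.0f, 127.0f));
    }
}

// Dequantize back to float when the attention kernel reads the cache.
void dequantize_kv(const std::vector<int8_t>& src, std::vector<float>& dst,
                   float scale, float zero_point)
{
    dst.resize(src.size());
    for (size_t i = 0; i < src.size(); ++i) {
        dst[i] = static_cast<float>(src[i]) * scale + zero_point;
    }
}

int main()
{
    std::vector<float> k = {-1.5f, 0.0f, 0.75f, 2.0f};
    float scale = 0.02f, zero_point = 0.0f;  // hypothetical per-tensor parameters

    std::vector<int8_t> k_int8;
    quantize_kv(k, k_int8, scale, zero_point);

    std::vector<float> k_restored;
    dequantize_kv(k_int8, k_restored, scale, zero_point);

    for (size_t i = 0; i < k.size(); ++i)
        std::printf("%.3f -> %d -> %.3f\n", k[i], k_int8[i], k_restored[i]);
    return 0;
}
```

The round trip halves cache memory relative to fp16 at the cost of a small reconstruction error governed by the chosen scale.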